[poppler] utils/HtmlOutputDev.cc utils/pdftohtml.1 utils/pdftohtml.cc
Albert Astals Cid
aacid at kemper.freedesktop.org
Tue Mar 13 15:55:17 PDT 2012
utils/HtmlOutputDev.cc | 7 +++++--
utils/pdftohtml.1 | 5 +++++
utils/pdftohtml.cc | 7 +++++++
3 files changed, 17 insertions(+), 2 deletions(-)
New commits:
commit e5b914b2bfbb5e95ecde5f1ce148374b1d58dadd
Author: Ihar Filipau <thephilips at gmail.com>
Date: Tue Mar 13 23:54:26 2012 +0100
Add possibility of controlling word breaks percentage
Bug #47022
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 17541a2..19f1c84 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -31,6 +31,7 @@
// Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
// Copyright (C) 2011 Stephen Reichling <sreichling at chegg.com>
// Copyright (C) 2011, 2012 Igor Slepchin <igor.slepchin at gmail.com>
+// Copyright (C) 2012 Ihar Filipau <thephilips at gmail.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
@@ -96,6 +97,8 @@ extern GBool xml;
extern GBool showHidden;
extern GBool noMerge;
+extern double wordBreakThreshold;
+
static GBool debug = gFalse;
static GooString *gstr_buff0 = NULL; // a workspace in which I format strings
@@ -379,7 +382,7 @@ void HtmlPage::addChar(GfxState *state, double x, double y,
// right, which will not necessarily be the case, e.g. if rotated;
// It assesses whether or not two characters are close enough to
// be part of the same string
- fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin) &&
+ fabs(x1 - curStr->xRight[n-1]) > wordBreakThreshold * (curStr->yMax - curStr->yMin) &&
// rotation is (cos q, sin q, -sin q, cos q, 0, 0)
// sin q is zero iff there is no rotation, or 180 deg. rotation;
// for 180 rotation, cos q will be negative
@@ -625,7 +628,7 @@ void HtmlPage::coalesce() {
{
// printf("yes\n");
n = str1->len + str2->len;
- if ((addSpace = horSpace > 0.1 * space)) {
+ if ((addSpace = horSpace > wordBreakThreshold * space)) {
++n;
}
if (addLineBreak) {
diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1
index 6763bbe..44137e4 100644
--- a/utils/pdftohtml.1
+++ b/utils/pdftohtml.1
@@ -84,6 +84,11 @@ do not merge paragraphs
.TP
.B \-nodrm
override document DRM settings
+.TP
+.B \-wbt <fp>
+adjust the word break threshold percent. Default is 10.
+Word break occurs when distance between two adjacent characters is
+greater than this percent of character height.
.SH AUTHOR
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index 7347161..6735f5d 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -20,6 +20,7 @@
// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
// Copyright (C) 2012 Igor Slepchin <igor.redhat at gmail.com>
+// Copyright (C) 2012 Ihar Filipau <thephilips at gmail.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
@@ -82,6 +83,7 @@ GBool stout=gFalse;
GBool xml=gFalse;
static GBool errQuiet=gFalse;
static GBool noDrm=gFalse;
+double wordBreakThreshold=10; // 10%, below converted into a coefficient - 0.1
GBool showHidden = gFalse;
GBool noMerge = gFalse;
@@ -142,6 +144,8 @@ static const ArgDesc argDesc[] = {
"user password (for encrypted files)"},
{"-nodrm", argFlag, &noDrm, 0,
"override document DRM settings"},
+ {"-wbt", argFP, &wordBreakThreshold, 0,
+ "word break threshold (default 10 percent)"},
{NULL}
};
@@ -221,6 +225,9 @@ int main(int argc, char *argv[]) {
}
}
+ // convert from user-friendly percents into a coefficient
+ wordBreakThreshold /= 100.0;
+
// open PDF file
if (ownerPassword[0]) {
ownerPW = new GooString(ownerPassword);
More information about the poppler
mailing list