[poppler] utils/HtmlOutputDev.cc utils/pdftohtml.1 utils/pdftohtml.cc

Albert Astals Cid aacid at kemper.freedesktop.org
Tue Mar 13 15:55:17 PDT 2012


 utils/HtmlOutputDev.cc |    7 +++++--
 utils/pdftohtml.1      |    5 +++++
 utils/pdftohtml.cc     |    7 +++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

New commits:
commit e5b914b2bfbb5e95ecde5f1ce148374b1d58dadd
Author: Ihar Filipau <thephilips at gmail.com>
Date:   Tue Mar 13 23:54:26 2012 +0100

    Add possibility of controlling the word break percentage
    
    Bug #47022

diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 17541a2..19f1c84 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -31,6 +31,7 @@
 // Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
 // Copyright (C) 2011 Stephen Reichling <sreichling at chegg.com>
 // Copyright (C) 2011, 2012 Igor Slepchin <igor.slepchin at gmail.com>
+// Copyright (C) 2012 Ihar Filipau <thephilips at gmail.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -96,6 +97,8 @@ extern GBool xml;
 extern GBool showHidden;
 extern GBool noMerge;
 
+extern double wordBreakThreshold;
+
 static GBool debug = gFalse;
 static GooString *gstr_buff0 = NULL; // a workspace in which I format strings
 
@@ -379,7 +382,7 @@ void HtmlPage::addChar(GfxState *state, double x, double y,
       // right, which will not necessarily be the case, e.g. if rotated;
       // It assesses whether or not two characters are close enough to
       // be part of the same string
-      fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin) &&
+      fabs(x1 - curStr->xRight[n-1]) > wordBreakThreshold * (curStr->yMax - curStr->yMin) &&
       // rotation is (cos q, sin q, -sin q, cos q, 0, 0)
       // sin q is zero iff there is no rotation, or 180 deg. rotation;
       // for 180 rotation, cos q will be negative
@@ -625,7 +628,7 @@ void HtmlPage::coalesce() {
     {
 //      printf("yes\n");
       n = str1->len + str2->len;
-      if ((addSpace = horSpace > 0.1 * space)) {
+      if ((addSpace = horSpace > wordBreakThreshold * space)) {
         ++n;
       }
       if (addLineBreak) {
diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1
index 6763bbe..44137e4 100644
--- a/utils/pdftohtml.1
+++ b/utils/pdftohtml.1
@@ -84,6 +84,11 @@ do not merge paragraphs
 .TP
 .B \-nodrm
 override document DRM settings
+.TP
+.B \-wbt <fp>
+adjust the word break threshold percentage. Default is 10.
+A word break occurs when the distance between two adjacent characters is
+greater than this percentage of the character height.
 
 .SH AUTHOR
 
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index 7347161..6735f5d 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -20,6 +20,7 @@
 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
 // Copyright (C) 2011 Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
 // Copyright (C) 2012 Igor Slepchin <igor.redhat at gmail.com>
+// Copyright (C) 2012 Ihar Filipau <thephilips at gmail.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -82,6 +83,7 @@ GBool stout=gFalse;
 GBool xml=gFalse;
 static GBool errQuiet=gFalse;
 static GBool noDrm=gFalse;
+double wordBreakThreshold=10;  // percentage (default 10%); main() divides it by 100 to get the coefficient (0.1)
 
 GBool showHidden = gFalse;
 GBool noMerge = gFalse;
@@ -142,6 +144,8 @@ static const ArgDesc argDesc[] = {
    "user password (for encrypted files)"},
   {"-nodrm", argFlag, &noDrm, 0,
    "override document DRM settings"},
+  {"-wbt",    argFP,    &wordBreakThreshold, 0,
+   "word break threshold (default 10 percent)"},
   {NULL}
 };
 
@@ -221,6 +225,9 @@ int main(int argc, char *argv[]) {
     }
   }
 
+  // convert from user-friendly percents into a coefficient
+  wordBreakThreshold /= 100.0;
+
   // open PDF file
   if (ownerPassword[0]) {
     ownerPW = new GooString(ownerPassword);


More information about the poppler mailing list