[poppler] poppler/TextOutputDev.cc poppler/TextOutputDev.h utils/pdftotext.1 utils/pdftotext.cc

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Dec 7 23:32:02 UTC 2021


 poppler/TextOutputDev.cc |   14 +++++++++++---
 poppler/TextOutputDev.h  |    6 ++++++
 utils/pdftotext.1        |    3 +++
 utils/pdftotext.cc       |    9 +++++++++
 4 files changed, 29 insertions(+), 3 deletions(-)

New commits:
commit f20d9e5f739b7c8dce74ebc60a6dd1e06106c12e
Author: Nelson Benítez León <nbenitezl at gmail.com>
Date:   Sun Jul 11 14:08:58 2021 -0400

    TextOutputDev: require more spacing between columns
    
    Require more spacing for adjacent text to be
    considered a separate column of text.
    
    We do that by increasing the 'minColSpacing1' parameter,
    which sets the maximum distance within which an adjacent
    word will be pulled into the current block.
    
    We provide a way to tweak the default value:
    double getMinColSpacing1();
    void setMinColSpacing1(double val);
    
    Fixes issue #1093

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 5dc37c93..67a6246d 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -122,8 +122,9 @@
 #define maxWordSpacing 1.5
 
 // Maximum horizontal spacing which will allow a word to be pulled
-// into a block.
-#define minColSpacing1 0.3
+// into a block, as a fraction of the font size.
+// This default value can be tweaked via API.
+double TextOutputDev::minColSpacing1_default = 0.7;
 
 // Minimum spacing between columns, as a fraction of the font size.
 #define minColSpacing2 1.0
@@ -2814,6 +2815,11 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
 }
 
 void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
+{
+    coalesce(physLayout, fixedPitch, doHTML, TextOutputDev::minColSpacing1_default);
+}
+
+void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1)
 {
     TextWord *word0, *word1, *word2;
     TextLine *line;
@@ -5605,6 +5611,7 @@ TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA, double fixe
     textEOL = defaultEndOfLine();
     textPageBreaks = true;
     ok = true;
+    minColSpacing1 = minColSpacing1_default;
 
     // open file
     needClose = false;
@@ -5648,6 +5655,7 @@ TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA
     textEOL = defaultEndOfLine();
     textPageBreaks = true;
     ok = true;
+    minColSpacing1 = minColSpacing1_default;
 }
 
 TextOutputDev::~TextOutputDev()
@@ -5669,7 +5677,7 @@ void TextOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
 void TextOutputDev::endPage()
 {
     text->endPage();
-    text->coalesce(physLayout, fixedPitch, doHTML);
+    text->coalesce(physLayout, fixedPitch, doHTML, minColSpacing1);
     if (outputStream) {
         text->dump(outputStream, outputFunc, physLayout, textEOL, textPageBreaks);
     }
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 984507ba..9df36278 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -596,6 +596,7 @@ public:
 
     // Coalesce strings that look like parts of the same line.
     void coalesce(bool physLayout, double fixedPitch, bool doHTML);
+    void coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1);
 
     // Find a string.  If <startAtTop> is true, starts looking at the
     // top of the page; else if <startAtLast> is true, starts looking
@@ -756,6 +757,8 @@ private:
 class POPPLER_PRIVATE_EXPORT TextOutputDev : public OutputDev
 {
 public:
+    static double minColSpacing1_default;
+
     // Open a text output file.  If <fileName> is NULL, no file is
     // written (this is useful, e.g., for searching text).  If
     // <physLayoutA> is true, the original physical layout of the text
@@ -885,6 +888,8 @@ public:
     }
     void setTextEOL(EndOfLineKind textEOLA) { textEOL = textEOLA; }
     void setTextPageBreaks(bool textPageBreaksA) { textPageBreaks = textPageBreaksA; }
+    double getMinColSpacing1() const { return minColSpacing1; }
+    void setMinColSpacing1(double val) { minColSpacing1 = val; }
 
 private:
     TextOutputFunc outputFunc; // output function
@@ -897,6 +902,7 @@ private:
     double fixedPitch; // if physLayout is true and this is non-zero,
                        //   assume fixed-pitch characters with this
                        //   width
+    double minColSpacing1; // see minColSpacing1_default, defined in TextOutputDev.cc
     bool rawOrder; // keep text in content stream order
     bool discardDiag; // Diagonal text, i.e., text that is not close to one of the
                       // 0, 90, 180, or 270 degree axes, is discarded. This is useful
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 3ae217b4..39163389 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -85,6 +85,9 @@ block, line, and word in the file.
 .B \-cropbox
 Use the crop box rather than the media box with \-bbox and \-bbox-layout.
 .TP
+.BI \-colspacing " number"
+Specifies how much spacing we allow after a word before considering adjacent text to be a new column, measured as a fraction of the font size. The current default is 0.7; older releases used a default of 0.3.
+.TP
 .BI \-enc " encoding-name"
 Sets the encoding to use for text output. This defaults to "UTF-8".
 .TP
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 7b45359f..0caca87f 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -84,6 +84,7 @@ static bool bbox = false;
 static bool bboxLayout = false;
 static bool physLayout = false;
 static bool useCropBox = false;
+static double colspacing = TextOutputDev::minColSpacing1_default;
 static double fixedPitch = 0;
 static bool rawOrder = false;
 static bool discardDiag = false;
@@ -117,6 +118,8 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to
                                    { "-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html.  Sets -htmlmeta" },
                                    { "-bbox-layout", argFlag, &bboxLayout, 0, "like -bbox but with extra layout bounding box data.  Sets -htmlmeta" },
                                    { "-cropbox", argFlag, &useCropBox, 0, "use the crop box rather than media box" },
+                                   { "-colspacing", argFP, &colspacing, 0,
+                                     "how much spacing we allow after a word before considering adjacent text to be a new column, as a fraction of the font size (default is 0.7, old releases had a 0.3 default)" },
                                    { "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
                                    { "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
                                    { "-q", argFlag, &quiet, 0, "don't print any messages or errors" },
@@ -178,6 +181,10 @@ int main(int argc, char *argv[])
     if (bbox) {
         htmlMeta = true;
     }
+    if (colspacing <= 0 || colspacing > 10) {
+        error(errCommandLine, -1, "Bogus value provided for -colspacing");
+        goto err1;
+    }
     if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) {
         fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION);
         fprintf(stderr, "%s\n", popplerCopyright);
@@ -342,6 +349,7 @@ int main(int argc, char *argv[])
 
         if (textOut->isOk()) {
             textOut->setTextEOL(textEOL);
+            textOut->setMinColSpacing1(colspacing);
             if (noPageBreaks) {
                 textOut->setTextPageBreaks(false);
             }
@@ -358,6 +366,7 @@ int main(int argc, char *argv[])
         textOut = new TextOutputDev(textFileName->c_str(), physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
         if (textOut->isOk()) {
             textOut->setTextEOL(textEOL);
+            textOut->setMinColSpacing1(colspacing);
             if (noPageBreaks) {
                 textOut->setTextPageBreaks(false);
             }


More information about the poppler mailing list