[poppler] utils/pdftotext.1 utils/pdftotext.cc

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Apr 1 13:33:10 UTC 2022


 utils/pdftotext.1  |    4 +
 utils/pdftotext.cc |  136 ++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 123 insertions(+), 17 deletions(-)

New commits:
commit 72cf19694952a8899f33fda1c448417d0a8d996a
Author: Luis Landeiro <ribeiro.luis at gmail.com>
Date:   Fri Apr 1 13:33:08 2022 +0000

    pdftotext: added TSV mode

diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 39163389..86360156 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -82,6 +82,10 @@ word in the file.
 Generate an XHTML file containing bounding box information for each
 block, line, and word in the file.
 .TP
+.B \-tsv
+Generate a TSV file containing the bounding box information for each
+block, line, and word in the file.
+.TP
 .B \-cropbox
 Use the crop box rather than the media box with \-bbox and \-bbox-layout.
 .TP
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 4862b6f2..987a48b2 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -69,11 +69,13 @@
 #include <iomanip>
 #include "Win32Console.h"
 #include "DateInfo.h"
+#include <cfloat>
 
 static void printInfoString(FILE *f, Dict *infoDict, const char *key, const char *text1, const char *text2, const UnicodeMap *uMap);
 static void printInfoDate(FILE *f, Dict *infoDict, const char *key, const char *text1, const char *text2);
 void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
 void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
+void printTSVBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
 
 static int firstPage = 1;
 static int lastPage = 0;
@@ -100,6 +102,7 @@ static bool quiet = false;
 static bool printVersion = false;
 static bool printHelp = false;
 static bool printEnc = false;
+static bool tsvMode = false;
 
 static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to convert" },
                                    { "-l", argInt, &lastPage, 0, "last page to convert" },
@@ -113,11 +116,12 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to
                                    { "-raw", argFlag, &rawOrder, 0, "keep strings in content stream order" },
                                    { "-nodiag", argFlag, &discardDiag, 0, "discard diagonal text" },
                                    { "-htmlmeta", argFlag, &htmlMeta, 0, "generate a simple HTML file, including the meta information" },
+                                   { "-tsv", argFlag, &tsvMode, 0, "generate a simple TSV file, including the meta information for bounding boxes" },
                                    { "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" },
                                    { "-listenc", argFlag, &printEnc, 0, "list available encodings" },
                                    { "-eol", argString, textEOLStr, sizeof(textEOLStr), "output end-of-line convention (unix, dos, or mac)" },
                                    { "-nopgbrk", argFlag, &noPageBreaks, 0, "don't insert page breaks between pages" },
-                                   { "-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html.  Sets -htmlmeta" },
+                                   { "-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html. Sets -htmlmeta" },
                                    { "-bbox-layout", argFlag, &bboxLayout, 0, "like -bbox but with extra layout bounding box data.  Sets -htmlmeta" },
                                    { "-cropbox", argFlag, &useCropBox, 0, "use the crop box rather than media box" },
                                    { "-colspacing", argFP, &colspacing, 0,
@@ -356,26 +360,46 @@ int main(int argc, char *argv[])
             fclose(f);
         }
     } else {
-        textOut = new TextOutputDev(textFileName->c_str(), physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
-        if (textOut->isOk()) {
-            textOut->setTextEOL(textEOL);
-            textOut->setMinColSpacing1(colspacing);
-            if (noPageBreaks) {
-                textOut->setTextPageBreaks(false);
-            }
-            if ((w == 0) && (h == 0) && (x == 0) && (y == 0)) {
-                doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0, true, false, false);
-            } else {
 
-                for (int page = firstPage; page <= lastPage; ++page) {
-                    doc->displayPageSlice(textOut, page, resolution, resolution, 0, true, false, false, x, y, w, h);
+        if (tsvMode) {
+            textOut = new TextOutputDev(nullptr, physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
+            if (!textFileName->cmp("-")) {
+                f = stdout;
+            } else {
+                if (!(f = fopen(textFileName->c_str(), "wb"))) {
+                    error(errIO, -1, "Couldn't open text file '{0:t}'", textFileName);
+                    delete textOut;
+                    exitCode = 2;
+                    goto err3;
                 }
             }
-
+            printTSVBBox(f, doc.get(), textOut, firstPage, lastPage);
+            if (f != stdout) {
+                fclose(f);
+            }
         } else {
-            delete textOut;
-            exitCode = 2;
-            goto err3;
+            textOut = new TextOutputDev(textFileName->c_str(), physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
+            if (textOut->isOk()) {
+                textOut->setTextEOL(textEOL);
+                textOut->setMinColSpacing1(colspacing);
+                if (noPageBreaks) {
+                    textOut->setTextPageBreaks(false);
+                }
+
+                if ((w == 0) && (h == 0) && (x == 0) && (y == 0)) {
+                    doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0, true, false, false);
+                } else {
+
+                    for (int page = firstPage; page <= lastPage; ++page) {
+                        doc->displayPageSlice(textOut, page, resolution, resolution, 0, true, false, false, x, y, w, h);
+                    }
+                }
+
+            } else {
+                delete textOut;
+                exitCode = 2;
+                goto err3;
+            }
         }
     }
     delete textOut;
@@ -538,6 +562,84 @@ void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int l
     fprintf(f, "</doc>\n");
 }
 
+// Write bounding boxes for pages first..last to f as tab-separated values: a header
+// row, then one row per page, block, line and word. Each page is rendered into
+// textOut, whose flows are then walked; the par_num column carries the flow index.
+void printTSVBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last)
+{
+    double xMin = 0, yMin = 0, xMax = 0, yMax = 0;
+    const TextFlow *flow;
+    const TextBlock *blk;
+    const TextLine *line;
+    const TextWord *word;
+    int blockNum = 0;
+    int lineNum = 0;
+    int flowNum = 0;
+    int wordNum = 0;
+    // Values of the "level" column, one per kind of row.
+    const int pageLevel = 1;
+    const int blockLevel = 3;
+    const int lineLevel = 4;
+    const int wordLevel = 5;
+    // Values of the "conf" column: structural rows get -1, word rows get 100.
+    const int metaConf = -1;
+    const int wordConf = 100;
+
+    fputs("level\tpage_num\tpar_num\tblock_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n", f);
+
+    for (int page = first; page <= last; ++page) {
+        const double wid = useCropBox ? doc->getPageCropWidth(page) : doc->getPageMediaWidth(page);
+        const double hgt = useCropBox ? doc->getPageCropHeight(page) : doc->getPageMediaHeight(page);
+        // A page row always starts at the origin. xMin/yMin must not be printed here:
+        // after the first page they still hold the last word box of the previous page.
+        fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%d\t###PAGE###\n", pageLevel, page, flowNum, blockNum, lineNum, wordNum, 0.0, 0.0, wid, hgt, metaConf);
+        doc->displayPage(textOut, page, resolution, resolution, 0, !useCropBox, useCropBox, false);
+        for (flow = textOut->getFlows(); flow; flow = flow->getNext()) {
+            for (blk = flow->getBlocks(); blk; blk = blk->getNext()) {
+                blk->getBBox(&xMin, &yMin, &xMax, &yMax);
+                // The text column of a block row is the literal "###FLOW###".
+                fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%d\t###FLOW###\n", blockLevel, page, flowNum, blockNum, lineNum, wordNum, xMin, yMin, xMax - xMin, yMax - yMin, metaConf);
+                for (line = blk->getLines(); line; line = line->getNext()) {
+                    // Line box = union of its word boxes; the maxima start at 0.
+                    double lxMin = 1E+37, lyMin = 1E+37;
+                    double lxMax = 0, lyMax = 0;
+                    // Word rows are buffered so that the line row can precede them.
+                    GooString lineWordsBuffer;
+                    for (word = line->getWords(); word; word = word->getNext()) {
+                        word->getBBox(&xMin, &yMin, &xMax, &yMax);
+                        if (lxMin > xMin) {
+                            lxMin = xMin;
+                        }
+                        if (lxMax < xMax) {
+                            lxMax = xMax;
+                        }
+                        if (lyMin > yMin) {
+                            lyMin = yMin;
+                        }
+                        if (lyMax < yMax) {
+                            lyMax = yMax;
+                        }
+                        // NOTE(review): ownership of the getText() result is not visible here - confirm it does not leak.
+                        lineWordsBuffer.appendf("{0:d}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\t{5:d}\t{6:.2f}\t{7:.2f}\t{8:.2f}\t{9:.2f}\t{10:d}\t{11:t}\n", wordLevel, page, flowNum, blockNum, lineNum, wordNum, xMin, yMin, xMax - xMin, yMax - yMin,
+                                                wordConf, word->getText());
+                        wordNum++;
+                    }
+
+                    // Print line bounding box info, then the buffered word rows.
+                    fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%d\t###LINE###\n", lineLevel, page, flowNum, blockNum, lineNum, 0, lxMin, lyMin, lxMax - lxMin, lyMax - lyMin, metaConf);
+                    fputs(lineWordsBuffer.c_str(), f);
+                    wordNum = 0;
+                    lineNum++;
+                }
+                lineNum = 0;
+                blockNum++;
+            }
+            blockNum = 0;
+            flowNum++;
+        }
+        flowNum = 0;
+    }
+}
+
 void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last)
 {
     fprintf(f, "<doc>\n");


More information about the poppler mailing list