[poppler] utils/pdftotext.1 utils/pdftotext.cc

Albert Astals Cid aacid at kemper.freedesktop.org
Mon Aug 31 15:23:32 PDT 2015


 utils/pdftotext.1  |    4 +
 utils/pdftotext.cc |  115 +++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 99 insertions(+), 20 deletions(-)

New commits:
commit 911d9fc8d85b776418039b4eebb37200a0987554
Author: Jeremy Echols <jechols at uoregon.edu>
Date:   Tue Sep 1 00:22:28 2015 +0200

    pdftotext: Add -bbox-layout option
    
    Adds layout information for blocks and lines in addition to words
    
    Bug #89941

diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 0199b03..b53f82f 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -71,6 +71,10 @@ headers.
 Generate an XHTML file containing bounding box information for each
 word in the file.
 .TP
+.B \-bbox-layout
+Generate an XHTML file containing bounding box information for each
+block, line, and word in the file.
+.TP
 .BI \-enc " encoding-name"
 Sets the encoding to use for text output. This defaults to "UTF-8".
 .TP
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 15c741d..d931a96 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -24,6 +24,7 @@
 // Copyright (C) 2011 Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
 // Copyright (C) 2013 Yury G. Kudryashov <urkud.urkud at gmail.com>
 // Copyright (C) 2013 Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
+// Copyright (C) 2015 Jeremy Echols <jechols at uoregon.edu>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -56,10 +57,14 @@
 #include "PDFDocEncoding.h"
 #include "Error.h"
 #include <string>
+#include <sstream>
+#include <iomanip>
 
 static void printInfoString(FILE *f, Dict *infoDict, const char *key,
 			    const char *text1, const char *text2, UnicodeMap *uMap);
 static void printInfoDate(FILE *f, Dict *infoDict, const char *key, const char *fmt);
+void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
+void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
 
 static int firstPage = 1;
 static int lastPage = 0;
@@ -69,6 +74,7 @@ static int y = 0;
 static int w = 0;
 static int h = 0;
 static GBool bbox = gFalse;
+static GBool bboxLayout = gFalse;
 static GBool physLayout = gFalse;
 static double fixedPitch = 0;
 static GBool rawOrder = gFalse;
@@ -116,6 +122,8 @@ static const ArgDesc argDesc[] = {
    "don't insert page breaks between pages"},
   {"-bbox", argFlag,     &bbox,  0,
    "output bounding box for each word and page size to html.  Sets -htmlmeta"},
+  {"-bbox-layout", argFlag,     &bboxLayout,  0,
+   "like -bbox but with extra layout bounding box data.  Sets -htmlmeta"},
   {"-opw",     argString,   ownerPassword,  sizeof(ownerPassword),
    "owner password (for encrypted files)"},
   {"-upw",     argString,   userPassword,   sizeof(userPassword),
@@ -176,6 +184,9 @@ int main(int argc, char *argv[]) {
 
   // parse args
   ok = parseArgs(argDesc, &argc, argv);
+  if (bboxLayout) {
+    bbox = gTrue;
+  }
   if (bbox) {
     htmlMeta = gTrue;
   }
@@ -352,27 +363,12 @@ int main(int argc, char *argv[]) {
     textOut = new TextOutputDev(NULL, physLayout, fixedPitch, rawOrder, htmlMeta);
 
     if (textOut->isOk()) {
-      fprintf(f, "<doc>\n");
-      for (int page = firstPage; page <= lastPage; ++page) {
-        fprintf(f, "  <page width=\"%f\" height=\"%f\">\n",doc->getPageMediaWidth(page), doc->getPageMediaHeight(page));
-        doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse);
-        TextWordList *wordlist = textOut->makeWordList();
-        const int word_length = wordlist != NULL ? wordlist->getLength() : 0;
-        TextWord *word;
-        double xMinA, yMinA, xMaxA, yMaxA;
-        if (word_length == 0)
-          fprintf(stderr, "no word list\n");
-
-        for (int i = 0; i < word_length; ++i) {
-          word = wordlist->get(i);
-          word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA);
-          const std::string myString = myXmlTokenReplace(word->getText()->getCString());
-          fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str());
-        }
-        fprintf(f, "  </page>\n");
-        delete wordlist;
+      if (bboxLayout) {
+        printDocBBox(f, doc, textOut, firstPage, lastPage);
+      }
+      else {
+        printWordBBox(f, doc, textOut, firstPage, lastPage);
       }
-      fprintf(f, "</doc>\n");
     }
     if (f != stdout) {
       fclose(f);
@@ -492,3 +488,82 @@ static void printInfoDate(FILE *f, Dict *infoDict, const char *key, const char *
   }
   obj.free();
 }
+
+// Emits one <line> element whose bbox is the union of its words' bboxes.
+void printLine(FILE *f, TextLine *line) {
+  double xMin, yMin, xMax, yMax;
+  double lineXMin = 0, lineYMin = 0, lineXMax = 0, lineYMax = 0;
+  bool firstWord = true;  // seed from the first word: 0 is a valid coordinate
+  std::stringstream wordXML;
+  wordXML << std::fixed << std::setprecision(6);
+  for (TextWord *word = line->getWords(); word; word = word->getNext()) {
+    word->getBBox(&xMin, &yMin, &xMax, &yMax);
+    if (firstWord || xMin < lineXMin) lineXMin = xMin;
+    if (firstWord || yMin < lineYMin) lineYMin = yMin;
+    if (firstWord || xMax > lineXMax) lineXMax = xMax;
+    if (firstWord || yMax > lineYMax) lineYMax = yMax;
+    firstWord = false;
+    // Words are buffered so <line>, which needs the final bbox, is written first.
+    const std::string myString = myXmlTokenReplace(word->getText()->getCString());
+    wordXML << "          <word xMin=\"" << xMin << "\" yMin=\"" << yMin << "\" xMax=\"" <<
+            xMax << "\" yMax=\"" << yMax << "\">" << myString << "</word>\n";
+  }
+  fprintf(f, "        <line xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">\n",
+          lineXMin, lineYMin, lineXMax, lineYMax);
+  fputs(wordXML.str().c_str(), f);
+  fputs("        </line>\n", f);
+}
+
+// Writes <doc> with nested <page>/<flow>/<block> elements for pages
+// first..last (inclusive). Each block is emitted with its bounding box and
+// its lines are delegated to printLine().
+void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last) {
+  fputs("<doc>\n", f);
+  for (int pageNum = first; pageNum <= last; ++pageNum) {
+    fprintf(f, "  <page width=\"%f\" height=\"%f\">\n",doc->getPageMediaWidth(pageNum), doc->getPageMediaHeight(pageNum));
+    doc->displayPage(textOut, pageNum, resolution, resolution, 0, gTrue, gFalse, gFalse);
+    // Released with decRefCnt() once this page has been written out.
+    TextPage *pageText = textOut->takeText();
+    for (TextFlow *fl = pageText->getFlows(); fl; fl = fl->getNext()) {
+      // <flow> is a bare wrapper; only the blocks inside it get a bbox here.
+      fputs("    <flow>\n", f);
+      for (TextBlock *block = fl->getBlocks(); block; block = block->getNext()) {
+        double bxMin, byMin, bxMax, byMax;
+        block->getBBox(&bxMin, &byMin, &bxMax, &byMax);
+        fprintf(f, "      <block xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">\n", bxMin, byMin, bxMax, byMax);
+        for (TextLine *ln = block->getLines(); ln; ln = ln->getNext()) {
+          printLine(f, ln);
+        }
+        fputs("      </block>\n", f);
+      }
+      fputs("    </flow>\n", f);
+    }
+    fputs("  </page>\n", f);
+    pageText->decRefCnt();
+  }
+  fputs("</doc>\n", f);
+}
+
+// Writes <doc> with one <page> per page in first..last, listing each word's bbox.
+void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last) {
+  fputs("<doc>\n", f);
+  for (int pageNum = first; pageNum <= last; ++pageNum) {
+    fprintf(f, "  <page width=\"%f\" height=\"%f\">\n",doc->getPageMediaWidth(pageNum), doc->getPageMediaHeight(pageNum));
+    doc->displayPage(textOut, pageNum, resolution, resolution, 0, gTrue, gFalse, gFalse);
+    TextWordList *words = textOut->makeWordList();
+    // A missing list is treated like an empty one: warn, emit an empty <page>.
+    const int wordCount = words ? words->getLength() : 0;
+    if (wordCount == 0)
+      fputs("no word list\n", stderr);
+    for (int idx = 0; idx < wordCount; ++idx) {
+      TextWord *cur = words->get(idx);
+      double wxMin, wyMin, wxMax, wyMax;
+      cur->getBBox(&wxMin, &wyMin, &wxMax, &wyMax);
+      const std::string escaped = myXmlTokenReplace(cur->getText()->getCString());
+      fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", wxMin, wyMin, wxMax, wyMax, escaped.c_str());
+    }
+    fputs("  </page>\n", f);
+    delete words;
+  }
+  fputs("</doc>\n", f);
+}


More information about the poppler mailing list