[poppler] utils/pdftotext.1 utils/pdftotext.cc

Sun Oct 17 08:15:19 PDT 2010

utils/pdftotext.1  |    4 +
 utils/pdftotext.cc |  125 +++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 103 insertions(+), 26 deletions(-)

New commits:
commit 3b45c66de91243d7419657512cf726c9e1c26dfb
Author: Kenneth Berland <ken at hero.com>
Date:   Sun Oct 17 14:24:16 2010 +0100

    Add -bbox option to pdftotext

diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 4e3cb0b..eb3aa8e 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -63,6 +63,10 @@ Generate a simple HTML file, including the meta information.  This
 simply wraps the text in <pre> and </pre> and prepends the meta
 headers.
 .TP
+.B \-bbox
+Generate an XHTML file containing bounding box information for each
+word in the file.
+.TP
 .BI \-enc " encoding-name"
 Sets the encoding to use for text output. This defaults to "UTF-8".
 .TP
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index d4e004b..b189e84 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -19,6 +19,7 @@
 // Copyright (C) 2007-2008, 2010 Albert Astals Cid <aacid at kde.org>
 // Copyright (C) 2009 Jan Jockusch <jan at jockusch.de>
 // Copyright (C) 2010 Hib Eris <hib at hiberis.nl>
+// Copyright (C) 2010 Kenneth Berland <ken at hero.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -49,6 +50,7 @@
 #include "CharTypes.h"
 #include "UnicodeMap.h"
 #include "Error.h"
+#include <string>
 
 static void printInfoString(FILE *f, Dict *infoDict, char *key,
 			    char *text1, char *text2, UnicodeMap *uMap);
@@ -61,6 +63,7 @@ static int x = 0;
 static int y = 0;
 static int w = 0;
 static int h = 0;
+static GBool bbox = gFalse;
 static GBool physLayout = gFalse;
 static GBool rawOrder = gFalse;
 static GBool htmlMeta = gFalse;
@@ -103,6 +106,8 @@ static const ArgDesc argDesc[] = {
    "output end-of-line convention (unix, dos, or mac)"},
   {"-nopgbrk", argFlag,     &noPageBreaks,  0,
    "don't insert page breaks between pages"},
+  {"-bbox", argFlag,     &bbox,  0,
+   "output bounding box for each word and page size to html.  Sets -htmlmeta"},
   {"-opw",     argString,   ownerPassword,  sizeof(ownerPassword),
    "owner password (for encrypted files)"},
   {"-upw",     argString,   userPassword,   sizeof(userPassword),
@@ -122,6 +127,30 @@ static const ArgDesc argDesc[] = {
   {NULL}
 };
 
+static std::string myStringReplace(const std::string &inString, const std::string &oldToken, const std::string &newToken) {
+  std::string result = inString;
+  size_t foundLoc;
+  int advance = 0;
+  do {
+    foundLoc = result.find(oldToken, advance);
+    if (foundLoc != std::string::npos){
+      result.replace(foundLoc, oldToken.length(), newToken);
+      advance = foundLoc + newToken.length();
+    }
+  } while (foundLoc != std::string::npos );
+  return result;
+}
+
+static std::string myXmlTokenReplace(const char *inString){
+  std::string myString(inString);
+  myString = myStringReplace(myString, "&",  "&amp;" );
+  myString = myStringReplace(myString, "'",  "&apos;" );
+  myString = myStringReplace(myString, "\"", "&quot;" );
+  myString = myStringReplace(myString, "<",  "&lt;" );
+  myString = myStringReplace(myString, ">",  "&gt;" );
+  return myString;
+}
+
 int main(int argc, char *argv[]) {
   PDFDoc *doc;
   GooString *fileName;
@@ -139,6 +168,9 @@ int main(int argc, char *argv[]) {
 
   // parse args
   ok = parseArgs(argDesc, &argc, argv);
+  if (bbox) {
+    htmlMeta = gTrue;
+  }
   if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) {
     fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION);
     fprintf(stderr, "%s\n", popplerCopyright);
@@ -257,57 +289,98 @@ int main(int argc, char *argv[]) {
 	goto err3;
       }
     }
-    fputs("<html>\n", f);
+    fputs("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", f);
+    fputs("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n", f);
     fputs("<head>\n", f);
     doc->getDocInfo(&info);
     if (info.isDict()) {
-      printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
-		      uMap);
+      Object obj;
+      if (info.getDict()->lookup("Title", &obj)->isString()) {
+        printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n", uMap);
+      } else {
+        fputs("<title></title>\n", f);
+      }
+      obj.free();
       printInfoString(f, info.getDict(), "Subject",
-		      "<meta name=\"Subject\" content=\"", "\">\n", uMap);
+		      "<meta name=\"Subject\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Keywords",
-		      "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
+		      "<meta name=\"Keywords\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Author",
-		      "<meta name=\"Author\" content=\"", "\">\n", uMap);
+		      "<meta name=\"Author\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Creator",
-		      "<meta name=\"Creator\" content=\"", "\">\n", uMap);
+		      "<meta name=\"Creator\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Producer",
-		      "<meta name=\"Producer\" content=\"", "\">\n", uMap);
+		      "<meta name=\"Producer\" content=\"", "\"/>\n", uMap);
       printInfoDate(f, info.getDict(), "CreationDate",
-		    "<meta name=\"CreationDate\" content=\"\">\n");
+		    "<meta name=\"CreationDate\" content=\"\"/>\n");
       printInfoDate(f, info.getDict(), "LastModifiedDate",
-		    "<meta name=\"ModDate\" content=\"\">\n");
+		    "<meta name=\"ModDate\" content=\"\"/>\n");
     }
     info.free();
     fputs("</head>\n", f);
     fputs("<body>\n", f);
-    fputs("<pre>\n", f);
+    if (!bbox) fputs("<pre>\n", f);
     if (f != stdout) {
       fclose(f);
     }
   }
 
   // write text file
-  textOut = new TextOutputDev(textFileName->getCString(),
-			      physLayout, rawOrder, htmlMeta);
-  if (textOut->isOk()) {
-    if ((w==0) && (h==0) && (x==0) && (y==0)) {
-      doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
-			gTrue, gFalse, gFalse);
-    } else {
-      int page;
-      
-      for (page = firstPage; page <= lastPage; ++page) {
-	doc->displayPageSlice(textOut, page, resolution, resolution, 0,
+  if (bbox) {
+    textOut = new TextOutputDev(NULL, physLayout, rawOrder, htmlMeta);
+    if (!(f = fopen(textFileName->getCString(), "ab"))) {
+      error(-1, "Couldn't open text file '%s' for append", textFileName->getCString());
+      exitCode = 2;
+      delete textOut;
+      goto err3;
+    }
+
+    if (textOut->isOk()) {
+      fprintf(f, "<doc>\n");
+      for (int page = firstPage; page <= lastPage; ++page) {
+        fprintf(f, "  <page width=\"%f\" height=\"%f\">\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page));
+        doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse);
+        TextWordList *wordlist = textOut->makeWordList();
+        const int word_length = wordlist != NULL ? wordlist->getLength() : 0;
+        TextWord *word;
+        double xMinA, yMinA, xMaxA, yMaxA;
+        if (word_length == 0)
+          fprintf(stderr, "no word list\n");
+
+        for (int i = 0; i < word_length; ++i) {
+          word = wordlist->get(i);
+          word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA);
+          const std::string myString = myXmlTokenReplace(word->getText()->getCString());
+          fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str());
+        }
+        fprintf(f, "  </page>\n");
+        delete wordlist;
+      }
+      fprintf(f, "</doc>\n");
+    }
+    fclose(f);
+    delete textOut;
+  } else {
+    textOut = new TextOutputDev(textFileName->getCString(),
+				physLayout, rawOrder, htmlMeta);
+    if (textOut->isOk()) {
+      if ((w==0) && (h==0) && (x==0) && (y==0)) {
+	doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
+			  gTrue, gFalse, gFalse);
+      } else {
+	
+	for (int page = firstPage; page <= lastPage; ++page) {
+	  doc->displayPageSlice(textOut, page, resolution, resolution, 0,
 			      gTrue, gFalse, gFalse, 
 			      x, y, w, h);
-      }	
-    }	
+	}
+      }
 
-  } else {
+    } else {
     delete textOut;
     exitCode = 2;
     goto err3;
+    }
   }
   delete textOut;
 
@@ -322,7 +395,7 @@ int main(int argc, char *argv[]) {
 	goto err3;
       }
     }
-    fputs("</pre>\n", f);
+    if (!bbox) fputs("</pre>\n", f);
     fputs("</body>\n", f);
     fputs("</html>\n", f);
     if (f != stdout) {