diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index d4e004b..c15667e 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -61,6 +61,7 @@ static int x = 0; static int y = 0; static int w = 0; static int h = 0; +static int bbox = 0; static GBool physLayout = gFalse; static GBool rawOrder = gFalse; static GBool htmlMeta = gFalse; @@ -103,6 +104,8 @@ static const ArgDesc argDesc[] = { "output end-of-line convention (unix, dos, or mac)"}, {"-nopgbrk", argFlag, &noPageBreaks, 0, "don't insert page breaks between pages"}, + {"-bbox", argFlag, &bbox, 0, + "output bounding box for each word and page size to html. Sets -htmlmeta"}, {"-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)"}, {"-upw", argString, userPassword, sizeof(userPassword), @@ -128,7 +131,7 @@ int main(int argc, char *argv[]) { GooString *textFileName; GooString *ownerPW, *userPW; TextOutputDev *textOut; - FILE *f; + FILE *f = stdout; UnicodeMap *uMap; Object info; GBool ok; @@ -139,6 +142,9 @@ int main(int argc, char *argv[]) { // parse args ok = parseArgs(argDesc, &argc, argv); + if (bbox){ + htmlMeta = gTrue; + } if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) { fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION); fprintf(stderr, "%s\n", popplerCopyright); @@ -281,33 +287,61 @@ int main(int argc, char *argv[]) { info.free(); fputs("\n", f); fputs("\n", f); - fputs("
\n", f);
-    if (f != stdout) {
+    if (!bbox) fputs("<pre>\n", f);
+    if (f != stdout && !bbox) { // if writing bbox, f remains open
       fclose(f);
     }
   }
 
   // write text file
-  textOut = new TextOutputDev(textFileName->getCString(),
-			      physLayout, rawOrder, htmlMeta);
-  if (textOut->isOk()) {
-    if ((w==0) && (h==0) && (x==0) && (y==0)) {
-      doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
-			gTrue, gFalse, gFalse);
-    } else {
-      int page;
-      
-      for (page = firstPage; page <= lastPage; ++page) {
+  if (bbox) {
+    textOut = new TextOutputDev("/dev/null",
+				physLayout, rawOrder, htmlMeta);
+    if (textOut->isOk()) {
+      fprintf(f, "<doc>\n");
+      for (int page = firstPage; page <= lastPage; ++page) {
+	fprintf(f, "  <page width=\"%f\" height=\"%f\">\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page) );
+	doc->displayPage(textOut, page, resolution, resolution, 0,
+			 gTrue, gFalse, gFalse);
+	TextWordList *wordlist;
+	wordlist = textOut->makeWordList();
+	int word_length = wordlist->getLength();
+	TextWord *word;
+	double xMinA, yMinA, xMaxA, yMaxA;
+	if (!word_length)
+	  fprintf(stderr, "no word list\n");
+	
+	for (int i=0; i < word_length; i++){
+	  word = wordlist->get(i);
+	  word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA);
+	  fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, (char *) word->getText());
+	}
+	fprintf(f, "  </page>\n");
+      }
+      fprintf(f, "</doc>\n");      
+    }
+    fclose(f);
+    
+  } else {
+    textOut = new TextOutputDev(textFileName->getCString(),
+				physLayout, rawOrder, htmlMeta);
+    if (textOut->isOk()) {
+      if ((w==0) && (h==0) && (x==0) && (y==0)) {
+	doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
+			  gTrue, gFalse, gFalse);
+      } else {
+	
+	for (int page = firstPage; page <= lastPage; ++page) {
 	doc->displayPageSlice(textOut, page, resolution, resolution, 0,
 			      gTrue, gFalse, gFalse, 
 			      x, y, w, h);
-      }	
-    }	
-
-  } else {
+	}
+      }
+    } else {
     delete textOut;
     exitCode = 2;
     goto err3;
+    }
   }
   delete textOut;