diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index d4e004b..dcf4802 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -49,6 +49,7 @@ #include "CharTypes.h" #include "UnicodeMap.h" #include "Error.h" +#include <string> static void printInfoString(FILE *f, Dict *infoDict, char *key, char *text1, char *text2, UnicodeMap *uMap); @@ -61,6 +62,7 @@ static int x = 0; static int y = 0; static int w = 0; static int h = 0; +static int bbox = 0; static GBool physLayout = gFalse; static GBool rawOrder = gFalse; static GBool htmlMeta = gFalse; @@ -103,6 +105,8 @@ static const ArgDesc argDesc[] = { "output end-of-line convention (unix, dos, or mac)"}, {"-nopgbrk", argFlag, &noPageBreaks, 0, "don't insert page breaks between pages"}, + {"-bbox", argFlag, &bbox, 0, + "output bounding box for each word and page size to html. Sets -htmlmeta"}, {"-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)"}, {"-upw", argString, userPassword, sizeof(userPassword), @@ -122,6 +126,34 @@ static const ArgDesc argDesc[] = { {NULL} }; +using namespace std; + +string myStringReplace( string inString, string oldToken, string newToken ){ + + size_t foundLoc; + int advance = 0; + do { + foundLoc = inString.find(oldToken, advance); + if (foundLoc != string::npos){ + inString.replace( foundLoc, oldToken.length(), newToken ); + advance = foundLoc + newToken.length(); + } + } while (foundLoc != string::npos ); + return(inString); +} + +string myXmlTokenReplace( char* inString ){ + string myString(inString); + myString = myStringReplace(myString,string("&"),string("&amp;") ); + myString = myStringReplace(myString,string("'"),string("&apos;") ); + myString = myStringReplace(myString,string("\""),string("&quot;") ); + myString = myStringReplace(myString,string("<"),string("&lt;") ); + myString = myStringReplace(myString,string(">"),string("&gt;") ); + return(myString); +} + + + int main(int argc, char *argv[]) { PDFDoc *doc; GooString *fileName; @@ -139,6 +171,9 @@ int main(int 
argc, char *argv[]) { // parse args ok = parseArgs(argDesc, &argc, argv); + if (bbox){ + htmlMeta = gTrue; + } if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) { fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION); fprintf(stderr, "%s\n", popplerCopyright); @@ -257,57 +292,98 @@ int main(int argc, char *argv[]) { goto err3; } } - fputs("\n", f); + fputs("",f); + fputs("\n", f); fputs("\n", f); doc->getDocInfo(&info); if (info.isDict()) { - printInfoString(f, info.getDict(), "Title", "", "\n", - uMap); + Object obj; + if (info.getDict()->lookup("Title", &obj)->isString()) { + printInfoString(f, info.getDict(), "Title", "", "\n", + uMap); + } else { + fputs("\n", f); + } printInfoString(f, info.getDict(), "Subject", - "\n", uMap); + "\n", uMap); printInfoString(f, info.getDict(), "Keywords", - "\n", uMap); + "\n", uMap); printInfoString(f, info.getDict(), "Author", - "\n", uMap); + "\n", uMap); printInfoString(f, info.getDict(), "Creator", - "\n", uMap); + "\n", uMap); printInfoString(f, info.getDict(), "Producer", - "\n", uMap); + "\n", uMap); printInfoDate(f, info.getDict(), "CreationDate", - "\n"); + "\n"); printInfoDate(f, info.getDict(), "LastModifiedDate", - "\n"); + "\n"); } info.free(); fputs("\n", f); fputs("\n", f); - fputs("
\n", f);

+    if (!bbox) fputs("<pre>\n", f);

     if (f != stdout) {

       fclose(f);

     }

   }

 

   // write text file

-  textOut = new TextOutputDev(textFileName->getCString(),

-			      physLayout, rawOrder, htmlMeta);

-  if (textOut->isOk()) {

-    if ((w==0) && (h==0) && (x==0) && (y==0)) {

-      doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,

-			gTrue, gFalse, gFalse);

-    } else {

-      int page;

-      

-      for (page = firstPage; page <= lastPage; ++page) {

+  if (bbox) {

+    textOut = new TextOutputDev(NULL,

+				physLayout, rawOrder, htmlMeta);

+    if (!(f = fopen(textFileName->getCString(), "ab"))) {

+      error(-1, "Couldn't open text file '%s' for append", textFileName->getCString());

+      exitCode = 2;

+      goto err3;

+    }

+    

+    if (textOut->isOk()) {

+      fprintf(f, "<doc>\n");

+      for (int page = firstPage; page <= lastPage; ++page) {

+	fprintf(f, "  <page width=\"%f\" height=\"%f\">\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page) );

+	doc->displayPage(textOut, page, resolution, resolution, 0,

+			 gTrue, gFalse, gFalse);

+	TextWordList *wordlist;

+	wordlist = textOut->makeWordList();

+	int word_length = wordlist->getLength();

+	TextWord *word;

+	double xMinA, yMinA, xMaxA, yMaxA;

+	if (!word_length)

+	  fprintf(stderr, "no word list\n");

+	

+	for (int i=0; i < word_length; i++){

+	  word = wordlist->get(i);

+	  word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA);

+	  string myString = myXmlTokenReplace( (char*) word->getText() );

+	  fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, (char*) myString.c_str() );

+	}

+	fprintf(f, "  </page>\n");

+      }

+      fprintf(f, "</doc>\n");      

+    }

+    fclose(f);

+    

+  } else {

+    textOut = new TextOutputDev(textFileName->getCString(),

+				physLayout, rawOrder, htmlMeta);

+    if (textOut->isOk()) {

+      if ((w==0) && (h==0) && (x==0) && (y==0)) {

+	doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,

+			  gTrue, gFalse, gFalse);

+      } else {

+	

+	for (int page = firstPage; page <= lastPage; ++page) {

 	doc->displayPageSlice(textOut, page, resolution, resolution, 0,

 			      gTrue, gFalse, gFalse, 

 			      x, y, w, h);

-      }	

-    }	

-

-  } else {

+	}

+      }

+    } else {

     delete textOut;

     exitCode = 2;

     goto err3;

+    }

   }

   delete textOut;

 

@@ -322,7 +398,7 @@ int main(int argc, char *argv[]) {

 	goto err3;

       }

     }

-    fputs("</pre>\n", f);
+    if (!bbox) fputs("</pre>\n", f);
     fputs("</body>\n", f);
     fputs("</html>\n", f);
     if (f != stdout) {