diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index d4e004b..889a6c4 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -61,6 +61,7 @@ static int x = 0; static int y = 0; static int w = 0; static int h = 0; +static int bbox = 0; static GBool physLayout = gFalse; static GBool rawOrder = gFalse; static GBool htmlMeta = gFalse; @@ -103,6 +104,8 @@ static const ArgDesc argDesc[] = { "output end-of-line convention (unix, dos, or mac)"}, {"-nopgbrk", argFlag, &noPageBreaks, 0, "don't insert page breaks between pages"}, + {"-bbox", argFlag, &bbox, 0, + "output bounding box for each word and page size to html. Sets -htmlmeta"}, {"-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)"}, {"-upw", argString, userPassword, sizeof(userPassword), @@ -122,6 +125,47 @@ static const ArgDesc argDesc[] = { {NULL} }; +char* str_replace(char *str, char *oldstr, char *newstr) { + int i, count = 0; + int newlen = strlen(newstr); + int oldlen = strlen(oldstr); + + for (i = 0; str[i]; ++i) + if (strstr(&str[i], oldstr) == &str[i]) + ++count, i += oldlen - 1; + + char *ret = (char *) calloc(i + 1 + count * (newlen - oldlen), sizeof(char)); + + i = 0; + while (*str) + if (strstr(str, oldstr) == str) + strcpy(&ret[i], newstr), + i += newlen, + str += oldlen; + else + ret[i++] = *str++; + + ret[i] = '\0'; + + return ret; +} + + +char* myXmlTokenReplace( char* inString ){ + char* new0 = str_replace(inString,"&","&"); + char* new1 = str_replace(new0,"'","'"); + free(new0); + new0 = str_replace(new1,"\"","""); + free(new1); + new1 = str_replace(new0,"<","<"); + free(new0); + new0 = str_replace(new1,">",">"); + free(new1); + return( new0 ); +} + + + int main(int argc, char *argv[]) { PDFDoc *doc; GooString *fileName; @@ -139,6 +183,9 @@ int main(int argc, char *argv[]) { // parse args ok = parseArgs(argDesc, &argc, argv); + if (bbox){ + htmlMeta = gTrue; + } if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) { fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION); fprintf(stderr, "%s\n", popplerCopyright); @@ -257,57 +304,99 @@ int main(int argc, char *argv[]) { goto err3; } } - fputs("\n", f); + fputs("",f); + fputs("\n", f); fputs("
\n", f); doc->getDocInfo(&info); if (info.isDict()) { - printInfoString(f, info.getDict(), "Title", "\n", f); + if (!bbox) fputs("\n", f); fputs("\n", f); fputs("\n", f); if (f != stdout) {\n", f); if (f != stdout) { fclose(f); } } // write text file - textOut = new TextOutputDev(textFileName->getCString(), - physLayout, rawOrder, htmlMeta); - if (textOut->isOk()) { - if ((w==0) && (h==0) && (x==0) && (y==0)) { - doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0, - gTrue, gFalse, gFalse); - } else { - int page; - - for (page = firstPage; page <= lastPage; ++page) { + if (bbox) { + textOut = new TextOutputDev("/dev/null", + physLayout, rawOrder, htmlMeta); + if (!(f = fopen(textFileName->getCString(), "ab"))) { + error(-1, "Couldn't open text file '%s' for append", textFileName->getCString()); + exitCode = 2; + goto err3; + } + + if (textOut->isOk()) { + fprintf(f, "\n", f); + if (!bbox) fputs("\n"); + for (int page = firstPage; page <= lastPage; ++page) { + fprintf(f, " \n"); + } + fclose(f); + + } else { + textOut = new TextOutputDev(textFileName->getCString(), + physLayout, rawOrder, htmlMeta); + if (textOut->isOk()) { + if ((w==0) && (h==0) && (x==0) && (y==0)) { + doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0, + gTrue, gFalse, gFalse); + } else { + + for (int page = firstPage; page <= lastPage; ++page) { doc->displayPageSlice(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse, x, y, w, h); - } - } - - } else { + } + } + } else { delete textOut; exitCode = 2; goto err3; + } } delete textOut; @@ -322,7 +411,7 @@ int main(int argc, char *argv[]) { goto err3; } } - fputs("\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page) ); + doc->displayPage(textOut, page, resolution, resolution, 0, + gTrue, gFalse, gFalse); + TextWordList *wordlist; + wordlist = textOut->makeWordList(); + int word_length = wordlist->getLength(); + TextWord *word; + double xMinA, yMinA, xMaxA, yMaxA; + if (!word_length) + fprintf(stderr, "no word list\n"); + + for (int i=0; i < word_length; i++){ + word = wordlist->get(i); + word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA); + char* replacedText = myXmlTokenReplace( (char*) word->getText() ); + fprintf(f," \n"); + } + fprintf(f, "%s \n", xMinA, yMinA, xMaxA, yMaxA, replacedText ); + free(replacedText); + } + fprintf(f, "