[poppler] utils/pdftotext.1 utils/pdftotext.cc
Albert Astals Cid
aacid at kemper.freedesktop.org
Mon Aug 31 15:23:32 PDT 2015
utils/pdftotext.1 | 4 +
utils/pdftotext.cc | 115 +++++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 99 insertions(+), 20 deletions(-)
New commits:
commit 911d9fc8d85b776418039b4eebb37200a0987554
Author: Jeremy Echols <jechols at uoregon.edu>
Date: Tue Sep 1 00:22:28 2015 +0200
pdftotext: Add -bbox-layout option
Adds layout information for blocks and lines in addition to words
Bug #89941
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 0199b03..b53f82f 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -71,6 +71,10 @@ headers.
Generate an XHTML file containing bounding box information for each
word in the file.
.TP
+.B \-bbox-layout
+Generate an XHTML file containing bounding box information for each
+block, line, and word in the file.
+.TP
.BI \-enc " encoding-name"
Sets the encoding to use for text output. This defaults to "UTF-8".
.TP
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 15c741d..d931a96 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -24,6 +24,7 @@
// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
// Copyright (C) 2013 Yury G. Kudryashov <urkud.urkud at gmail.com>
// Copyright (C) 2013 Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
+// Copyright (C) 2015 Jeremy Echols <jechols at uoregon.edu>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
@@ -56,10 +57,14 @@
#include "PDFDocEncoding.h"
#include "Error.h"
#include <string>
+#include <sstream>
+#include <iomanip>
static void printInfoString(FILE *f, Dict *infoDict, const char *key,
const char *text1, const char *text2, UnicodeMap *uMap);
static void printInfoDate(FILE *f, Dict *infoDict, const char *key, const char *fmt);
+void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
+void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
static int firstPage = 1;
static int lastPage = 0;
@@ -69,6 +74,7 @@ static int y = 0;
static int w = 0;
static int h = 0;
static GBool bbox = gFalse;
+static GBool bboxLayout = gFalse;
static GBool physLayout = gFalse;
static double fixedPitch = 0;
static GBool rawOrder = gFalse;
@@ -116,6 +122,8 @@ static const ArgDesc argDesc[] = {
"don't insert page breaks between pages"},
{"-bbox", argFlag, &bbox, 0,
"output bounding box for each word and page size to html. Sets -htmlmeta"},
+ {"-bbox-layout", argFlag, &bboxLayout, 0,
+ "like -bbox but with extra layout bounding box data. Sets -htmlmeta"},
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
@@ -176,6 +184,9 @@ int main(int argc, char *argv[]) {
// parse args
ok = parseArgs(argDesc, &argc, argv);
+ if (bboxLayout) {
+ bbox = gTrue;
+ }
if (bbox) {
htmlMeta = gTrue;
}
@@ -352,27 +363,12 @@ int main(int argc, char *argv[]) {
textOut = new TextOutputDev(NULL, physLayout, fixedPitch, rawOrder, htmlMeta);
if (textOut->isOk()) {
- fprintf(f, "<doc>\n");
- for (int page = firstPage; page <= lastPage; ++page) {
- fprintf(f, " <page width=\"%f\" height=\"%f\">\n",doc->getPageMediaWidth(page), doc->getPageMediaHeight(page));
- doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse);
- TextWordList *wordlist = textOut->makeWordList();
- const int word_length = wordlist != NULL ? wordlist->getLength() : 0;
- TextWord *word;
- double xMinA, yMinA, xMaxA, yMaxA;
- if (word_length == 0)
- fprintf(stderr, "no word list\n");
-
- for (int i = 0; i < word_length; ++i) {
- word = wordlist->get(i);
- word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA);
- const std::string myString = myXmlTokenReplace(word->getText()->getCString());
- fprintf(f," <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str());
- }
- fprintf(f, " </page>\n");
- delete wordlist;
+ if (bboxLayout) {
+ printDocBBox(f, doc, textOut, firstPage, lastPage);
+ }
+ else {
+ printWordBBox(f, doc, textOut, firstPage, lastPage);
}
- fprintf(f, "</doc>\n");
}
if (f != stdout) {
fclose(f);
@@ -492,3 +488,82 @@ static void printInfoDate(FILE *f, Dict *infoDict, const char *key, const char *
}
obj.free();
}
+
+// Writes one <line> element for `line` to f: the line's bounding box as
+// attributes, followed by one <word> element (bounding box plus the text
+// passed through myXmlTokenReplace) for every word on the line.
+//
+// The line box is the union of the word boxes.  It is seeded from the first
+// word rather than treating 0 as a "not set yet" marker, so a word whose
+// minimum coordinate is exactly 0, or whose coordinates are negative, still
+// produces the correct extent.  A line with no words is reported as 0/0/0/0.
+void printLine(FILE *f, TextLine *line) {
+  double xMin, yMin, xMax, yMax;
+  double lineXMin = 0, lineYMin = 0, lineXMax = 0, lineYMax = 0;
+  bool haveWord = false;
+  TextWord *word;
+  std::stringstream wordXML;
+  // Fixed notation with six decimals, i.e. the same shape "%f" produces in
+  // the fprintf calls used for the other elements.
+  wordXML << std::fixed << std::setprecision(6);
+
+  for (word = line->getWords(); word; word = word->getNext()) {
+    word->getBBox(&xMin, &yMin, &xMax, &yMax);
+
+    if (!haveWord) {
+      // First word: its box is the line box so far.
+      lineXMin = xMin;
+      lineYMin = yMin;
+      lineXMax = xMax;
+      lineYMax = yMax;
+      haveWord = true;
+    } else {
+      if (xMin < lineXMin) lineXMin = xMin;
+      if (yMin < lineYMin) lineYMin = yMin;
+      if (xMax > lineXMax) lineXMax = xMax;
+      if (yMax > lineYMax) lineYMax = yMax;
+    }
+
+    // NOTE(review): the object returned by getText() is never released here;
+    // if the caller owns it this leaks once per word -- confirm ownership
+    // against TextWord::getText() before adding a delete.
+    const std::string myString = myXmlTokenReplace(word->getText()->getCString());
+    wordXML << " <word xMin=\"" << xMin << "\" yMin=\"" << yMin << "\" xMax=\"" <<
+      xMax << "\" yMax=\"" << yMax << "\">" << myString << "</word>\n";
+  }
+  // The words were buffered because the opening <line> tag needs the final
+  // box, which is only known after every word has been visited.
+  fprintf(f, " <line xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">\n",
+          lineXMin, lineYMin, lineXMax, lineYMax);
+  fputs(wordXML.str().c_str(), f);
+  fputs(" </line>\n", f);
+}
+
+// Writes a <doc> element to f describing pages first..last (inclusive).
+// Every page is run through doc->displayPage() with textOut as the output
+// device; the text taken from textOut is then written as nested <flow> and
+// <block> elements, each block carrying its bounding box, with the lines of
+// a block emitted by printLine().
+void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last) {
+  fprintf(f, "<doc>\n");
+
+  for (int pageNum = first; pageNum <= last; ++pageNum) {
+    fprintf(f, " <page width=\"%f\" height=\"%f\">\n",
+            doc->getPageMediaWidth(pageNum),
+            doc->getPageMediaHeight(pageNum));
+    doc->displayPage(textOut, pageNum, resolution, resolution, 0, gTrue, gFalse, gFalse);
+
+    TextPage *pageText = textOut->takeText();
+    TextFlow *curFlow = pageText->getFlows();
+    while (curFlow) {
+      fprintf(f, " <flow>\n");
+
+      TextBlock *curBlock = curFlow->getBlocks();
+      while (curBlock) {
+        double bxMin, byMin, bxMax, byMax;
+        curBlock->getBBox(&bxMin, &byMin, &bxMax, &byMax);
+        fprintf(f, " <block xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">\n", bxMin, byMin, bxMax, byMax);
+
+        TextLine *curLine = curBlock->getLines();
+        while (curLine) {
+          printLine(f, curLine);
+          curLine = curLine->getNext();
+        }
+
+        fprintf(f, " </block>\n");
+        curBlock = curBlock->getNext();
+      }
+
+      fprintf(f, " </flow>\n");
+      curFlow = curFlow->getNext();
+    }
+
+    fprintf(f, " </page>\n");
+    // Done with this page's text: drop the reference obtained from takeText().
+    pageText->decRefCnt();
+  }
+
+  fprintf(f, "</doc>\n");
+}
+
+// Writes a <doc> element to f describing pages first..last (inclusive) at
+// word granularity.  Every page is run through doc->displayPage() with
+// textOut as the output device, then each entry of the word list built by
+// textOut is written as a <word> element with its bounding box and its text
+// passed through myXmlTokenReplace.  "no word list" is printed on stderr for
+// a page whose list is missing or empty.
+void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last) {
+  fprintf(f, "<doc>\n");
+
+  for (int pageNum = first; pageNum <= last; ++pageNum) {
+    fprintf(f, " <page width=\"%f\" height=\"%f\">\n",
+            doc->getPageMediaWidth(pageNum),
+            doc->getPageMediaHeight(pageNum));
+    doc->displayPage(textOut, pageNum, resolution, resolution, 0, gTrue, gFalse, gFalse);
+
+    TextWordList *words = textOut->makeWordList();
+    int count = 0;
+    if (words != NULL) {
+      count = words->getLength();
+    }
+    if (count == 0) {
+      fprintf(stderr, "no word list\n");
+    }
+
+    for (int idx = 0; idx < count; ++idx) {
+      TextWord *current = words->get(idx);
+      double x0, y0, x1, y1;
+      current->getBBox(&x0, &y0, &x1, &y1);
+      const std::string escaped = myXmlTokenReplace(current->getText()->getCString());
+      fprintf(f," <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", x0, y0, x1, y1, escaped.c_str());
+    }
+
+    fprintf(f, " </page>\n");
+    // The list returned by makeWordList() is deleted once the page is written.
+    delete words;
+  }
+
+  fprintf(f, "</doc>\n");
+}
More information about the poppler
mailing list