[poppler] utils/pdftotext.1 utils/pdftotext.cc
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Mon Feb 8 23:29:06 UTC 2021
utils/pdftotext.1 | 3 +++
utils/pdftotext.cc | 14 ++++++++++----
2 files changed, 13 insertions(+), 4 deletions(-)
New commits:
commit 2f40575018d75a1412f5c4f8616dfe26d46f504e
Author: William Bader <williambader at hotmail.com>
Date: Mon Feb 8 23:29:05 2021 +0000
Add pdftotext -cropbox option
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index ea2874f7..3ae217b4 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -82,6 +82,9 @@ word in the file.
Generate an XHTML file containing bounding box information for each
block, line, and word in the file.
.TP
+.B \-cropbox
+Use the crop box rather than the media box with \-bbox and \-bbox\-layout.
+.TP
.BI \-enc " encoding-name"
Sets the encoding to use for text output. This defaults to "UTF-8".
.TP
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index f17cfd53..4cb18dfd 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -82,6 +82,7 @@ static int h = 0;
static bool bbox = false;
static bool bboxLayout = false;
static bool physLayout = false;
+static bool useCropBox = false;
static double fixedPitch = 0;
static bool rawOrder = false;
static bool discardDiag = false;
@@ -114,6 +115,7 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to
{ "-nopgbrk", argFlag, &noPageBreaks, 0, "don't insert page breaks between pages" },
{ "-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html. Sets -htmlmeta" },
{ "-bbox-layout", argFlag, &bboxLayout, 0, "like -bbox but with extra layout bounding box data. Sets -htmlmeta" },
+ { "-cropbox", argFlag, &useCropBox, 0, "use the crop box rather than media box" },
{ "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
{ "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
{ "-q", argFlag, &quiet, 0, "don't print any messages or errors" },
@@ -496,8 +498,10 @@ void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int l
fprintf(f, "<doc>\n");
for (int page = first; page <= last; ++page) {
- fprintf(f, " <page width=\"%f\" height=\"%f\">\n", doc->getPageMediaWidth(page), doc->getPageMediaHeight(page));
- doc->displayPage(textOut, page, resolution, resolution, 0, true, false, false);
+ const double wid = useCropBox ? doc->getPageCropWidth(page) : doc->getPageMediaWidth(page);
+ const double hgt = useCropBox ? doc->getPageCropHeight(page) : doc->getPageMediaHeight(page);
+ fprintf(f, " <page width=\"%f\" height=\"%f\">\n", wid, hgt);
+ doc->displayPage(textOut, page, resolution, resolution, 0, !useCropBox, useCropBox, false);
for (flow = textOut->getFlows(); flow; flow = flow->getNext()) {
fprintf(f, " <flow>\n");
for (blk = flow->getBlocks(); blk; blk = blk->getNext()) {
@@ -519,8 +523,10 @@ void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int
{
fprintf(f, "<doc>\n");
for (int page = first; page <= last; ++page) {
- fprintf(f, " <page width=\"%f\" height=\"%f\">\n", doc->getPageMediaWidth(page), doc->getPageMediaHeight(page));
- doc->displayPage(textOut, page, resolution, resolution, 0, true, false, false);
+ double wid = useCropBox ? doc->getPageCropWidth(page) : doc->getPageMediaWidth(page);
+ double hgt = useCropBox ? doc->getPageCropHeight(page) : doc->getPageMediaHeight(page);
+ fprintf(f, " <page width=\"%f\" height=\"%f\">\n", wid, hgt);
+ doc->displayPage(textOut, page, resolution, resolution, 0, !useCropBox, useCropBox, false);
TextWordList *wordlist = textOut->makeWordList();
const int word_length = wordlist != nullptr ? wordlist->getLength() : 0;
TextWord *word;
More information about the poppler
mailing list