[poppler] poppler/TextOutputDev.cc poppler/TextOutputDev.h utils/pdftotext.1 utils/pdftotext.cc
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Tue Dec 7 23:32:02 UTC 2021
poppler/TextOutputDev.cc | 14 +++++++++++---
poppler/TextOutputDev.h | 6 ++++++
utils/pdftotext.1 | 3 +++
utils/pdftotext.cc | 9 +++++++++
4 files changed, 29 insertions(+), 3 deletions(-)
New commits:
commit f20d9e5f739b7c8dce74ebc60a6dd1e06106c12e
Author: Nelson Benítez León <nbenitezl at gmail.com>
Date: Sun Jul 11 14:08:58 2021 -0400
TextOutputDev: require more spacing between columns
Require more spacing for adjacent text to be
considered a separate column of text.
We do that by increasing the 'minColSpacing1' parameter,
which sets the maximum horizontal distance within which an
adjacent word will be pulled into the current block.
We provide a way to override the default value per TextOutputDev instance:
double getMinColSpacing1();
void setMinColSpacing1(double val);
Fixes issue #1093
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 5dc37c93..67a6246d 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -122,8 +122,9 @@
#define maxWordSpacing 1.5
// Maximum horizontal spacing which will allow a word to be pulled
-// into a block.
-#define minColSpacing1 0.3
+// into a block, as a fraction of the font size.
+// This default value can be tweaked via API.
+double TextOutputDev::minColSpacing1_default = 0.7;
// Minimum spacing between columns, as a fraction of the font size.
#define minColSpacing2 1.0
@@ -2814,6 +2815,11 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
}
void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
+{
+ coalesce(physLayout, fixedPitch, doHTML, TextOutputDev::minColSpacing1_default);
+}
+
+void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1)
{
TextWord *word0, *word1, *word2;
TextLine *line;
@@ -5605,6 +5611,7 @@ TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA, double fixe
textEOL = defaultEndOfLine();
textPageBreaks = true;
ok = true;
+ minColSpacing1 = minColSpacing1_default;
// open file
needClose = false;
@@ -5648,6 +5655,7 @@ TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA
textEOL = defaultEndOfLine();
textPageBreaks = true;
ok = true;
+ minColSpacing1 = minColSpacing1_default;
}
TextOutputDev::~TextOutputDev()
@@ -5669,7 +5677,7 @@ void TextOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
void TextOutputDev::endPage()
{
text->endPage();
- text->coalesce(physLayout, fixedPitch, doHTML);
+ text->coalesce(physLayout, fixedPitch, doHTML, minColSpacing1);
if (outputStream) {
text->dump(outputStream, outputFunc, physLayout, textEOL, textPageBreaks);
}
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 984507ba..9df36278 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -596,6 +596,7 @@ public:
// Coalesce strings that look like parts of the same line.
void coalesce(bool physLayout, double fixedPitch, bool doHTML);
+ void coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1);
// Find a string. If <startAtTop> is true, starts looking at the
// top of the page; else if <startAtLast> is true, starts looking
@@ -756,6 +757,8 @@ private:
class POPPLER_PRIVATE_EXPORT TextOutputDev : public OutputDev
{
public:
+ static double minColSpacing1_default;
+
// Open a text output file. If <fileName> is NULL, no file is
// written (this is useful, e.g., for searching text). If
// <physLayoutA> is true, the original physical layout of the text
@@ -885,6 +888,8 @@ public:
}
void setTextEOL(EndOfLineKind textEOLA) { textEOL = textEOLA; }
void setTextPageBreaks(bool textPageBreaksA) { textPageBreaks = textPageBreaksA; }
+ double getMinColSpacing1() const { return minColSpacing1; }
+ void setMinColSpacing1(double val) { minColSpacing1 = val; }
private:
TextOutputFunc outputFunc; // output function
@@ -897,6 +902,7 @@ private:
double fixedPitch; // if physLayout is true and this is non-zero,
// assume fixed-pitch characters with this
// width
+ double minColSpacing1; // see default value defined with same name at TextOutputDev.cc
bool rawOrder; // keep text in content stream order
bool discardDiag; // Diagonal text, i.e., text that is not close to one of the
// 0, 90, 180, or 270 degree axes, is discarded. This is useful
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 3ae217b4..39163389 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -85,6 +85,9 @@ block, line, and word in the file.
.B \-cropbox
Use the crop box rather than the media box with \-bbox and \-bbox-layout.
.TP
+.BI \-colspacing " number"
+Specifies how much spacing we allow after a word before considering adjacent text to be a new column, measured as a fraction of the font size. The current default is 0.7; old releases had a default of 0.3.
+.TP
.BI \-enc " encoding-name"
Sets the encoding to use for text output. This defaults to "UTF-8".
.TP
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 7b45359f..0caca87f 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -84,6 +84,7 @@ static bool bbox = false;
static bool bboxLayout = false;
static bool physLayout = false;
static bool useCropBox = false;
+static double colspacing = TextOutputDev::minColSpacing1_default;
static double fixedPitch = 0;
static bool rawOrder = false;
static bool discardDiag = false;
@@ -117,6 +118,8 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to
{ "-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html. Sets -htmlmeta" },
{ "-bbox-layout", argFlag, &bboxLayout, 0, "like -bbox but with extra layout bounding box data. Sets -htmlmeta" },
{ "-cropbox", argFlag, &useCropBox, 0, "use the crop box rather than media box" },
+ { "-colspacing", argFP, &colspacing, 0,
+ "how much spacing we allow after a word before considering adjacent text to be a new column, as a fraction of the font size (default is 0.7, old releases had a 0.3 default)" },
{ "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
{ "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
{ "-q", argFlag, &quiet, 0, "don't print any messages or errors" },
@@ -178,6 +181,10 @@ int main(int argc, char *argv[])
if (bbox) {
htmlMeta = true;
}
+ if (colspacing <= 0 || colspacing > 10) {
+ error(errCommandLine, -1, "Bogus value provided for -colspacing");
+ goto err1;
+ }
if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) {
fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION);
fprintf(stderr, "%s\n", popplerCopyright);
@@ -342,6 +349,7 @@ int main(int argc, char *argv[])
if (textOut->isOk()) {
textOut->setTextEOL(textEOL);
+ textOut->setMinColSpacing1(colspacing);
if (noPageBreaks) {
textOut->setTextPageBreaks(false);
}
@@ -358,6 +366,7 @@ int main(int argc, char *argv[])
textOut = new TextOutputDev(textFileName->c_str(), physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
if (textOut->isOk()) {
textOut->setTextEOL(textEOL);
+ textOut->setMinColSpacing1(colspacing);
if (noPageBreaks) {
textOut->setTextPageBreaks(false);
}
More information about the poppler
mailing list