[poppler] poppler/TextOutputDev.cc poppler/TextOutputDev.h utils/pdftotext.1 utils/pdftotext.cc
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Thu Aug 1 22:11:46 UTC 2019
poppler/TextOutputDev.cc | 39 +++++++++++++++++++++++++++++++++------
poppler/TextOutputDev.h | 17 ++++++++++++-----
utils/pdftotext.1 | 5 +++++
utils/pdftotext.cc | 7 +++++--
4 files changed, 55 insertions(+), 13 deletions(-)
New commits:
commit 54f799e6fda99cf0cc826884247d92c6dc36d8e7
Author: Dan Shea <7741-dshea at users.noreply.gitlab.freedesktop.org>
Date: Thu Aug 1 22:11:44 2019 +0000
Add pdftotext -nodiag flag to remove diagonal text on output
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 645e38fd..f2569fbd 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -177,6 +177,10 @@
#define combMaxMidDelta 0.3
#define combMaxBaseDelta 0.4
+// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold
+// (or abs(1/tan(angle)) > diagonalThreshold for text near 90/270 degrees).
+#define diagonalThreshold 0.1
+
namespace {
inline bool isAscii7 (Unicode uchar) {
@@ -2357,11 +2361,12 @@ TextWord *TextWordList::get(int idx) {
// TextPage
//------------------------------------------------------------------------
-TextPage::TextPage(bool rawOrderA) {
+TextPage::TextPage(bool rawOrderA, bool discardDiagA) {
int rot;
refCnt = 1;
rawOrder = rawOrderA;
+ discardDiag = discardDiagA;
curWord = nullptr;
charPos = 0;
curFont = nullptr;
@@ -2384,6 +2389,7 @@ TextPage::TextPage(bool rawOrderA) {
underlines = new std::vector<TextUnderline*>();
links = new std::vector<TextLink*>();
mergeCombining = true;
+ diagonal = false;
}
TextPage::~TextPage() {
@@ -2470,6 +2476,7 @@ void TextPage::clear() {
}
delete links;
+ diagonal = false;
curWord = nullptr;
charPos = 0;
curFont = nullptr;
@@ -2592,6 +2599,11 @@ void TextPage::beginWord(GfxState *state) {
} else {
rot = (m[2] > 0) ? 1 : 3;
}
+ if (fabs(m[0]) >= fabs(m[1])) {
+ diagonal = fabs(m[1]) > diagonalThreshold * fabs(m[0]);
+ } else {
+ diagonal = fabs(m[0]) > diagonalThreshold * fabs(m[1]);
+ }
// for vertical writing mode, the lines are effectively rotated 90
// degrees
@@ -2720,6 +2732,12 @@ void TextPage::addChar(GfxState *state, double x, double y,
beginWord(state);
}
+ // throw away diagonal chars
+ if (discardDiag && diagonal) {
+ charPos += nBytes;
+ return;
+ }
+
// page rotation and/or transform matrices can cause text to be
// drawn in reverse order -- in this case, swap the begin/end
// coordinates and break text into individual chars
@@ -2729,6 +2747,13 @@ void TextPage::addChar(GfxState *state, double x, double y,
(curWord->rot == 3 && h1 > 0)) {
endWord();
beginWord(state);
+
+ // throw away diagonal chars
+ if (discardDiag && diagonal) {
+ charPos += nBytes;
+ return;
+ }
+
x1 += w1;
y1 += h1;
w1 = -w1;
@@ -5648,11 +5673,12 @@ static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA,
double fixedPitchA, bool rawOrderA,
- bool append) {
+ bool append, bool discardDiagA) {
text = nullptr;
physLayout = physLayoutA;
fixedPitch = physLayout ? fixedPitchA : 0;
rawOrder = rawOrderA;
+ discardDiag = discardDiagA;
doHTML = false;
ok = true;
@@ -5679,21 +5705,22 @@ TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA,
}
// set up text object
- text = new TextPage(rawOrderA);
+ text = new TextPage(rawOrderA, discardDiagA);
actualText = new ActualText(text);
}
TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
bool physLayoutA, double fixedPitchA,
- bool rawOrderA) {
+ bool rawOrderA, bool discardDiagA) {
outputFunc = func;
outputStream = stream;
needClose = false;
physLayout = physLayoutA;
fixedPitch = physLayout ? fixedPitchA : 0;
rawOrder = rawOrderA;
+ discardDiag = discardDiagA;
doHTML = false;
- text = new TextPage(rawOrderA);
+ text = new TextPage(rawOrderA, discardDiagA);
actualText = new ActualText(text);
ok = true;
}
@@ -5961,7 +5988,7 @@ TextPage *TextOutputDev::takeText() {
TextPage *ret;
ret = text;
- text = new TextPage(rawOrder);
+ text = new TextPage(rawOrder, discardDiag);
return ret;
}
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 3ff1754a..7a29c8a0 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -553,7 +553,7 @@ class TextPage {
public:
// Constructor.
- TextPage(bool rawOrderA);
+ TextPage(bool rawOrderA, bool discardDiagA = false);
TextPage(const TextPage &) = delete;
TextPage& operator=(const TextPage &) = delete;
@@ -685,6 +685,7 @@ private:
int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
bool rawOrder; // keep text in content stream order
+ bool discardDiag; // discard diagonal text
bool mergeCombining; // merge when combining and base characters
// are drawn on top of each other
@@ -698,6 +699,7 @@ private:
int nTinyChars; // number of "tiny" chars seen so far
bool lastCharOverlap; // set if the last added char overlapped the
// previous char
+ bool diagonal; // whether the current text is diagonal
TextPool *pools[4]; // a "pool" of TextWords for each rotation
TextFlow *flows; // linked list of flows
@@ -772,18 +774,20 @@ public:
// written (this is useful, e.g., for searching text). If
// <physLayoutA> is true, the original physical layout of the text
// is maintained. If <rawOrder> is true, the text is kept in
- // content stream order.
+ // content stream order. If <discardDiag> is true, diagonal text
+ // is removed from output.
TextOutputDev(const char *fileName, bool physLayoutA,
double fixedPitchA, bool rawOrderA,
- bool append);
+ bool append, bool discardDiagA = false);
// Create a TextOutputDev which will write to a generic stream. If
// <physLayoutA> is true, the original physical layout of the text
// is maintained. If <rawOrder> is true, the text is kept in
- // content stream order.
+ // content stream order. If <discardDiag> is true, diagonal text
+ // is removed from output.
TextOutputDev(TextOutputFunc func, void *stream,
bool physLayoutA, double fixedPitchA,
- bool rawOrderA);
+ bool rawOrderA, bool discardDiagA = false);
// Destructor.
~TextOutputDev();
@@ -920,6 +924,9 @@ private:
// assume fixed-pitch characters with this
// width
bool rawOrder; // keep text in content stream order
+ bool discardDiag; // If true, diagonal text (i.e., text that is not close to one of
+ // the 0, 90, 180, or 270 degree axes) is discarded. This is useful
+ // to skip watermarks drawn on top of body text, etc.
bool doHTML; // extra processing for HTML conversion
bool ok; // set up ok?
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index f1a0cb41..dd114e2c 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -62,6 +62,11 @@ Keep the text in content stream order. This is a hack which often
"undoes" column formatting, etc. Use of raw mode is no longer
recommended.
.TP
+.B \-nodiag
+Discard diagonal text (i.e., text that is not close to one of the
+0, 90, 180, or 270 degree axes). This is useful for skipping
+watermarks drawn on top of body text.
+.TP
.B \-htmlmeta
Generate a simple HTML file, including the meta information. This
simply wraps the text in <pre> and </pre> and prepends the meta
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 34b8f87d..88154ac3 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -83,6 +83,7 @@ static bool bboxLayout = false;
static bool physLayout = false;
static double fixedPitch = 0;
static bool rawOrder = false;
+static bool discardDiag = false;
static bool htmlMeta = false;
static char textEncName[128] = "";
static char textEOL[16] = "";
@@ -115,6 +116,8 @@ static const ArgDesc argDesc[] = {
"assume fixed-pitch (or tabular) text"},
{"-raw", argFlag, &rawOrder, 0,
"keep strings in content stream order"},
+ {"-nodiag", argFlag, &discardDiag, 0,
+ "discard diagonal text"},
{"-htmlmeta", argFlag, &htmlMeta, 0,
"generate a simple HTML file, including the meta information"},
{"-enc", argString, textEncName, sizeof(textEncName),
@@ -363,7 +366,7 @@ int main(int argc, char *argv[]) {
// write text file
if (htmlMeta && bbox) { // htmlMeta && is superfluous but makes gcc happier
- textOut = new TextOutputDev(nullptr, physLayout, fixedPitch, rawOrder, htmlMeta);
+ textOut = new TextOutputDev(nullptr, physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
if (textOut->isOk()) {
if (bboxLayout) {
@@ -378,7 +381,7 @@ int main(int argc, char *argv[]) {
}
} else {
textOut = new TextOutputDev(textFileName->c_str(),
- physLayout, fixedPitch, rawOrder, htmlMeta);
+ physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
if (textOut->isOk()) {
if ((w==0) && (h==0) && (x==0) && (y==0)) {
doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
More information about the poppler
mailing list