[poppler] poppler/TextOutputDev.cc poppler/TextOutputDev.h utils/pdftotext.1 utils/pdftotext.cc

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Aug 1 22:11:46 UTC 2019


 poppler/TextOutputDev.cc |   39 +++++++++++++++++++++++++++++++++------
 poppler/TextOutputDev.h  |   17 ++++++++++++-----
 utils/pdftotext.1        |    5 +++++
 utils/pdftotext.cc       |    7 +++++--
 4 files changed, 55 insertions(+), 13 deletions(-)

New commits:
commit 54f799e6fda99cf0cc826884247d92c6dc36d8e7
Author: Dan Shea <7741-dshea at users.noreply.gitlab.freedesktop.org>
Date:   Thu Aug 1 22:11:44 2019 +0000

    Add pdftotext -nodiag flag to remove diagonal text on output

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 645e38fd..f2569fbd 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -177,6 +177,10 @@
 #define combMaxMidDelta 0.3
 #define combMaxBaseDelta 0.4
 
+// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold,
+// or abs(1/tan(angle)) > diagonalThreshold for text nearer 90/270 degrees.
+#define diagonalThreshold 0.1
+
 namespace {
 
 inline bool isAscii7 (Unicode uchar) {
@@ -2357,11 +2361,12 @@ TextWord *TextWordList::get(int idx) {
 // TextPage
 //------------------------------------------------------------------------
 
-TextPage::TextPage(bool rawOrderA) {
+TextPage::TextPage(bool rawOrderA, bool discardDiagA) {
   int rot;
 
   refCnt = 1;
   rawOrder = rawOrderA;
+  discardDiag = discardDiagA;
   curWord = nullptr;
   charPos = 0;
   curFont = nullptr;
@@ -2384,6 +2389,7 @@ TextPage::TextPage(bool rawOrderA) {
   underlines = new std::vector<TextUnderline*>();
   links = new std::vector<TextLink*>();
   mergeCombining = true;
+  diagonal = false;
 }
 
 TextPage::~TextPage() {
@@ -2470,6 +2476,7 @@ void TextPage::clear() {
   }
   delete links;
 
+  diagonal = false;
   curWord = nullptr;
   charPos = 0;
   curFont = nullptr;
@@ -2592,6 +2599,11 @@ void TextPage::beginWord(GfxState *state) {
   } else {
     rot = (m[2] > 0) ? 1 : 3;
   }
+  if (fabs(m[0]) >= fabs(m[1]))  {
+    diagonal = fabs(m[1]) > diagonalThreshold * fabs(m[0]);
+  } else {
+    diagonal = fabs(m[0]) > diagonalThreshold * fabs(m[1]);
+  }
 
   // for vertical writing mode, the lines are effectively rotated 90
   // degrees
@@ -2720,6 +2732,12 @@ void TextPage::addChar(GfxState *state, double x, double y,
       beginWord(state);
     }
 
+    // throw away diagonal chars
+    if (discardDiag && diagonal) {
+      charPos += nBytes;
+      return;
+    }
+
     // page rotation and/or transform matrices can cause text to be
     // drawn in reverse order -- in this case, swap the begin/end
     // coordinates and break text into individual chars
@@ -2729,6 +2747,13 @@ void TextPage::addChar(GfxState *state, double x, double y,
         (curWord->rot == 3 && h1 > 0)) {
       endWord();
       beginWord(state);
+
+      // throw away diagonal chars
+      if (discardDiag && diagonal) {
+        charPos += nBytes;
+        return;
+      }
+
       x1 += w1;
       y1 += h1;
       w1 = -w1;
@@ -5648,11 +5673,12 @@ static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
 
 TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA,
 			     double fixedPitchA, bool rawOrderA,
-			     bool append) {
+			     bool append, bool discardDiagA) {
   text = nullptr;
   physLayout = physLayoutA;
   fixedPitch = physLayout ? fixedPitchA : 0;
   rawOrder = rawOrderA;
+  discardDiag = discardDiagA;
   doHTML = false;
   ok = true;
 
@@ -5679,21 +5705,22 @@ TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA,
   }
 
   // set up text object
-  text = new TextPage(rawOrderA);
+  text = new TextPage(rawOrderA, discardDiagA);
   actualText = new ActualText(text);
 }
 
 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
 			     bool physLayoutA, double fixedPitchA,
-			     bool rawOrderA) {
+			     bool rawOrderA, bool discardDiagA) {
   outputFunc = func;
   outputStream = stream;
   needClose = false;
   physLayout = physLayoutA;
   fixedPitch = physLayout ? fixedPitchA : 0;
   rawOrder = rawOrderA;
+  discardDiag = discardDiagA;
   doHTML = false;
-  text = new TextPage(rawOrderA);
+  text = new TextPage(rawOrderA, discardDiagA);
   actualText = new ActualText(text);
   ok = true;
 }
@@ -5961,7 +5988,7 @@ TextPage *TextOutputDev::takeText() {
   TextPage *ret;
 
   ret = text;
-  text = new TextPage(rawOrder);
+  text = new TextPage(rawOrder, discardDiag);
   return ret;
 }
 
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 3ff1754a..7a29c8a0 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -553,7 +553,7 @@ class TextPage {
 public:
 
   // Constructor.
-  TextPage(bool rawOrderA);
+  TextPage(bool rawOrderA, bool discardDiagA = false);
 
   TextPage(const TextPage &) = delete;
   TextPage& operator=(const TextPage &) = delete;
@@ -685,6 +685,7 @@ private:
   int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
 
   bool rawOrder;		// keep text in content stream order
+  bool discardDiag;		// discard diagonal text
   bool mergeCombining;		// merge when combining and base characters
 				// are drawn on top of each other
 
@@ -698,6 +699,7 @@ private:
   int nTinyChars;		// number of "tiny" chars seen so far
   bool lastCharOverlap;	// set if the last added char overlapped the
 				//   previous char
+  bool diagonal;		// whether the current text is diagonal
 
   TextPool *pools[4];		// a "pool" of TextWords for each rotation
   TextFlow *flows;		// linked list of flows
@@ -772,18 +774,20 @@ public:
   // written (this is useful, e.g., for searching text).  If
   // <physLayoutA> is true, the original physical layout of the text
   // is maintained.  If <rawOrder> is true, the text is kept in
-  // content stream order.
+  // content stream order.  If <discardDiag> is true, diagonal text
+  // is removed from output.
   TextOutputDev(const char *fileName, bool physLayoutA,
 		double fixedPitchA, bool rawOrderA,
-		bool append);
+		bool append, bool discardDiagA = false);
 
   // Create a TextOutputDev which will write to a generic stream.  If
   // <physLayoutA> is true, the original physical layout of the text
   // is maintained.  If <rawOrder> is true, the text is kept in
-  // content stream order.
+  // content stream order.  If <discardDiag> is true, diagonal text
+  // is removed from output.
   TextOutputDev(TextOutputFunc func, void *stream,
 		bool physLayoutA, double fixedPitchA,
-		bool rawOrderA);
+		bool rawOrderA, bool discardDiagA = false);
 
   // Destructor.
   ~TextOutputDev();
@@ -920,6 +924,9 @@ private:
 				//   assume fixed-pitch characters with this
 				//   width
   bool rawOrder;		// keep text in content stream order
+  bool discardDiag;		// discard diagonal text, i.e., text that is not close
+				//   to one of the 0, 90, 180, or 270 degree axes; useful
+				//   to skip watermarks drawn on top of body text, etc.
   bool doHTML;			// extra processing for HTML conversion
   bool ok;			// set up ok?
 
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index f1a0cb41..dd114e2c 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -62,6 +62,11 @@ Keep the text in content stream order.  This is a hack which often
 "undoes" column formatting, etc.  Use of raw mode is no longer
 recommended.
 .TP
+.B \-nodiag
+Discard diagonal text (i.e., text that is not close to one of the
+0, 90, 180, or 270 degree axes).  This is useful for skipping
+watermarks drawn on top of body text.
+.TP
 .B \-htmlmeta
 Generate a simple HTML file, including the meta information.  This
 simply wraps the text in <pre> and </pre> and prepends the meta
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 34b8f87d..88154ac3 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -83,6 +83,7 @@ static bool bboxLayout = false;
 static bool physLayout = false;
 static double fixedPitch = 0;
 static bool rawOrder = false;
+static bool discardDiag = false;
 static bool htmlMeta = false;
 static char textEncName[128] = "";
 static char textEOL[16] = "";
@@ -115,6 +116,8 @@ static const ArgDesc argDesc[] = {
    "assume fixed-pitch (or tabular) text"},
   {"-raw",     argFlag,     &rawOrder,      0,
    "keep strings in content stream order"},
+  {"-nodiag",  argFlag,     &discardDiag,   0,
+   "discard diagonal text"},
   {"-htmlmeta", argFlag,   &htmlMeta,       0,
    "generate a simple HTML file, including the meta information"},
   {"-enc",     argString,   textEncName,    sizeof(textEncName),
@@ -363,7 +366,7 @@ int main(int argc, char *argv[]) {
 
   // write text file
   if (htmlMeta && bbox) { // htmlMeta && is superfluous but makes gcc happier
-    textOut = new TextOutputDev(nullptr, physLayout, fixedPitch, rawOrder, htmlMeta);
+    textOut = new TextOutputDev(nullptr, physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
 
     if (textOut->isOk()) {
       if (bboxLayout) {
@@ -378,7 +381,7 @@ int main(int argc, char *argv[]) {
     }
   } else {
     textOut = new TextOutputDev(textFileName->c_str(),
-				physLayout, fixedPitch, rawOrder, htmlMeta);
+				physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
     if (textOut->isOk()) {
       if ((w==0) && (h==0) && (x==0) && (y==0)) {
 	doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,


More information about the poppler mailing list