[poppler] poppler/TextOutputDev.cc poppler/UTF.cc poppler/UTF.h

Carlos Garcia Campos carlosgc at kemper.freedesktop.org
Mon Oct 3 16:00:02 UTC 2016


 poppler/TextOutputDev.cc |    2 +-
 poppler/UTF.cc           |   12 ++++++++++++
 poppler/UTF.h            |    2 ++
 3 files changed, 15 insertions(+), 1 deletion(-)

New commits:
commit 3cfbc4efde1df6dcb9ef18a0fb26c7e199e6e8f5
Author: Jason Crain <jason at inspiresomeone.us>
Date:   Wed Sep 28 14:56:02 2016 +0000

    TextOutputDev: Break words on all whitespace characters
    
    Some PDF creators like Chrome use no-break spaces or other whitespace
    characters between words, causing pdftotext -bbox to not break words as
    expected.  Fix this by breaking words on any character with the Unicode
    whitespace property.
    
    Bug #97399

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 56ea3cc..e0dda08 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2607,7 +2607,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
   }
 
   // break words at space character
-  if (uLen == 1 && u[0] == (Unicode)0x20) {
+  if (uLen == 1 && UnicodeIsWhitespace(u[0])) {
     charPos += nBytes;
     endWord();
     return;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 3b3ae35..c140bd4 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -26,6 +26,7 @@
 #include "goo/gmem.h"
 #include "PDFDocEncoding.h"
 #include "UTF.h"
+#include <algorithm>
 
 bool UnicodeIsValid(Unicode ucs4)
 {
@@ -117,3 +118,14 @@ int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
   *ucs4 = u;
   return len;
 }
+
+bool UnicodeIsWhitespace(Unicode ucs4) // true iff ucs4 has the Unicode White_Space property
+{
+  static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D,
+    0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004,
+    0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F,
+    0x205F, 0x3000 }; // keep sorted ascending for lower_bound; 0x1680 is OGHAM SPACE MARK
+  Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]); // one past the last entry
+  Unicode const *i = std::lower_bound(spaces, end, ucs4); // binary search: first entry >= ucs4
+  return (i != end && *i == ucs4); // whitespace only if that entry is ucs4 itself
+}
diff --git a/poppler/UTF.h b/poppler/UTF.h
index 248c168..5a47902 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -35,5 +35,7 @@ int TextStringToUCS4(GooString *textStr, Unicode **ucs4);
 // check if UCS-4 character is valid
 bool UnicodeIsValid(Unicode ucs4);
 
+// check if UCS-4 character is in the table of Unicode whitespace code points (see UTF.cc)
+bool UnicodeIsWhitespace(Unicode ucs4);
 
 #endif


More information about the poppler mailing list