[poppler] Branch 'xpdf303merge' - poppler/TextOutputDev.cc

Wed Dec 14 15:27:12 PST 2011

poppler/TextOutputDev.cc |   23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

New commits:
commit 51ca2b7c7dec5430d29860fd887ad5c5d9b3f574
Author: Albert Astals Cid <aacid at kde.org>
Date:   Thu Dec 15 00:26:09 2011 +0100

    [xpdf303] Some more changes in TextOutputDev

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 0c451ec..15fdf99 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -4669,10 +4669,9 @@ GBool TextPage::findCharRange(int pos, int length,
     return gFalse;
   }
 
-  //~ this doesn't correctly handle:
-  //~ - ranges split across multiple lines (the highlighted region
-  //~   is the bounding box of all the parts of the range)
-  //~ - cases where characters don't convert one-to-one into Unicode
+  //~ this doesn't correctly handle ranges split across multiple lines
+  //~ (the highlighted region is the bounding box of all the parts of
+  //~ the range)
   first = gTrue;
   xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
   xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
@@ -4791,7 +4790,9 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
       delete s;
       if (word->next &&
 	  fabs(word->next->base - word->base) <
-	    maxIntraLineDelta * word->fontSize) {
+	    maxIntraLineDelta * word->fontSize &&
+	  word->next->xMin >
+	    word->xMax - minDupBreakOverlap * word->fontSize) {
 	if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
 	  (*outputFunc)(outputStream, space, spaceLen);
 	}
@@ -5041,7 +5042,9 @@ int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
 	}
 	i = j;
 	// output a right-to-left section
-	for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
+	for (j = i;
+	     j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
+	     ++j) ;
 	if (j > i) {
 	  s->append(rle, rleLen);
 	  for (k = j - 1; k >= i; --k) {
@@ -5056,11 +5059,17 @@ int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
 
     } else {
 
+      // Note: This code treats numeric characters (European and
+      // Arabic/Indic) as left-to-right, which isn't strictly correct
+      // (incurs extra LRE/POPDF pairs), but does produce correct
+      // visual formatting.
       s->append(rle, rleLen);
       i = len - 1;
       while (i >= 0) {
 	// output a right-to-left section
-	for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
+	for (j = i;
+	     j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
+	     --j) ;
 	for (k = i; k > j; --k) {
 	  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
 	  s->append(buf, n);