[poppler] poppler/TextOutputDev.cc poppler/UTF8.h

Albert Astals Cid aacid at kemper.freedesktop.org
Tue Jun 3 12:10:38 PDT 2008


 poppler/TextOutputDev.cc |   19 ++++++++++++++++++-
 poppler/UTF8.h           |   14 ++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

New commits:
commit 979ef1cafa968d776a2b804ce555b11212212397
Author: Koji Otani <sho at bbr.jp>
Date:   Tue Jun 3 21:07:15 2008 +0200

    Support for characters outside the BMP (encoded as surrogate pairs)

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 75a0ac0..97f4f3f 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2075,7 +2075,24 @@ void TextPage::addChar(GfxState *state, double x, double y,
     w1 /= uLen;
     h1 /= uLen;
   for (i = 0; i < uLen; ++i) {
-      curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]);
+      if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */
+	if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) {
+	  /* next code is a low surrogate */
+	  Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
+	  i++;
+	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, uu);
+	} else {
+	    /* missing low surrogate
+	     replace it with REPLACEMENT CHARACTER (U+FFFD) */
+	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd);
+	}
+      } else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
+	  /* invalid low surrogate
+	   replace it with REPLACEMENT CHARACTER (U+FFFD) */
+	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd);
+      } else {
+	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]);
+      }
   }
   }
   if (curWord) {
diff --git a/poppler/UTF8.h b/poppler/UTF8.h
index 8536dbf..11fb864 100644
--- a/poppler/UTF8.h
+++ b/poppler/UTF8.h
@@ -50,6 +50,20 @@ static int mapUCS2(Unicode u, char *buf, int bufSize) {
     buf[0] = (char)((u >> 8) & 0xff);
     buf[1] = (char)(u & 0xff);
     return 2;
+  } else if (u < 0x110000) {
+    Unicode uu;
+
+    /* using surrogate pair */
+    if (bufSize < 4) {
+      return 0;
+    }
+    uu = ((u - 0x10000) >> 10) + 0xd800;
+    buf[0] = (char)((uu >> 8) & 0xff);
+    buf[1] = (char)(uu & 0xff);
+    uu = (u & 0x3ff)+0xdc00;
+    buf[2] = (char)((uu >> 8) & 0xff);
+    buf[3] = (char)(uu & 0xff);
+    return 4;
   } else {
     return 0;
   }


More information about the poppler mailing list