[poppler] 4 commits - CMakeLists.txt goo/GooString.cc poppler/CairoOutputDev.cc poppler/CharCodeToUnicode.cc poppler/GlobalParams.cc poppler/Makefile.am poppler/TextOutputDev.cc poppler/UTF8.h poppler/UTF.cc poppler/UTF.h utils/HtmlOutputDev.cc utils/pdfinfo.cc

Thu Aug 30 13:36:32 PDT 2012

CMakeLists.txt               |    3 -
 goo/GooString.cc             |    3 -
 poppler/CairoOutputDev.cc    |    2 
 poppler/CharCodeToUnicode.cc |   13 +++--
 poppler/GlobalParams.cc      |    2 
 poppler/Makefile.am          |    3 -
 poppler/TextOutputDev.cc     |   55 ++-------------------
 poppler/UTF.cc               |  104 ++++++++++++++++++++++++++++++++++++++++
 poppler/UTF.h                |  111 +++++++++++++++++++++++++++++++++++++++++++
 poppler/UTF8.h               |   84 --------------------------------
 utils/HtmlOutputDev.cc       |   16 ------
 utils/pdfinfo.cc             |   37 ++------------
 12 files changed, 246 insertions(+), 187 deletions(-)

New commits:
commit cd1ab1e34032d5620140bd0b6b6ec4b74f89ae19
Author: Albert Astals Cid <aacid at kde.org>
Date:   Thu Aug 30 22:36:14 2012 +0200

    Update Adrian's copyrights

diff --git a/goo/GooString.cc b/goo/GooString.cc
index 61dee33..451a70e 100644
--- a/goo/GooString.cc
+++ b/goo/GooString.cc
@@ -21,6 +21,7 @@
 // Copyright (C) 2008-2011 Albert Astals Cid <aacid at kde.org>
 // Copyright (C) 2011 Kenji Uno <ku at digitaldolphins.jp>
 // Copyright (C) 2012 Fabio D'Urso <fabiodurso at hotmail.it>
+// Copyright (C) 2012 Adrian Johnson <ajohnson at redneon.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
diff --git a/poppler/CharCodeToUnicode.cc b/poppler/CharCodeToUnicode.cc
index ce16ee5..4298090 100644
--- a/poppler/CharCodeToUnicode.cc
+++ b/poppler/CharCodeToUnicode.cc
@@ -21,6 +21,7 @@
 // Copyright (C) 2010 William Bader <williambader at hotmail.com>
 // Copyright (C) 2010 Jakub Wilk <ubanus at users.sf.net>
 // Copyright (C) 2012 Thomas Freitag <Thomas.Freitag at alfa.de>
+// Copyright (C) 2012 Adrian Johnson <ajohnson at redneon.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index cc18c9b..adbb79f 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -18,7 +18,7 @@
 // Copyright (C) 2006-2008, 2011 Carlos Garcia Campos <carlosgc at gnome.org>
 // Copyright (C) 2006, 2007 Ed Catmur <ed at catmur.co.uk>
 // Copyright (C) 2006 Jeff Muizelaar <jeff at infidigm.net>
-// Copyright (C) 2007, 2008 Adrian Johnson <ajohnson at redneon.com>
+// Copyright (C) 2007, 2008, 2012 Adrian Johnson <ajohnson at redneon.com>
 // Copyright (C) 2008 Koji Otani <sho at bbr.jp>
 // Copyright (C) 2008, 2010, 2011 Albert Astals Cid <aacid at kde.org>
 // Copyright (C) 2008 Pino Toscano <pino at kde.org>
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 0642d04..8e9cb9d 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -1,3 +1,26 @@
+//========================================================================
+//
+// UTF.cc
+//
+// Copyright 2001-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+//========================================================================
+//
+// Modified under the Poppler project - http://poppler.freedesktop.org
+//
+// All changes made under the Poppler project to this file are licensed
+// under GPL version 2 or later
+//
+// Copyright (C) 2008 Koji Otani <sho at bbr.jp>
+// Copyright (C) 2012 Adrian Johnson <ajohnson at redneon.com>
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
 #include "goo/gmem.h"
 #include "PDFDocEncoding.h"
 #include "UTF.h"
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index b3bb17d..e4bd0b1 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -25,7 +25,7 @@
 // Copyright (C) 2009 Warren Toomey <wkt at tuhs.org>
 // Copyright (C) 2009, 2011 Carlos Garcia Campos <carlosgc at gnome.org>
 // Copyright (C) 2009 Reece Dunn <msclrhd at gmail.com>
-// Copyright (C) 2010 Adrian Johnson <ajohnson at redneon.com>
+// Copyright (C) 2010, 2012 Adrian Johnson <ajohnson at redneon.com>
 // Copyright (C) 2010 Hib Eris <hib at hiberis.nl>
 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
 // Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
commit ce8a579f339507da3fd7802e1531fbf6849c0c98
Author: Adrian Johnson <ajohnson at redneon.com>
Date:   Tue Aug 28 22:16:34 2012 +0930

    Move text to unicode conversion into a separate function
    
    This also ensures UTF-16 ActualText strings are converted to UCS-4
    before calling addChar.

diff --git a/goo/GooString.cc b/goo/GooString.cc
index 1ebf341..61dee33 100644
--- a/goo/GooString.cc
+++ b/goo/GooString.cc
@@ -895,7 +895,7 @@ int GooString::cmpN(const char *sA, int n) const {
 
 GBool GooString::hasUnicodeMarker(void)
 {
-    return (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff;
+  return length > 1 && (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff;
 }
 
 GooString *GooString::sanitizedName(GBool psmode)
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 7db041e..cc18c9b 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -63,7 +63,7 @@
 #include "TextOutputDev.h"
 #include "Page.h"
 #include "Annot.h"
-#include "PDFDocEncoding.h"
+#include "UTF.h"
 
 #ifdef MACOS
 // needed for setting type/creator of MacOS files
@@ -5230,41 +5230,17 @@ void ActualText::end(GfxState *state) {
   // extents of all the glyphs inside the span
 
   if (actualTextNBytes) {
-    char *uniString = NULL;
     Unicode *uni;
-    int length, i;
-
-    if (!actualText->hasUnicodeMarker()) {
-      if (actualText->getLength() > 0) {
-        //non-unicode string -- assume pdfDocEncoding and
-        //try to convert to UTF16BE
-        uniString = pdfDocEncodingToUTF16(actualText, &length);
-      } else {
-        length = 0;
-      }
-    } else {
-      uniString = actualText->getCString();
-      length = actualText->getLength();
-    }
-
-    if (length < 3)
-      length = 0;
-    else
-      length = length/2 - 1;
-    uni = new Unicode[length];
-    for (i = 0 ; i < length; i++)
-      uni[i] = ((uniString[2 + i*2] & 0xff)<<8)|(uniString[3 + i*2] & 0xff);
+    int length;
 
     // now that we have the position info for all of the text inside
     // the marked content span, we feed the "ActualText" back through
     // text->addChar()
+    length = TextStringToUCS4(actualText, &uni);
     text->addChar(state, actualTextX0, actualTextY0,
                   actualTextX1 - actualTextX0, actualTextY1 - actualTextY0,
                   0, actualTextNBytes, uni, length);
-
-    delete [] uni;
-    if (!actualText->hasUnicodeMarker())
-      delete [] uniString;
+    gfree(uni);
   }
 
   delete actualText;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index b5f7d9f..0642d04 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -1,4 +1,5 @@
 #include "goo/gmem.h"
+#include "PDFDocEncoding.h"
 #include "UTF.h"
 
 int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
@@ -45,3 +46,36 @@ int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
   return len;
 }
 
+// Convert a PDF text string to UCS-4. Input starting with the 0xFE 0xFF
+// marker is read as big-endian UTF-16; anything else is mapped per byte
+// through pdfDocEncoding. *ucs4 is always written: NULL when the result is
+// empty, otherwise a buffer to release with gfree. Returns the UCS-4 length.
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
+{
+  int i, len;
+  const char *s;
+  Unicode *u = NULL;
+
+  len = textStr->getLength();
+  s = textStr->getCString();
+  if (len > 0 && textStr->hasUnicodeMarker()) {
+    len = len/2 - 1; // 16-bit units after the marker; an odd last byte is dropped
+    if (len > 0) {
+      Unicode *utf16 = new Unicode[len];
+      for (i = 0 ; i < len; i++) {
+        utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff);
+      }
+      len = UTF16toUCS4(utf16, len, &u);
+      delete [] utf16; // array form: the buffer came from new[]
+    }
+  } else if (len > 0) {
+    u = (Unicode*)gmallocn(len, sizeof(Unicode));
+    for (i = 0 ; i < len; i++) {
+      // mask to 0..255: a plain char may be signed, and a negative index
+      // would read before the start of the table
+      u[i] = pdfDocEncoding[s[i] & 0xff];
+    }
+  }
+  *ucs4 = u;
+  return len;
+}
diff --git a/poppler/UTF.h b/poppler/UTF.h
index d0ef5bc..ec51e5a 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -27,6 +27,7 @@
 #pragma implementation
 #endif
 
+#include "goo/GooString.h"
 #include "CharTypes.h"
 
 // Convert a UTF-16 string to a UCS-4
@@ -36,6 +37,13 @@
 //   returns number of UCS-4 characters
 int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
 
+// Convert a PDF Text String to UCS-4
+//   textStr    - PDF text string
+//   ucs4       - if the number of UCS-4 characters is > 0, allocates and
+//                returns UCS-4 string. Free with gfree.
+//   returns number of UCS-4 characters
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4);
+
 
 static int mapUTF8(Unicode u, char *buf, int bufSize) {
   if        (u <= 0x0000007f) {
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index cdc5375..d1c077b 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -48,7 +48,7 @@
 #include "PDFDocFactory.h"
 #include "CharTypes.h"
 #include "UnicodeMap.h"
-#include "PDFDocEncoding.h"
+#include "UTF.h"
 #include "Error.h"
 #include "DateInfo.h"
 
@@ -379,41 +379,16 @@ static void printInfoString(Dict *infoDict, const char *key, const char *text,
 			    UnicodeMap *uMap) {
   Object obj;
   GooString *s1;
-  GBool isUnicode;
-  Unicode u, u2;
+  Unicode *u;
   char buf[8];
-  int i, n;
+  int i, n, len;
 
   if (infoDict->lookup(key, &obj)->isString()) {
     fputs(text, stdout);
     s1 = obj.getString();
-    if ((s1->getChar(0) & 0xff) == 0xfe &&
-	(s1->getChar(1) & 0xff) == 0xff) {
-      isUnicode = gTrue;
-      i = 2;
-    } else {
-      isUnicode = gFalse;
-      i = 0;
-    }
-    while (i < obj.getString()->getLength()) {
-      if (isUnicode) {
-	u = ((s1->getChar(i) & 0xff) << 8) |
-	    (s1->getChar(i+1) & 0xff);
-	i += 2;
-	if (u >= 0xd800 && u <= 0xdbff && i < obj.getString()->getLength()) {
-	  // surrogate pair
-	  u2 = ((s1->getChar(i) & 0xff) << 8) |
-	    (s1->getChar(i+1) & 0xff);
-	  i += 2;
-	  if (u2 >= 0xdc00 && u2 <= 0xdfff) {
-	    u = 0x10000 + ((u - 0xd800) << 10) + (u2 - 0xdc00);
-	  }
-	}
-      } else {
-	u = pdfDocEncoding[s1->getChar(i) & 0xff];
-	++i;
-      }
-      n = uMap->mapUnicode(u, buf, sizeof(buf));
+    len = TextStringToUCS4(s1, &u);
+    for (i = 0; i < len; i++) {
+      n = uMap->mapUnicode(u[i], buf, sizeof(buf));
       fwrite(buf, 1, n, stdout);
     }
     fputc('\n', stdout);
commit cac13e782cf4413703cfd1fa23e76133dfbe5ef9
Author: Adrian Johnson <ajohnson at redneon.com>
Date:   Tue Aug 28 21:48:16 2012 +0930

    text: increase the tolerance for overlapping glyphs
    
    TextOutputDev will start a new line when encountering consecutive
    glyphs with overlapping bounding boxes. This can occur when drawing
    diacritics with a separate glyph. In this case, due to the diacritic
    having a different baseline, the lines may be output in the wrong
    order.
    
    This patch increases the tolerance for overlapping bounding boxes to
    prevent diacritics from splitting lines.

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 3020e22..7db041e 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -127,7 +127,7 @@
 
 // Minimum spacing between characters within a word, as a fraction of
 // the font size.
-#define minCharSpacing -0.2
+#define minCharSpacing -0.5
 
 // Maximum spacing between characters within a word, as a fraction of
 // the font size, when there is no obvious extra-wide character
commit 6f6386219449e70c2c3bc3559fdde3df4a57a809
Author: Adrian Johnson <ajohnson at redneon.com>
Date:   Thu Mar 8 20:52:28 2012 +1030

    Convert UTF-16 to UCS-4 when reading toUnicode cmap
    
    to ensure only UCS-4 values are used with the "Unicode" type.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b07470..6bddf0b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -296,6 +296,7 @@ set(poppler_SRCS
   poppler/strtok_r.cpp
   poppler/UnicodeMap.cc
   poppler/UnicodeTypeTable.cc
+  poppler/UTF.cc
   poppler/XRef.cc
   poppler/PSOutputDev.cc
   poppler/TextOutputDev.cc
@@ -466,7 +467,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/SecurityHandler.h
     poppler/StdinCachedFile.h
     poppler/StdinPDFDocBuilder.h
-    poppler/UTF8.h
+    poppler/UTF.h
     poppler/XpdfPluginAPI.h
     poppler/Sound.h
     ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h
diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc
index b70183e..d8f78d7 100644
--- a/poppler/CairoOutputDev.cc
+++ b/poppler/CairoOutputDev.cc
@@ -61,7 +61,7 @@
 #include "CairoOutputDev.h"
 #include "CairoFontEngine.h"
 #include "CairoRescaleBox.h"
-#include "UTF8.h"
+#include "UTF.h"
 //------------------------------------------------------------------------
 
 // #define LOG_CAIRO
diff --git a/poppler/CharCodeToUnicode.cc b/poppler/CharCodeToUnicode.cc
index d0e6c7f..ce16ee5 100644
--- a/poppler/CharCodeToUnicode.cc
+++ b/poppler/CharCodeToUnicode.cc
@@ -43,6 +43,7 @@
 #include "GlobalParams.h"
 #include "PSTokenizer.h"
 #include "CharCodeToUnicode.h"
+#include "UTF.h"
 
 //------------------------------------------------------------------------
 
@@ -453,15 +454,16 @@ void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n,
     }
     map[code] = 0;
     sMap[sMapLen].c = code;
-    sMap[sMapLen].len = n / 4;
-    sMap[sMapLen].u = (Unicode*)gmallocn(sMap[sMapLen].len, sizeof(Unicode));
-    for (j = 0; j < sMap[sMapLen].len; ++j) {
-      if (!parseHex(uStr + j*4, 4, &sMap[sMapLen].u[j])) {
+    int utf16Len = n / 4;
+    Unicode *utf16 = (Unicode*)gmallocn(utf16Len, sizeof(Unicode));
+    for (j = 0; j < utf16Len; ++j) {
+      if (!parseHex(uStr + j*4, 4, &utf16[j])) {
 	error(errSyntaxWarning, -1, "Illegal entry in ToUnicode CMap");
 	return;
       }
     }
-    sMap[sMapLen].u[sMap[sMapLen].len - 1] += offset;
+    utf16[utf16Len - 1] += offset;
+    sMap[sMapLen].len = UTF16toUCS4(utf16, utf16Len, &sMap[sMapLen].u);
     ++sMapLen;
   }
 }
diff --git a/poppler/GlobalParams.cc b/poppler/GlobalParams.cc
index 098e4a4..148a0dd 100644
--- a/poppler/GlobalParams.cc
+++ b/poppler/GlobalParams.cc
@@ -108,7 +108,7 @@
 
 #include "NameToUnicodeTable.h"
 #include "UnicodeMapTables.h"
-#include "UTF8.h"
+#include "UTF.h"
 
 #ifdef ENABLE_PLUGINS
 #  ifdef _WIN32
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 8920f8e..e9ac9d4 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -251,7 +251,7 @@ poppler_include_HEADERS =	\
 	PSOutputDev.h		\
 	TextOutputDev.h		\
 	SecurityHandler.h	\
-	UTF8.h			\
+	UTF.h			\
 	XpdfPluginAPI.h		\
 	Sound.h
 nodist_poppler_include_HEADERS = poppler-config.h
@@ -317,6 +317,7 @@ libpoppler_la_SOURCES =		\
 	strtok_r.cpp		\
 	UnicodeMap.cc		\
 	UnicodeTypeTable.cc	\
+	UTF.cc                  \
 	ViewerPreferences.cc	\
 	XRef.cc			\
 	PSOutputDev.cc		\
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 9af7532..3020e22 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2392,24 +2392,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
     w1 /= uLen;
     h1 /= uLen;
     for (i = 0; i < uLen; ++i) {
-      if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */
-	if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) {
-	  /* next code is a low surrogate */
-	  Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
-	  i++;
-	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu);
-	} else {
-	    /* missing low surrogate
-	     replace it with REPLACEMENT CHARACTER (U+FFFD) */
-	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
-	}
-      } else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
-	  /* invalid low surrogate
-	   replace it with REPLACEMENT CHARACTER (U+FFFD) */
-	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
-      } else {
-	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
-      }
+      curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
     }
   }
   charPos += nBytes;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
new file mode 100644
index 0000000..b5f7d9f
--- /dev/null
+++ b/poppler/UTF.cc
@@ -0,0 +1,47 @@
+#include "goo/gmem.h"
+#include "UTF.h"
+
+int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
+{
+  int i, n, len;
+  Unicode *u;
+
+  // count characters
+  len = 0;
+  for (i = 0; i < utf16Len; i++) {
+    if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len &&
+        utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
+      i++; /* surrogate pair */
+    }
+    len++;
+  }
+  if (ucs4 == NULL)
+    return len;
+
+  u = (Unicode*)gmallocn(len, sizeof(Unicode));
+  n = 0;
+  // convert string
+  for (i = 0; i < utf16Len; i++) {
+    if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
+      if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
+	/* next code is a low surrogate */
+	u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000;
+	++i;
+      } else {
+	/* missing low surrogate
+	   replace it with REPLACEMENT CHARACTER (U+FFFD) */
+	u[n] = 0xfffd;
+      }
+    } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
+      /* invalid low surrogate
+	 replace it with REPLACEMENT CHARACTER (U+FFFD) */
+      u[n] = 0xfffd;
+    } else {
+      u[n] = utf16[i];
+    }
+    n++;
+  }
+  *ucs4 = u;
+  return len;
+}
+
diff --git a/poppler/UTF.h b/poppler/UTF.h
new file mode 100644
index 0000000..d0ef5bc
--- /dev/null
+++ b/poppler/UTF.h
@@ -0,0 +1,103 @@
+//========================================================================
+//
+// UTF.h
+//
+// Copyright 2001-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+//========================================================================
+//
+// Modified under the Poppler project - http://poppler.freedesktop.org
+//
+// All changes made under the Poppler project to this file are licensed
+// under GPL version 2 or later
+//
+// Copyright (C) 2008 Koji Otani <sho at bbr.jp>
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
+#ifndef UTF_H
+#define UTF_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include "CharTypes.h"
+
+// Convert a UTF-16 string to a UCS-4
+//   utf16      - UTF-16 code units, one per array element
+//   utf16_len  - number of UTF-16 code units
+//   ucs4_out   - if not NULL, allocates and returns UCS-4 string. Free with gfree.
+//   returns number of UCS-4 characters
+int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
+
+
+static int mapUTF8(Unicode u, char *buf, int bufSize) {
+  if        (u <= 0x0000007f) {
+    if (bufSize < 1) {
+      return 0;
+    }
+    buf[0] = (char)u;
+    return 1;
+  } else if (u <= 0x000007ff) {
+    if (bufSize < 2) {
+      return 0;
+    }
+    buf[0] = (char)(0xc0 + (u >> 6));
+    buf[1] = (char)(0x80 + (u & 0x3f));
+    return 2;
+  } else if (u <= 0x0000ffff) {
+    if (bufSize < 3) {
+      return 0;
+    }
+    buf[0] = (char)(0xe0 + (u >> 12));
+    buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
+    buf[2] = (char)(0x80 + (u & 0x3f));
+    return 3;
+  } else if (u <= 0x0010ffff) {
+    if (bufSize < 4) {
+      return 0;
+    }
+    buf[0] = (char)(0xf0 + (u >> 18));
+    buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
+    buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
+    buf[3] = (char)(0x80 + (u & 0x3f));
+    return 4;
+  } else {
+    return 0;
+  }
+}
+
+static int mapUCS2(Unicode u, char *buf, int bufSize) {
+  if (u <= 0xffff) {
+    if (bufSize < 2) {
+      return 0;
+     }
+    buf[0] = (char)((u >> 8) & 0xff);
+    buf[1] = (char)(u & 0xff);
+    return 2;
+  } else if (u < 0x110000) {
+    Unicode uu;
+
+    /* using surrogate pair */
+    if (bufSize < 4) {
+      return 0;
+    }
+    uu = ((u - 0x10000) >> 10) + 0xd800;
+    buf[0] = (char)((uu >> 8) & 0xff);
+    buf[1] = (char)(uu & 0xff);
+    uu = (u & 0x3ff)+0xdc00;
+    buf[2] = (char)((uu >> 8) & 0xff);
+    buf[3] = (char)(uu & 0xff);
+    return 4;
+  } else {
+    return 0;
+  }
+}
+
+#endif
diff --git a/poppler/UTF8.h b/poppler/UTF8.h
deleted file mode 100644
index 34a07d4..0000000
--- a/poppler/UTF8.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//========================================================================
-//
-// UTF8.h
-//
-// Copyright 2001-2003 Glyph & Cog, LLC
-//
-//========================================================================
-
-//========================================================================
-//
-// Modified under the Poppler project - http://poppler.freedesktop.org
-//
-// All changes made under the Poppler project to this file are licensed
-// under GPL version 2 or later
-//
-// Copyright (C) 2008 Koji Otani <sho at bbr.jp>
-//
-// To see a description of the changes please see the Changelog file that
-// came with your tarball or type make ChangeLog if you are building from git
-//
-//========================================================================
-
-static int mapUTF8(Unicode u, char *buf, int bufSize) {
-  if        (u <= 0x0000007f) {
-    if (bufSize < 1) {
-      return 0;
-    }
-    buf[0] = (char)u;
-    return 1;
-  } else if (u <= 0x000007ff) {
-    if (bufSize < 2) {
-      return 0;
-    }
-    buf[0] = (char)(0xc0 + (u >> 6));
-    buf[1] = (char)(0x80 + (u & 0x3f));
-    return 2;
-  } else if (u <= 0x0000ffff) {
-    if (bufSize < 3) {
-      return 0;
-    }
-    buf[0] = (char)(0xe0 + (u >> 12));
-    buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
-    buf[2] = (char)(0x80 + (u & 0x3f));
-    return 3;
-  } else if (u <= 0x0010ffff) {
-    if (bufSize < 4) {
-      return 0;
-    }
-    buf[0] = (char)(0xf0 + (u >> 18));
-    buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
-    buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
-    buf[3] = (char)(0x80 + (u & 0x3f));
-    return 4;
-  } else {
-    return 0;
-  }
-}
-
-static int mapUCS2(Unicode u, char *buf, int bufSize) {
-  if (u <= 0xffff) {
-    if (bufSize < 2) {
-      return 0;
-    }
-    buf[0] = (char)((u >> 8) & 0xff);
-    buf[1] = (char)(u & 0xff);
-    return 2;
-  } else if (u < 0x110000) {
-    Unicode uu;
-
-    /* using surrogate pair */
-    if (bufSize < 4) {
-      return 0;
-    }
-    uu = ((u - 0x10000) >> 10) + 0xd800;
-    buf[0] = (char)((uu >> 8) & 0xff);
-    buf[1] = (char)(uu & 0xff);
-    uu = (u & 0x3ff)+0xdc00;
-    buf[2] = (char)((uu >> 8) & 0xff);
-    buf[3] = (char)(uu & 0xff);
-    return 4;
-  } else {
-    return 0;
-  }
-}
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 83f65d5..b3bb17d 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -400,19 +400,7 @@ void HtmlPage::addChar(GfxState *state, double x, double y,
     h1 /= uLen;
   }
   for (i = 0; i < uLen; ++i) {
-    Unicode u1 = u[i];
-    if (u1 >= 0xd800 && u1 <= 0xdbff && i < uLen) {
-      // surrogate pair
-      const Unicode u2 = u[i + 1];
-      if (u2 >= 0xdc00 && u2 <= 0xdfff) {
-	u1 = 0x10000 + ((u1 - 0xd800) << 10) + (u2 - 0xdc00);
-	
-	curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1);
-      }
-      ++i;
-    } else {
-      curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1);
-    }
+    curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
   }
 }