[poppler] 4 commits - CMakeLists.txt goo/GooString.cc poppler/CairoOutputDev.cc poppler/CharCodeToUnicode.cc poppler/GlobalParams.cc poppler/Makefile.am poppler/TextOutputDev.cc poppler/UTF8.h poppler/UTF.cc poppler/UTF.h utils/HtmlOutputDev.cc utils/pdfinfo.cc
Albert Astals Cid
aacid at kemper.freedesktop.org
Thu Aug 30 13:36:32 PDT 2012
CMakeLists.txt | 3 -
goo/GooString.cc | 3 -
poppler/CairoOutputDev.cc | 2
poppler/CharCodeToUnicode.cc | 13 +++--
poppler/GlobalParams.cc | 2
poppler/Makefile.am | 3 -
poppler/TextOutputDev.cc | 55 ++-------------------
poppler/UTF.cc | 104 ++++++++++++++++++++++++++++++++++++++++
poppler/UTF.h | 111 +++++++++++++++++++++++++++++++++++++++++++
poppler/UTF8.h | 84 --------------------------------
utils/HtmlOutputDev.cc | 16 ------
utils/pdfinfo.cc | 37 ++------------
12 files changed, 246 insertions(+), 187 deletions(-)
New commits:
commit cd1ab1e34032d5620140bd0b6b6ec4b74f89ae19
Author: Albert Astals Cid <aacid at kde.org>
Date: Thu Aug 30 22:36:14 2012 +0200
Update Adrian's copyrights
diff --git a/goo/GooString.cc b/goo/GooString.cc
index 61dee33..451a70e 100644
--- a/goo/GooString.cc
+++ b/goo/GooString.cc
@@ -21,6 +21,7 @@
// Copyright (C) 2008-2011 Albert Astals Cid <aacid at kde.org>
// Copyright (C) 2011 Kenji Uno <ku at digitaldolphins.jp>
// Copyright (C) 2012 Fabio D'Urso <fabiodurso at hotmail.it>
+// Copyright (C) 2012 Adrian Johnson <ajohnson at redneon.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
diff --git a/poppler/CharCodeToUnicode.cc b/poppler/CharCodeToUnicode.cc
index ce16ee5..4298090 100644
--- a/poppler/CharCodeToUnicode.cc
+++ b/poppler/CharCodeToUnicode.cc
@@ -21,6 +21,7 @@
// Copyright (C) 2010 William Bader <williambader at hotmail.com>
// Copyright (C) 2010 Jakub Wilk <ubanus at users.sf.net>
// Copyright (C) 2012 Thomas Freitag <Thomas.Freitag at alfa.de>
+// Copyright (C) 2012 Adrian Johnson <ajohnson at redneon.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index cc18c9b..adbb79f 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -18,7 +18,7 @@
// Copyright (C) 2006-2008, 2011 Carlos Garcia Campos <carlosgc at gnome.org>
// Copyright (C) 2006, 2007 Ed Catmur <ed at catmur.co.uk>
// Copyright (C) 2006 Jeff Muizelaar <jeff at infidigm.net>
-// Copyright (C) 2007, 2008 Adrian Johnson <ajohnson at redneon.com>
+// Copyright (C) 2007, 2008, 2012 Adrian Johnson <ajohnson at redneon.com>
// Copyright (C) 2008 Koji Otani <sho at bbr.jp>
// Copyright (C) 2008, 2010, 2011 Albert Astals Cid <aacid at kde.org>
// Copyright (C) 2008 Pino Toscano <pino at kde.org>
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 0642d04..8e9cb9d 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -1,3 +1,26 @@
+//========================================================================
+//
+// UTF.cc
+//
+// Copyright 2001-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+//========================================================================
+//
+// Modified under the Poppler project - http://poppler.freedesktop.org
+//
+// All changes made under the Poppler project to this file are licensed
+// under GPL version 2 or later
+//
+// Copyright (C) 2008 Koji Otani <sho at bbr.jp>
+// Copyright (C) 2012 Adrian Johnson <ajohnson at redneon.com>
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
#include "goo/gmem.h"
#include "PDFDocEncoding.h"
#include "UTF.h"
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index b3bb17d..e4bd0b1 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -25,7 +25,7 @@
// Copyright (C) 2009 Warren Toomey <wkt at tuhs.org>
// Copyright (C) 2009, 2011 Carlos Garcia Campos <carlosgc at gnome.org>
// Copyright (C) 2009 Reece Dunn <msclrhd at gmail.com>
-// Copyright (C) 2010 Adrian Johnson <ajohnson at redneon.com>
+// Copyright (C) 2010, 2012 Adrian Johnson <ajohnson at redneon.com>
// Copyright (C) 2010 Hib Eris <hib at hiberis.nl>
// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
// Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
commit ce8a579f339507da3fd7802e1531fbf6849c0c98
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Tue Aug 28 22:16:34 2012 +0930
Move text to unicode conversion into a separate function
This also ensures UTF-16 ActualText strings are converted to UCS-4
before calling addChar.
diff --git a/goo/GooString.cc b/goo/GooString.cc
index 1ebf341..61dee33 100644
--- a/goo/GooString.cc
+++ b/goo/GooString.cc
@@ -895,7 +895,7 @@ int GooString::cmpN(const char *sA, int n) const {
GBool GooString::hasUnicodeMarker(void)
{
- return (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff;
+ return length > 1 && (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff;
}
GooString *GooString::sanitizedName(GBool psmode)
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 7db041e..cc18c9b 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -63,7 +63,7 @@
#include "TextOutputDev.h"
#include "Page.h"
#include "Annot.h"
-#include "PDFDocEncoding.h"
+#include "UTF.h"
#ifdef MACOS
// needed for setting type/creator of MacOS files
@@ -5230,41 +5230,17 @@ void ActualText::end(GfxState *state) {
// extents of all the glyphs inside the span
if (actualTextNBytes) {
- char *uniString = NULL;
Unicode *uni;
- int length, i;
-
- if (!actualText->hasUnicodeMarker()) {
- if (actualText->getLength() > 0) {
- //non-unicode string -- assume pdfDocEncoding and
- //try to convert to UTF16BE
- uniString = pdfDocEncodingToUTF16(actualText, &length);
- } else {
- length = 0;
- }
- } else {
- uniString = actualText->getCString();
- length = actualText->getLength();
- }
-
- if (length < 3)
- length = 0;
- else
- length = length/2 - 1;
- uni = new Unicode[length];
- for (i = 0 ; i < length; i++)
- uni[i] = ((uniString[2 + i*2] & 0xff)<<8)|(uniString[3 + i*2] & 0xff);
+ int length;
// now that we have the position info for all of the text inside
// the marked content span, we feed the "ActualText" back through
// text->addChar()
+ length = TextStringToUCS4(actualText, &uni);
text->addChar(state, actualTextX0, actualTextY0,
actualTextX1 - actualTextX0, actualTextY1 - actualTextY0,
0, actualTextNBytes, uni, length);
-
- delete [] uni;
- if (!actualText->hasUnicodeMarker())
- delete [] uniString;
+ gfree(uni);
}
delete actualText;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index b5f7d9f..0642d04 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -1,4 +1,5 @@
#include "goo/gmem.h"
+#include "PDFDocEncoding.h"
#include "UTF.h"
int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
@@ -45,3 +46,36 @@ int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
return len;
}
+// Convert a PDF text string to UCS-4.
+//
+// textStr - PDF text string. When it starts with the 0xFE 0xFF marker the
+//           remaining bytes are read as big-endian UTF-16 code units;
+//           otherwise every byte is looked up in pdfDocEncoding.
+// ucs4    - receives the converted string, allocated with gmallocn (free
+//           with gfree), or NULL when there are no characters to return.
+// returns the number of UCS-4 characters stored in *ucs4
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
+{
+  int i, len;
+  const char *s;
+  Unicode *u;
+
+  len = textStr->getLength();
+  s = textStr->getCString();
+  if (len == 0) {
+    // Always define the out-parameter: callers pass it to gfree() without
+    // checking the returned length.
+    *ucs4 = NULL;
+    return 0;
+  }
+
+  if (textStr->hasUnicodeMarker()) {
+    Unicode *utf16;
+    // number of 16-bit code units that follow the 2-byte marker
+    len = len/2 - 1;
+    if (len > 0) {
+      utf16 = new Unicode[len];
+      for (i = 0 ; i < len; i++) {
+        utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff);
+      }
+      len = UTF16toUCS4(utf16, len, &u);
+      // allocated with new[], so it must be released with delete[]
+      delete[] utf16;
+    } else {
+      u = NULL;
+    }
+  } else {
+    u = (Unicode*)gmallocn(len, sizeof(Unicode));
+    for (i = 0 ; i < len; i++) {
+      // plain char may be signed: mask so bytes >= 0x80 index 128..255
+      // instead of producing a negative index
+      u[i] = pdfDocEncoding[s[i] & 0xff];
+    }
+  }
+  *ucs4 = u;
+  return len;
+}
diff --git a/poppler/UTF.h b/poppler/UTF.h
index d0ef5bc..ec51e5a 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -27,6 +27,7 @@
#pragma implementation
#endif
+#include "goo/GooString.h"
#include "CharTypes.h"
// Convert a UTF-16 string to a UCS-4
@@ -36,6 +37,13 @@
// returns number of UCS-4 characters
int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
+// Convert a PDF Text String to UCS-4
+// textStr - PDF text string
+// ucs4 - if the number of UCS-4 characters is > 0, allocates and
+// returns UCS-4 string. Free with gfree.
+// returns number of UCS-4 characters
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4);
+
static int mapUTF8(Unicode u, char *buf, int bufSize) {
if (u <= 0x0000007f) {
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index cdc5375..d1c077b 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -48,7 +48,7 @@
#include "PDFDocFactory.h"
#include "CharTypes.h"
#include "UnicodeMap.h"
-#include "PDFDocEncoding.h"
+#include "UTF.h"
#include "Error.h"
#include "DateInfo.h"
@@ -379,41 +379,16 @@ static void printInfoString(Dict *infoDict, const char *key, const char *text,
UnicodeMap *uMap) {
Object obj;
GooString *s1;
- GBool isUnicode;
- Unicode u, u2;
+ Unicode *u;
char buf[8];
- int i, n;
+ int i, n, len;
if (infoDict->lookup(key, &obj)->isString()) {
fputs(text, stdout);
s1 = obj.getString();
- if ((s1->getChar(0) & 0xff) == 0xfe &&
- (s1->getChar(1) & 0xff) == 0xff) {
- isUnicode = gTrue;
- i = 2;
- } else {
- isUnicode = gFalse;
- i = 0;
- }
- while (i < obj.getString()->getLength()) {
- if (isUnicode) {
- u = ((s1->getChar(i) & 0xff) << 8) |
- (s1->getChar(i+1) & 0xff);
- i += 2;
- if (u >= 0xd800 && u <= 0xdbff && i < obj.getString()->getLength()) {
- // surrogate pair
- u2 = ((s1->getChar(i) & 0xff) << 8) |
- (s1->getChar(i+1) & 0xff);
- i += 2;
- if (u2 >= 0xdc00 && u2 <= 0xdfff) {
- u = 0x10000 + ((u - 0xd800) << 10) + (u2 - 0xdc00);
- }
- }
- } else {
- u = pdfDocEncoding[s1->getChar(i) & 0xff];
- ++i;
- }
- n = uMap->mapUnicode(u, buf, sizeof(buf));
+ len = TextStringToUCS4(s1, &u);
+ for (i = 0; i < len; i++) {
+ n = uMap->mapUnicode(u[i], buf, sizeof(buf));
fwrite(buf, 1, n, stdout);
}
fputc('\n', stdout);
commit cac13e782cf4413703cfd1fa23e76133dfbe5ef9
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Tue Aug 28 21:48:16 2012 +0930
text: increase the tolerance for overlapping glyphs
TextOutputDev will start a new line when encountering consecutive
glyphs with overlapping bounding boxes. This can occur when drawing
diacritics with a separate glyph. In this case, due to the diacritic
having a different baseline, the lines may be output in the wrong
order.
This patch increases the tolerance for overlapping bounding boxes to
prevent diacritics from splitting lines.
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 3020e22..7db041e 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -127,7 +127,7 @@
// Minimum spacing between characters within a word, as a fraction of
// the font size.
-#define minCharSpacing -0.2
+#define minCharSpacing -0.5
// Maximum spacing between characters within a word, as a fraction of
// the font size, when there is no obvious extra-wide character
commit 6f6386219449e70c2c3bc3559fdde3df4a57a809
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Thu Mar 8 20:52:28 2012 +1030
Convert UTF-16 to UCS-4 when reading toUnicode cmap
to ensure only UCS-4 values are used with the "Unicode" type.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b07470..6bddf0b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -296,6 +296,7 @@ set(poppler_SRCS
poppler/strtok_r.cpp
poppler/UnicodeMap.cc
poppler/UnicodeTypeTable.cc
+ poppler/UTF.cc
poppler/XRef.cc
poppler/PSOutputDev.cc
poppler/TextOutputDev.cc
@@ -466,7 +467,7 @@ if(ENABLE_XPDF_HEADERS)
poppler/SecurityHandler.h
poppler/StdinCachedFile.h
poppler/StdinPDFDocBuilder.h
- poppler/UTF8.h
+ poppler/UTF.h
poppler/XpdfPluginAPI.h
poppler/Sound.h
${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h
diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc
index b70183e..d8f78d7 100644
--- a/poppler/CairoOutputDev.cc
+++ b/poppler/CairoOutputDev.cc
@@ -61,7 +61,7 @@
#include "CairoOutputDev.h"
#include "CairoFontEngine.h"
#include "CairoRescaleBox.h"
-#include "UTF8.h"
+#include "UTF.h"
//------------------------------------------------------------------------
// #define LOG_CAIRO
diff --git a/poppler/CharCodeToUnicode.cc b/poppler/CharCodeToUnicode.cc
index d0e6c7f..ce16ee5 100644
--- a/poppler/CharCodeToUnicode.cc
+++ b/poppler/CharCodeToUnicode.cc
@@ -43,6 +43,7 @@
#include "GlobalParams.h"
#include "PSTokenizer.h"
#include "CharCodeToUnicode.h"
+#include "UTF.h"
//------------------------------------------------------------------------
@@ -453,15 +454,16 @@ void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n,
}
map[code] = 0;
sMap[sMapLen].c = code;
- sMap[sMapLen].len = n / 4;
- sMap[sMapLen].u = (Unicode*)gmallocn(sMap[sMapLen].len, sizeof(Unicode));
- for (j = 0; j < sMap[sMapLen].len; ++j) {
- if (!parseHex(uStr + j*4, 4, &sMap[sMapLen].u[j])) {
+ int utf16Len = n / 4;
+ Unicode *utf16 = (Unicode*)gmallocn(utf16Len, sizeof(Unicode));
+ for (j = 0; j < utf16Len; ++j) {
+ if (!parseHex(uStr + j*4, 4, &utf16[j])) {
error(errSyntaxWarning, -1, "Illegal entry in ToUnicode CMap");
return;
}
}
- sMap[sMapLen].u[sMap[sMapLen].len - 1] += offset;
+ utf16[utf16Len - 1] += offset;
+ sMap[sMapLen].len = UTF16toUCS4(utf16, utf16Len, &sMap[sMapLen].u);
++sMapLen;
}
}
diff --git a/poppler/GlobalParams.cc b/poppler/GlobalParams.cc
index 098e4a4..148a0dd 100644
--- a/poppler/GlobalParams.cc
+++ b/poppler/GlobalParams.cc
@@ -108,7 +108,7 @@
#include "NameToUnicodeTable.h"
#include "UnicodeMapTables.h"
-#include "UTF8.h"
+#include "UTF.h"
#ifdef ENABLE_PLUGINS
# ifdef _WIN32
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 8920f8e..e9ac9d4 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -251,7 +251,7 @@ poppler_include_HEADERS = \
PSOutputDev.h \
TextOutputDev.h \
SecurityHandler.h \
- UTF8.h \
+ UTF.h \
XpdfPluginAPI.h \
Sound.h
nodist_poppler_include_HEADERS = poppler-config.h
@@ -317,6 +317,7 @@ libpoppler_la_SOURCES = \
strtok_r.cpp \
UnicodeMap.cc \
UnicodeTypeTable.cc \
+ UTF.cc \
ViewerPreferences.cc \
XRef.cc \
PSOutputDev.cc \
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 9af7532..3020e22 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2392,24 +2392,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
w1 /= uLen;
h1 /= uLen;
for (i = 0; i < uLen; ++i) {
- if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */
- if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) {
- /* next code is a low surrogate */
- Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
- i++;
- curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu);
- } else {
- /* missing low surrogate
- replace it with REPLACEMENT CHARACTER (U+FFFD) */
- curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
- }
- } else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
- /* invalid low surrogate
- replace it with REPLACEMENT CHARACTER (U+FFFD) */
- curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
- } else {
- curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
- }
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
}
}
charPos += nBytes;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
new file mode 100644
index 0000000..b5f7d9f
--- /dev/null
+++ b/poppler/UTF.cc
@@ -0,0 +1,47 @@
+#include "goo/gmem.h"
+#include "UTF.h"
+
+// Convert an array of UTF-16 code units to UCS-4.
+//
+// utf16    - UTF-16 code units, one per Unicode element
+// utf16Len - number of elements in utf16
+// ucs4     - if NULL, only the resulting length is computed; otherwise
+//            receives a gmallocn-allocated array of the converted
+//            characters
+// returns the number of UCS-4 characters
+int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
+{
+ int i, n, len;
+ Unicode *u;
+
+ // count characters
+ // (a high surrogate immediately followed by a low surrogate counts as
+ // one character; every other code unit counts as one character)
+ len = 0;
+ for (i = 0; i < utf16Len; i++) {
+ if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len &&
+ utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
+ i++; /* surrogate pair */
+ }
+ len++;
+ }
+ // length-only query: nothing is allocated
+ if (ucs4 == NULL)
+ return len;
+
+ u = (Unicode*)gmallocn(len, sizeof(Unicode));
+ n = 0;
+ // convert string
+ // (this pass writes exactly one output element per character counted
+ // above, so n ends up equal to len)
+ for (i = 0; i < utf16Len; i++) {
+ if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
+ if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
+ /* next code is a low surrogate */
+ /* combine the low 10 bits of each half and add the 0x10000 offset */
+ u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000;
+ ++i;
+ } else {
+ /* missing low surrogate
+ replace it with REPLACEMENT CHARACTER (U+FFFD) */
+ u[n] = 0xfffd;
+ }
+ } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
+ /* invalid low surrogate
+ replace it with REPLACEMENT CHARACTER (U+FFFD) */
+ u[n] = 0xfffd;
+ } else {
+ u[n] = utf16[i];
+ }
+ n++;
+ }
+ *ucs4 = u;
+ return len;
+}
+
diff --git a/poppler/UTF.h b/poppler/UTF.h
new file mode 100644
index 0000000..d0ef5bc
--- /dev/null
+++ b/poppler/UTF.h
@@ -0,0 +1,103 @@
+//========================================================================
+//
+// UTF.h
+//
+// Copyright 2001-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+//========================================================================
+//
+// Modified under the Poppler project - http://poppler.freedesktop.org
+//
+// All changes made under the Poppler project to this file are licensed
+// under GPL version 2 or later
+//
+// Copyright (C) 2008 Koji Otani <sho at bbr.jp>
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
+#ifndef UTF_H
+#define UTF_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include "CharTypes.h"
+
+// Convert a UTF-16 string to a UCS-4
+// utf16 - utf16 bytes
+// utf16_len - number of UTF-16 characters
+// ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree.
+// returns number of UCS-4 characters
+int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
+
+
+// Encode the code point u as UTF-8 into buf.
+//
+// u       - code point to encode
+// buf     - output buffer; no terminating NUL is written
+// bufSize - number of bytes available in buf
+// returns the number of bytes written (1 to 4), or 0 when buf is too
+// small for the encoding or u is greater than 0x10ffff
+//
+// Values in the surrogate range 0xd800-0xdfff are not rejected; they are
+// encoded through the three-byte branch like any other value <= 0xffff.
+static int mapUTF8(Unicode u, char *buf, int bufSize) {
+ // one byte: 0xxxxxxx
+ if (u <= 0x0000007f) {
+ if (bufSize < 1) {
+ return 0;
+ }
+ buf[0] = (char)u;
+ return 1;
+ // two bytes: 110xxxxx 10xxxxxx
+ } else if (u <= 0x000007ff) {
+ if (bufSize < 2) {
+ return 0;
+ }
+ buf[0] = (char)(0xc0 + (u >> 6));
+ buf[1] = (char)(0x80 + (u & 0x3f));
+ return 2;
+ // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+ } else if (u <= 0x0000ffff) {
+ if (bufSize < 3) {
+ return 0;
+ }
+ buf[0] = (char)(0xe0 + (u >> 12));
+ buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
+ buf[2] = (char)(0x80 + (u & 0x3f));
+ return 3;
+ // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ } else if (u <= 0x0010ffff) {
+ if (bufSize < 4) {
+ return 0;
+ }
+ buf[0] = (char)(0xf0 + (u >> 18));
+ buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
+ buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
+ buf[3] = (char)(0x80 + (u & 0x3f));
+ return 4;
+ // above 0x10ffff: nothing is written
+ } else {
+ return 0;
+ }
+}
+
+// Encode the code point u as big-endian 16-bit units into buf.
+//
+// u       - code point to encode
+// buf     - output buffer; bytes are written high byte first
+// bufSize - number of bytes available in buf
+// returns 2 when u <= 0xffff (a single 16-bit unit), 4 when u is below
+// 0x110000 (a surrogate pair), or 0 when buf is too small or u is
+// 0x110000 or greater
+static int mapUCS2(Unicode u, char *buf, int bufSize) {
+ if (u <= 0xffff) {
+ if (bufSize < 2) {
+ return 0;
+ }
+ buf[0] = (char)((u >> 8) & 0xff);
+ buf[1] = (char)(u & 0xff);
+ return 2;
+ } else if (u < 0x110000) {
+ Unicode uu;
+
+ /* using surrogate pair */
+ if (bufSize < 4) {
+ return 0;
+ }
+ /* high surrogate: upper 10 bits of (u - 0x10000) */
+ uu = ((u - 0x10000) >> 10) + 0xd800;
+ buf[0] = (char)((uu >> 8) & 0xff);
+ buf[1] = (char)(uu & 0xff);
+ /* low surrogate: low 10 bits; subtracting 0x10000 does not change
+ them, so u is masked directly */
+ uu = (u & 0x3ff)+0xdc00;
+ buf[2] = (char)((uu >> 8) & 0xff);
+ buf[3] = (char)(uu & 0xff);
+ return 4;
+ } else {
+ return 0;
+ }
+}
+
+#endif
diff --git a/poppler/UTF8.h b/poppler/UTF8.h
deleted file mode 100644
index 34a07d4..0000000
--- a/poppler/UTF8.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//========================================================================
-//
-// UTF8.h
-//
-// Copyright 2001-2003 Glyph & Cog, LLC
-//
-//========================================================================
-
-//========================================================================
-//
-// Modified under the Poppler project - http://poppler.freedesktop.org
-//
-// All changes made under the Poppler project to this file are licensed
-// under GPL version 2 or later
-//
-// Copyright (C) 2008 Koji Otani <sho at bbr.jp>
-//
-// To see a description of the changes please see the Changelog file that
-// came with your tarball or type make ChangeLog if you are building from git
-//
-//========================================================================
-
-static int mapUTF8(Unicode u, char *buf, int bufSize) {
- if (u <= 0x0000007f) {
- if (bufSize < 1) {
- return 0;
- }
- buf[0] = (char)u;
- return 1;
- } else if (u <= 0x000007ff) {
- if (bufSize < 2) {
- return 0;
- }
- buf[0] = (char)(0xc0 + (u >> 6));
- buf[1] = (char)(0x80 + (u & 0x3f));
- return 2;
- } else if (u <= 0x0000ffff) {
- if (bufSize < 3) {
- return 0;
- }
- buf[0] = (char)(0xe0 + (u >> 12));
- buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
- buf[2] = (char)(0x80 + (u & 0x3f));
- return 3;
- } else if (u <= 0x0010ffff) {
- if (bufSize < 4) {
- return 0;
- }
- buf[0] = (char)(0xf0 + (u >> 18));
- buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
- buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
- buf[3] = (char)(0x80 + (u & 0x3f));
- return 4;
- } else {
- return 0;
- }
-}
-
-static int mapUCS2(Unicode u, char *buf, int bufSize) {
- if (u <= 0xffff) {
- if (bufSize < 2) {
- return 0;
- }
- buf[0] = (char)((u >> 8) & 0xff);
- buf[1] = (char)(u & 0xff);
- return 2;
- } else if (u < 0x110000) {
- Unicode uu;
-
- /* using surrogate pair */
- if (bufSize < 4) {
- return 0;
- }
- uu = ((u - 0x10000) >> 10) + 0xd800;
- buf[0] = (char)((uu >> 8) & 0xff);
- buf[1] = (char)(uu & 0xff);
- uu = (u & 0x3ff)+0xdc00;
- buf[2] = (char)((uu >> 8) & 0xff);
- buf[3] = (char)(uu & 0xff);
- return 4;
- } else {
- return 0;
- }
-}
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 83f65d5..b3bb17d 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -400,19 +400,7 @@ void HtmlPage::addChar(GfxState *state, double x, double y,
h1 /= uLen;
}
for (i = 0; i < uLen; ++i) {
- Unicode u1 = u[i];
- if (u1 >= 0xd800 && u1 <= 0xdbff && i < uLen) {
- // surrogate pair
- const Unicode u2 = u[i + 1];
- if (u2 >= 0xdc00 && u2 <= 0xdfff) {
- u1 = 0x10000 + ((u1 - 0xd800) << 10) + (u2 - 0xdc00);
-
- curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1);
- }
- ++i;
- } else {
- curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1);
- }
+ curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
}
}
More information about the poppler
mailing list