[poppler] poppler/CharCodeToUnicode.cc poppler/CharCodeToUnicode.h
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Sat Feb 6 23:27:50 UTC 2021
poppler/CharCodeToUnicode.cc | 112 ++++++++++++++++++++++++++++++++++++++++---
poppler/CharCodeToUnicode.h | 5 +
2 files changed, 108 insertions(+), 9 deletions(-)
New commits:
commit 41f5acecf50aa6b16f95aca557dbcf6b49b8f394
Author: Albert Astals Cid <aacid at kde.org>
Date: Sat Feb 6 23:58:07 2021 +0100
Fix parsing text in some broken pdf files
Workaround imported from xpdf 4.02
Fixes issue #1040
diff --git a/poppler/CharCodeToUnicode.cc b/poppler/CharCodeToUnicode.cc
index eba94d62..f7cc5be1 100644
--- a/poppler/CharCodeToUnicode.cc
+++ b/poppler/CharCodeToUnicode.cc
@@ -13,7 +13,7 @@
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
-// Copyright (C) 2006, 2008-2010, 2012, 2018-2020 Albert Astals Cid <aacid at kde.org>
+// Copyright (C) 2006, 2008-2010, 2012, 2018-2021 Albert Astals Cid <aacid at kde.org>
// Copyright (C) 2007 Julien Rebetez <julienr at svn.gnome.org>
// Copyright (C) 2007 Koji Otani <sho at bbr.jp>
// Copyright (C) 2008 Michael Vrable <mvrable at cs.ucsd.edu>
@@ -265,7 +265,10 @@ CharCodeToUnicode *CharCodeToUnicode::parseCMap(const GooString *buf, int nBits)
ctu = new CharCodeToUnicode(nullptr);
const char *p = buf->c_str();
- ctu->parseCMap1(&getCharFromString, &p, nBits);
+ if (!ctu->parseCMap1(&getCharFromString, &p, nBits)) {
+ delete ctu;
+ return nullptr;
+ }
return ctu;
}
@@ -276,8 +279,11 @@ CharCodeToUnicode *CharCodeToUnicode::parseCMapFromFile(const GooString *fileNam
ctu = new CharCodeToUnicode(nullptr);
if ((f = globalParams->findToUnicodeFile(fileName))) {
- ctu->parseCMap1(&getCharFromFile, f, nBits);
- fclose(f);
+ if (!ctu->parseCMap1(&getCharFromFile, f, nBits)) {
+ delete ctu;
+ fclose(f);
+ return nullptr;
+ }
} else {
error(errSyntaxError, -1, "Couldn't find ToUnicode CMap file for '{0:t}'", fileName);
}
@@ -290,7 +296,7 @@ void CharCodeToUnicode::mergeCMap(const GooString *buf, int nBits)
parseCMap1(&getCharFromString, &p, nBits);
}
-void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, int nBits)
+bool CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, int nBits)
{
PSTokenizer *pst;
char tok1[256], tok2[256], tok3[256];
@@ -300,6 +306,7 @@ void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, int n
GooString *name;
FILE *f;
+ bool ok = false;
maxCode = (nBits == 8) ? 0xff : (nBits == 16) ? 0xffff : 0xffffffff;
pst = new PSTokenizer(getCharFunc, data);
pst->getToken(tok1, sizeof(tok1), &n1);
@@ -308,7 +315,9 @@ void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, int n
if (tok1[0] == '/') {
name = new GooString(tok1 + 1);
if ((f = globalParams->findToUnicodeFile(name))) {
- parseCMap1(&getCharFromFile, f, nBits);
+ if (parseCMap1(&getCharFromFile, f, nBits)) {
+ ok = true;
+ }
fclose(f);
} else {
error(errSyntaxError, -1, "Couldn't find ToUnicode CMap file for '{0:t}'", name);
@@ -338,6 +347,7 @@ void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, int n
error(errSyntaxWarning, -1, "Invalid entry in bfchar block in ToUnicode CMap");
}
addMapping(code1, tok2 + 1, n2 - 2, 0);
+ ok = true;
}
pst->getToken(tok1, sizeof(tok1), &n1);
} else if (!strcmp(tok2, "beginbfrange")) {
@@ -376,6 +386,7 @@ void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, int n
if (tok1[0] == '<' && tok1[n1 - 1] == '>') {
tok1[n1 - 1] = '\0';
addMapping(code1 + i, tok1 + 1, n1 - 2, 0);
+ ok = true;
} else {
error(errSyntaxWarning, -1, "Illegal entry in bfrange block in ToUnicode CMap");
}
@@ -385,6 +396,7 @@ void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, int n
tok3[n3 - 1] = '\0';
for (i = 0; code1 <= code2; ++code1, ++i) {
addMapping(code1, tok3 + 1, n3 - 2, i);
+ ok = true;
}
} else {
@@ -392,11 +404,74 @@ void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, int n
}
}
pst->getToken(tok1, sizeof(tok1), &n1);
+ } else if (!strcmp(tok2, "begincidchar")) {
+ // the begincidchar operator is not allowed in ToUnicode CMaps,
+ // but some buggy PDF generators incorrectly use
+ // code-to-CID-type CMaps here
+ error(errSyntaxWarning, -1, "Invalid 'begincidchar' operator in ToUnicode CMap");
+ while (pst->getToken(tok1, sizeof(tok1), &n1)) {
+ if (!strcmp(tok1, "endcidchar")) {
+ break;
+ }
+ if (!pst->getToken(tok2, sizeof(tok2), &n2) || !strcmp(tok2, "endcidchar")) {
+ error(errSyntaxWarning, -1, "Illegal entry in cidchar block in ToUnicode CMap");
+ break;
+ }
+ if (!(tok1[0] == '<' && tok1[n1 - 1] == '>')) {
+ error(errSyntaxWarning, -1, "Illegal entry in cidchar block in ToUnicode CMap");
+ continue;
+ }
+ tok1[n1 - 1] = '\0';
+ if (!parseHex(tok1 + 1, n1 - 2, &code1)) {
+ error(errSyntaxWarning, -1, "Illegal entry in cidchar block in ToUnicode CMap");
+ continue;
+ }
+ if (code1 > maxCode) {
+ error(errSyntaxWarning, -1, "Invalid entry in cidchar block in ToUnicode CMap");
+ }
+ addMappingInt(code1, atoi(tok2));
+ ok = true;
+ }
+ pst->getToken(tok1, sizeof(tok1), &n1);
+ } else if (!strcmp(tok2, "begincidrange")) {
+ // the begincidrange operator is not allowed in ToUnicode CMaps,
+ // but some buggy PDF generators incorrectly use
+ // code-to-CID-type CMaps here
+ error(errSyntaxWarning, -1, "Invalid 'begincidrange' operator in ToUnicode CMap");
+ while (pst->getToken(tok1, sizeof(tok1), &n1)) {
+ if (!strcmp(tok1, "endcidrange")) {
+ break;
+ }
+ if (!pst->getToken(tok2, sizeof(tok2), &n2) || !strcmp(tok2, "endcidrange") || !pst->getToken(tok3, sizeof(tok3), &n3) || !strcmp(tok3, "endcidrange")) {
+ error(errSyntaxWarning, -1, "Illegal entry in cidrange block in ToUnicode CMap");
+ break;
+ }
+ if (!(tok1[0] == '<' && tok1[n1 - 1] == '>' && tok2[0] == '<' && tok2[n2 - 1] == '>')) {
+ error(errSyntaxWarning, -1, "Illegal entry in cidrange block in ToUnicode CMap");
+ continue;
+ }
+ tok1[n1 - 1] = tok2[n2 - 1] = '\0';
+ if (!parseHex(tok1 + 1, n1 - 2, &code1) || !parseHex(tok2 + 1, n2 - 2, &code2)) {
+ error(errSyntaxWarning, -1, "Illegal entry in cidrange block in ToUnicode CMap");
+ continue;
+ }
+ if (code1 > maxCode || code2 > maxCode) {
+ error(errSyntaxWarning, -1, "Invalid entry in cidrange block in ToUnicode CMap");
+ if (code2 > maxCode) {
+ code2 = maxCode;
+ }
+ }
+ for (i = atoi(tok3); code1 <= code2; ++code1, ++i) {
+ addMappingInt(code1, i);
+ ok = true;
+ }
+ }
+ pst->getToken(tok1, sizeof(tok1), &n1);
} else {
strcpy(tok1, tok2);
}
}
- delete pst;
+ return ok;
}
void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n, int offset)
@@ -458,6 +533,29 @@ void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n, int offset)
}
}
+void CharCodeToUnicode::addMappingInt(CharCode code, Unicode u) // Store u as the value for code in map[], growing map[] on demand; returns nothing and reports no error.
+{
+ CharCode oldLen, i;
+
+ if (code > 0xffffff) { // codes above 0xffffff are silently ignored (no mapping is stored)
+ // This is an arbitrary limit to avoid integer overflow issues.
+ // (I've seen CMaps with mappings for <ffffffff>.)
+ return;
+ }
+ if (code >= mapLen) { // table too small to hold this code: enlarge it first
+ oldLen = mapLen; // remember the old size so only the new tail is cleared below
+ mapLen = mapLen ? 2 * mapLen : 256; // double the table, or start at 256 entries when it is empty
+ if (code >= mapLen) { // doubling was still not enough
+ mapLen = (code + 256) & ~255; // smallest multiple of 256 strictly greater than code
+ }
+ map = (Unicode *)greallocn(map, mapLen, sizeof(Unicode)); // presumably realloc-style (existing entries kept) — confirm against greallocn
+ for (i = oldLen; i < mapLen; ++i) { // clear only the newly added entries
+ map[i] = 0;
+ }
+ }
+ map[code] = u; // overwrites whatever value was previously stored for this code
+}
+
CharCodeToUnicode::CharCodeToUnicode()
{
tag = nullptr;
diff --git a/poppler/CharCodeToUnicode.h b/poppler/CharCodeToUnicode.h
index cbcfc0ad..e2434b23 100644
--- a/poppler/CharCodeToUnicode.h
+++ b/poppler/CharCodeToUnicode.h
@@ -17,7 +17,7 @@
//
// Copyright (C) 2007 Julien Rebetez <julienr at svn.gnome.org>
// Copyright (C) 2007 Koji Otani <sho at bbr.jp>
-// Copyright (C) 2008, 2011, 2012, 2018, 2019 Albert Astals Cid <aacid at kde.org>
+// Copyright (C) 2008, 2011, 2012, 2018, 2019, 2021 Albert Astals Cid <aacid at kde.org>
// Copyright (C) 2017 Adrian Johnson <ajohnson at redneon.com>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info at kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2018 Adam Reichold <adam.reichold at t-online.de>
@@ -99,8 +99,9 @@ public:
CharCode getLength() const { return mapLen; }
private:
- void parseCMap1(int (*getCharFunc)(void *), void *data, int nBits);
+ bool parseCMap1(int (*getCharFunc)(void *), void *data, int nBits);
void addMapping(CharCode code, char *uStr, int n, int offset);
+ void addMappingInt(CharCode code, Unicode u);
CharCodeToUnicode();
CharCodeToUnicode(GooString *tagA);
CharCodeToUnicode(GooString *tagA, Unicode *mapA, CharCode mapLenA, bool copyMap, CharCodeToUnicodeString *sMapA, int sMapLenA, int sMapSizeA);
More information about the poppler
mailing list