[poppler] poppler/Lexer.cc poppler/UTF.cc poppler/UTF.h qt5/tests qt6/tests
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Tue Jun 13 22:24:26 UTC 2023
poppler/Lexer.cc | 27 +++++++++++-------
poppler/UTF.cc | 3 ++
poppler/UTF.h | 11 +++++++
qt5/tests/CMakeLists.txt | 3 +-
qt5/tests/check_utf8document.cpp | 57 +++++++++++++++++++++++++++++++++++++++
qt6/tests/CMakeLists.txt | 1
qt6/tests/check_utf8document.cpp | 57 +++++++++++++++++++++++++++++++++++++++
7 files changed, 148 insertions(+), 11 deletions(-)
New commits:
commit 9183da4fcb8d06360ed51f7f1131a14300008735
Author: Sune Vuorela <sune at vuorela.dk>
Date: Tue Jun 13 22:24:24 2023 +0000
Fix reading of utf8-with-bom files
diff --git a/poppler/Lexer.cc b/poppler/Lexer.cc
index ab25caf5..01548950 100644
--- a/poppler/Lexer.cc
+++ b/poppler/Lexer.cc
@@ -33,6 +33,7 @@
#include <cctype>
#include "Lexer.h"
#include "Error.h"
+#include "UTF.h"
#include "XRef.h"
//------------------------------------------------------------------------
@@ -163,7 +164,7 @@ Object Lexer::getObj(int objNum)
int xi;
long long xll = 0;
double xf = 0, scale;
- GooString *s;
+ std::unique_ptr<GooString> s;
int n, m;
// skip whitespace and comments
@@ -389,7 +390,7 @@ Object Lexer::getObj(int objNum)
if (c2 != EOF) {
if (n == tokBufSize) {
if (!s) {
- s = new GooString(tokBuf, tokBufSize);
+ s = std::make_unique<GooString>(tokBuf, tokBufSize);
} else {
s->append(tokBuf, tokBufSize);
}
@@ -402,7 +403,7 @@ Object Lexer::getObj(int objNum)
if (newObjNum != objNum) {
error(errSyntaxError, getPos(), "Unterminated string");
done = true;
- delete s;
+ s.reset();
n = -2;
}
}
@@ -413,11 +414,15 @@ Object Lexer::getObj(int objNum)
} while (!done);
if (n >= 0) {
if (!s) {
- s = new GooString(tokBuf, n);
+ s = std::make_unique<GooString>(tokBuf, n);
} else {
s->append(tokBuf, n);
}
- return Object(s);
+ // Check utf8
+ if (isUtf8WithBom(s->toStr())) {
+ s = utf8ToUtf16WithBom(s->toStr());
+ }
+ return Object(s.release());
} else {
return Object(objEOF);
}
@@ -464,7 +469,7 @@ Object Lexer::getObj(int objNum)
} else if (n == tokBufSize) {
error(errSyntaxError, getPos(), "Warning: name token is longer than what the specification says it can be");
*p = c;
- s = new GooString(tokBuf, n);
+ s = std::make_unique<GooString>(tokBuf, n);
} else {
s->append((char)c);
}
@@ -474,7 +479,6 @@ Object Lexer::getObj(int objNum)
return Object(objName, tokBuf);
} else {
Object obj(objName, s->c_str());
- delete s;
return obj;
}
break;
@@ -525,7 +529,7 @@ Object Lexer::getObj(int objNum)
if (++m == 2) {
if (n == tokBufSize) {
if (!s) {
- s = new GooString(tokBuf, tokBufSize);
+ s = std::make_unique<GooString>(tokBuf, tokBufSize);
} else {
s->append(tokBuf, tokBufSize);
}
@@ -540,14 +544,17 @@ Object Lexer::getObj(int objNum)
}
}
if (!s) {
- s = new GooString(tokBuf, n);
+ s = std::make_unique<GooString>(tokBuf, n);
} else {
s->append(tokBuf, n);
}
if (m == 1) {
s->append((char)(c2 << 4));
}
- return Object(s);
+ if (isUtf8WithBom(s->toStr())) {
+ s = utf8ToUtf16WithBom(s->toStr());
+ }
+ return Object(s.release());
}
break;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 9b1bf954..2ea00895 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -356,6 +356,9 @@ int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8)
// Allocate utf16 string and convert utf8 into it.
uint16_t *utf8ToUtf16(const char *utf8, int *len)
{
+ if (isUtf8WithBom(utf8)) {
+ utf8 += 3;
+ }
int n = utf8CountUtf16CodeUnits(utf8);
if (len) {
*len = n;
diff --git a/poppler/UTF.h b/poppler/UTF.h
index 626c6862..312f231d 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -73,6 +73,17 @@ int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char *utf8, uint16_t *utf16, int ma
// Allocate utf16 string and convert utf8 into it.
uint16_t POPPLER_PRIVATE_EXPORT *utf8ToUtf16(const char *utf8, int *len = nullptr);
+inline bool isUtf8WithBom(std::string_view str)
+{
+ if (str.size() < 4) {
+ return false;
+ }
+ if (str[0] == '\xef' && str[1] == '\xbb' && str[2] == '\xbf') {
+ return true;
+ }
+ return false;
+}
+
// Converts a UTF-8 string to a big endian UTF-16 string with BOM.
// The caller owns the returned pointer.
// utf8 - UTF-8 string to convert. An empty string is acceptable.
diff --git a/qt5/tests/CMakeLists.txt b/qt5/tests/CMakeLists.txt
index 0b1931ba..9de870ee 100644
--- a/qt5/tests/CMakeLists.txt
+++ b/qt5/tests/CMakeLists.txt
@@ -17,7 +17,7 @@ macro(QT5_ADD_SIMPLETEST exe source)
endmacro(QT5_ADD_SIMPLETEST)
macro(QT5_ADD_QTEST exe source)
- if (Qt5Test_FOUND)
+ if (Qt5Test_FOUND)
string(REPLACE "-" "" test_name ${exe})
set(${test_name}_SOURCES
${source}
@@ -71,6 +71,7 @@ qt5_add_qtest(check_qt5_stroke_opacity check_stroke_opacity.cpp)
qt5_add_qtest(check_qt5_utf_conversion check_utf_conversion.cpp)
qt5_add_qtest(check_qt5_outline check_outline.cpp)
qt5_add_qtest(check_qt5_signature_basics check_signature_basics.cpp)
+qt5_add_qtest(check_qt5_utf8document check_utf8document.cpp)
qt5_add_qtest(check_qt5_distinguished_name_parser check_distinguished_name_parser.cpp)
qt5_add_qtest(check_qt5_cidfontswidthsbuilder check_cidfontswidthsbuilder.cpp)
if (NOT WIN32)
diff --git a/qt5/tests/check_utf8document.cpp b/qt5/tests/check_utf8document.cpp
new file mode 100644
index 00000000..ebeb22c6
--- /dev/null
+++ b/qt5/tests/check_utf8document.cpp
@@ -0,0 +1,57 @@
+#include <QtTest/QtTest>
+
+#include "PDFDoc.h"
+#include "GlobalParams.h"
+
+#include "Outline.h"
+#include "poppler-private.h"
+
+class TestUtf8Document : public QObject
+{
+ Q_OBJECT
+public:
+ explicit TestUtf8Document(QObject *parent = nullptr) : QObject(parent) { }
+private Q_SLOTS:
+ void checkStrings();
+};
+
+inline QString outlineItemTitle(OutlineItem *item)
+{
+ if (!item) {
+ return {};
+ }
+ return QString::fromUcs4(item->getTitle(), item->getTitleLength());
+}
+
+void TestUtf8Document::checkStrings()
+{
+
+ globalParams = std::make_unique<GlobalParams>();
+ auto doc = std::make_unique<PDFDoc>(std::make_unique<GooString>(TESTDATADIR "/unittestcases/pdf20-utf8-test.pdf"));
+ QVERIFY(doc);
+ QVERIFY(doc->isOk());
+
+ QVERIFY(doc->getOptContentConfig() && doc->getOptContentConfig()->hasOCGs());
+
+ QCOMPARE(Poppler::UnicodeParsedString(doc->getDocInfoTitle().get()), QStringLiteral("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀"));
+
+ QSet<QString> expectedNames { QStringLiteral("گچپژ"), QStringLiteral("Layer 1") };
+ QSet<QString> foundNames;
+
+ for (auto &[ref, group] : doc->getOptContentConfig()->getOCGs()) {
+ foundNames.insert(Poppler::UnicodeParsedString(group->getName()));
+ }
+ QCOMPARE(expectedNames, foundNames);
+
+ auto outlineItems = doc->getOutline()->getItems();
+ QVERIFY(outlineItems);
+ QCOMPARE(outlineItems->size(), 3);
+
+ QCOMPARE(outlineItemTitle(outlineItems->at(0)), QStringLiteral("PDF 2.0 with UTF-8 test file"));
+ QCOMPARE(outlineItemTitle(outlineItems->at(1)), QStringLiteral("\u202A\u202Atest\u202A"));
+ QCOMPARE(outlineItemTitle(outlineItems->at(2)), QStringLiteral("🌈️\n" /*emoji rainbow flag*/));
+}
+
+QTEST_GUILESS_MAIN(TestUtf8Document)
+
+#include "check_utf8document.moc"
diff --git a/qt6/tests/CMakeLists.txt b/qt6/tests/CMakeLists.txt
index da18b15d..577aad7f 100644
--- a/qt6/tests/CMakeLists.txt
+++ b/qt6/tests/CMakeLists.txt
@@ -63,6 +63,7 @@ qt6_add_qtest(check_qt6_stroke_opacity check_stroke_opacity.cpp)
qt6_add_qtest(check_qt6_utf_conversion check_utf_conversion.cpp)
qt6_add_qtest(check_qt6_outline check_outline.cpp)
qt6_add_qtest(check_qt6_signature_basics check_signature_basics.cpp)
+qt6_add_qtest(check_qt6_utf8document check_utf8document.cpp)
qt6_add_qtest(check_qt6_distinguished_name_parser check_distinguished_name_parser.cpp)
qt6_add_qtest(check_qt6_cidfontswidthsbuilder check_cidfontswidthsbuilder.cpp)
if (NOT WIN32)
diff --git a/qt6/tests/check_utf8document.cpp b/qt6/tests/check_utf8document.cpp
new file mode 100644
index 00000000..ebeb22c6
--- /dev/null
+++ b/qt6/tests/check_utf8document.cpp
@@ -0,0 +1,57 @@
+#include <QtTest/QtTest>
+
+#include "PDFDoc.h"
+#include "GlobalParams.h"
+
+#include "Outline.h"
+#include "poppler-private.h"
+
+class TestUtf8Document : public QObject
+{
+ Q_OBJECT
+public:
+ explicit TestUtf8Document(QObject *parent = nullptr) : QObject(parent) { }
+private Q_SLOTS:
+ void checkStrings();
+};
+
+inline QString outlineItemTitle(OutlineItem *item)
+{
+ if (!item) {
+ return {};
+ }
+ return QString::fromUcs4(item->getTitle(), item->getTitleLength());
+}
+
+void TestUtf8Document::checkStrings()
+{
+
+ globalParams = std::make_unique<GlobalParams>();
+ auto doc = std::make_unique<PDFDoc>(std::make_unique<GooString>(TESTDATADIR "/unittestcases/pdf20-utf8-test.pdf"));
+ QVERIFY(doc);
+ QVERIFY(doc->isOk());
+
+ QVERIFY(doc->getOptContentConfig() && doc->getOptContentConfig()->hasOCGs());
+
+ QCOMPARE(Poppler::UnicodeParsedString(doc->getDocInfoTitle().get()), QStringLiteral("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀"));
+
+ QSet<QString> expectedNames { QStringLiteral("گچپژ"), QStringLiteral("Layer 1") };
+ QSet<QString> foundNames;
+
+ for (auto &[ref, group] : doc->getOptContentConfig()->getOCGs()) {
+ foundNames.insert(Poppler::UnicodeParsedString(group->getName()));
+ }
+ QCOMPARE(expectedNames, foundNames);
+
+ auto outlineItems = doc->getOutline()->getItems();
+ QVERIFY(outlineItems);
+ QCOMPARE(outlineItems->size(), 3);
+
+ QCOMPARE(outlineItemTitle(outlineItems->at(0)), QStringLiteral("PDF 2.0 with UTF-8 test file"));
+ QCOMPARE(outlineItemTitle(outlineItems->at(1)), QStringLiteral("\u202A\u202Atest\u202A"));
+ QCOMPARE(outlineItemTitle(outlineItems->at(2)), QStringLiteral("🌈️\n" /*emoji rainbow flag*/));
+}
+
+QTEST_GUILESS_MAIN(TestUtf8Document)
+
+#include "check_utf8document.moc"
More information about the poppler mailing list