[poppler] poppler/Lexer.cc poppler/UTF.cc poppler/UTF.h qt5/tests qt6/tests

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Jun 13 22:24:26 UTC 2023


 poppler/Lexer.cc                 |   27 +++++++++++-------
 poppler/UTF.cc                   |    3 ++
 poppler/UTF.h                    |   11 +++++++
 qt5/tests/CMakeLists.txt         |    3 +-
 qt5/tests/check_utf8document.cpp |   57 +++++++++++++++++++++++++++++++++++++++
 qt6/tests/CMakeLists.txt         |    1 
 qt6/tests/check_utf8document.cpp |   57 +++++++++++++++++++++++++++++++++++++++
 7 files changed, 148 insertions(+), 11 deletions(-)

New commits:
commit 9183da4fcb8d06360ed51f7f1131a14300008735
Author: Sune Vuorela <sune at vuorela.dk>
Date:   Tue Jun 13 22:24:24 2023 +0000

    Fix reading of utf8-with-bom files

diff --git a/poppler/Lexer.cc b/poppler/Lexer.cc
index ab25caf5..01548950 100644
--- a/poppler/Lexer.cc
+++ b/poppler/Lexer.cc
@@ -33,6 +33,7 @@
 #include <cctype>
 #include "Lexer.h"
 #include "Error.h"
+#include "UTF.h"
 #include "XRef.h"
 
 //------------------------------------------------------------------------
@@ -163,7 +164,7 @@ Object Lexer::getObj(int objNum)
     int xi;
     long long xll = 0;
     double xf = 0, scale;
-    GooString *s;
+    std::unique_ptr<GooString> s;
     int n, m;
 
     // skip whitespace and comments
@@ -389,7 +390,7 @@ Object Lexer::getObj(int objNum)
             if (c2 != EOF) {
                 if (n == tokBufSize) {
                     if (!s) {
-                        s = new GooString(tokBuf, tokBufSize);
+                        s = std::make_unique<GooString>(tokBuf, tokBufSize);
                     } else {
                         s->append(tokBuf, tokBufSize);
                     }
@@ -402,7 +403,7 @@ Object Lexer::getObj(int objNum)
                         if (newObjNum != objNum) {
                             error(errSyntaxError, getPos(), "Unterminated string");
                             done = true;
-                            delete s;
+                            s.reset();
                             n = -2;
                         }
                     }
@@ -413,11 +414,15 @@ Object Lexer::getObj(int objNum)
         } while (!done);
         if (n >= 0) {
             if (!s) {
-                s = new GooString(tokBuf, n);
+                s = std::make_unique<GooString>(tokBuf, n);
             } else {
                 s->append(tokBuf, n);
             }
-            return Object(s);
+            // Check utf8
+            if (isUtf8WithBom(s->toStr())) {
+                s = utf8ToUtf16WithBom(s->toStr());
+            }
+            return Object(s.release());
         } else {
             return Object(objEOF);
         }
@@ -464,7 +469,7 @@ Object Lexer::getObj(int objNum)
             } else if (n == tokBufSize) {
                 error(errSyntaxError, getPos(), "Warning: name token is longer than what the specification says it can be");
                 *p = c;
-                s = new GooString(tokBuf, n);
+                s = std::make_unique<GooString>(tokBuf, n);
             } else {
                 s->append((char)c);
             }
@@ -474,7 +479,6 @@ Object Lexer::getObj(int objNum)
             return Object(objName, tokBuf);
         } else {
             Object obj(objName, s->c_str());
-            delete s;
             return obj;
         }
         break;
@@ -525,7 +529,7 @@ Object Lexer::getObj(int objNum)
                     if (++m == 2) {
                         if (n == tokBufSize) {
                             if (!s) {
-                                s = new GooString(tokBuf, tokBufSize);
+                                s = std::make_unique<GooString>(tokBuf, tokBufSize);
                             } else {
                                 s->append(tokBuf, tokBufSize);
                             }
@@ -540,14 +544,17 @@ Object Lexer::getObj(int objNum)
                 }
             }
             if (!s) {
-                s = new GooString(tokBuf, n);
+                s = std::make_unique<GooString>(tokBuf, n);
             } else {
                 s->append(tokBuf, n);
             }
             if (m == 1) {
                 s->append((char)(c2 << 4));
             }
-            return Object(s);
+            if (isUtf8WithBom(s->toStr())) {
+                s = utf8ToUtf16WithBom(s->toStr());
+            }
+            return Object(s.release());
         }
         break;
 
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 9b1bf954..2ea00895 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -356,6 +356,9 @@ int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8)
 // Allocate utf16 string and convert utf8 into it.
 uint16_t *utf8ToUtf16(const char *utf8, int *len)
 {
+    if (isUtf8WithBom(utf8)) {
+        utf8 += 3;
+    }
     int n = utf8CountUtf16CodeUnits(utf8);
     if (len) {
         *len = n;
diff --git a/poppler/UTF.h b/poppler/UTF.h
index 626c6862..312f231d 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -73,6 +73,17 @@ int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char *utf8, uint16_t *utf16, int ma
 // Allocate utf16 string and convert utf8 into it.
 uint16_t POPPLER_PRIVATE_EXPORT *utf8ToUtf16(const char *utf8, int *len = nullptr);
 
+inline bool isUtf8WithBom(std::string_view str)
+{
+    if (str.size() < 4) {
+        return false;
+    }
+    if (str[0] == '\xef' && str[1] == '\xbb' && str[2] == '\xbf') {
+        return true;
+    }
+    return false;
+}
+
 // Converts a UTF-8 string to a big endian UTF-16 string with BOM.
 // The caller owns the returned pointer.
 //  utf8 - UTF-8 string to convert. An empty string is acceptable.
diff --git a/qt5/tests/CMakeLists.txt b/qt5/tests/CMakeLists.txt
index 0b1931ba..9de870ee 100644
--- a/qt5/tests/CMakeLists.txt
+++ b/qt5/tests/CMakeLists.txt
@@ -17,7 +17,7 @@ macro(QT5_ADD_SIMPLETEST exe source)
 endmacro(QT5_ADD_SIMPLETEST)
 
 macro(QT5_ADD_QTEST exe source)
-  if (Qt5Test_FOUND)	
+  if (Qt5Test_FOUND)
     string(REPLACE "-" "" test_name ${exe})
     set(${test_name}_SOURCES
       ${source}
@@ -71,6 +71,7 @@ qt5_add_qtest(check_qt5_stroke_opacity check_stroke_opacity.cpp)
 qt5_add_qtest(check_qt5_utf_conversion check_utf_conversion.cpp)
 qt5_add_qtest(check_qt5_outline check_outline.cpp)
 qt5_add_qtest(check_qt5_signature_basics check_signature_basics.cpp)
+qt5_add_qtest(check_qt5_utf8document check_utf8document.cpp)
 qt5_add_qtest(check_qt5_distinguished_name_parser check_distinguished_name_parser.cpp)
 qt5_add_qtest(check_qt5_cidfontswidthsbuilder check_cidfontswidthsbuilder.cpp)
 if (NOT WIN32)
diff --git a/qt5/tests/check_utf8document.cpp b/qt5/tests/check_utf8document.cpp
new file mode 100644
index 00000000..ebeb22c6
--- /dev/null
+++ b/qt5/tests/check_utf8document.cpp
@@ -0,0 +1,57 @@
+#include <QtTest/QtTest>
+
+#include "PDFDoc.h"
+#include "GlobalParams.h"
+
+#include "Outline.h"
+#include "poppler-private.h"
+
+class TestUtf8Document : public QObject
+{
+    Q_OBJECT
+public:
+    explicit TestUtf8Document(QObject *parent = nullptr) : QObject(parent) { }
+private Q_SLOTS:
+    void checkStrings();
+};
+
+inline QString outlineItemTitle(OutlineItem *item)
+{
+    if (!item) {
+        return {};
+    }
+    return QString::fromUcs4(item->getTitle(), item->getTitleLength());
+}
+
+void TestUtf8Document::checkStrings()
+{
+
+    globalParams = std::make_unique<GlobalParams>();
+    auto doc = std::make_unique<PDFDoc>(std::make_unique<GooString>(TESTDATADIR "/unittestcases/pdf20-utf8-test.pdf"));
+    QVERIFY(doc);
+    QVERIFY(doc->isOk());
+
+    QVERIFY(doc->getOptContentConfig() && doc->getOptContentConfig()->hasOCGs());
+
+    QCOMPARE(Poppler::UnicodeParsedString(doc->getDocInfoTitle().get()), QStringLiteral("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀"));
+
+    QSet<QString> expectedNames { QStringLiteral("گچپژ"), QStringLiteral("Layer 1") };
+    QSet<QString> foundNames;
+
+    for (auto &[ref, group] : doc->getOptContentConfig()->getOCGs()) {
+        foundNames.insert(Poppler::UnicodeParsedString(group->getName()));
+    }
+    QCOMPARE(expectedNames, foundNames);
+
+    auto outlineItems = doc->getOutline()->getItems();
+    QVERIFY(outlineItems);
+    QCOMPARE(outlineItems->size(), 3);
+
+    QCOMPARE(outlineItemTitle(outlineItems->at(0)), QStringLiteral("PDF 2.0 with UTF-8 test file"));
+    QCOMPARE(outlineItemTitle(outlineItems->at(1)), QStringLiteral("\u202A\u202Atest\u202A"));
+    QCOMPARE(outlineItemTitle(outlineItems->at(2)), QStringLiteral("🌈️\n" /*emoji rainbow flag*/));
+}
+
+QTEST_GUILESS_MAIN(TestUtf8Document)
+
+#include "check_utf8document.moc"
diff --git a/qt6/tests/CMakeLists.txt b/qt6/tests/CMakeLists.txt
index da18b15d..577aad7f 100644
--- a/qt6/tests/CMakeLists.txt
+++ b/qt6/tests/CMakeLists.txt
@@ -63,6 +63,7 @@ qt6_add_qtest(check_qt6_stroke_opacity check_stroke_opacity.cpp)
 qt6_add_qtest(check_qt6_utf_conversion check_utf_conversion.cpp)
 qt6_add_qtest(check_qt6_outline check_outline.cpp)
 qt6_add_qtest(check_qt6_signature_basics check_signature_basics.cpp)
+qt6_add_qtest(check_qt6_utf8document check_utf8document.cpp)
 qt6_add_qtest(check_qt6_distinguished_name_parser check_distinguished_name_parser.cpp)
 qt6_add_qtest(check_qt6_cidfontswidthsbuilder check_cidfontswidthsbuilder.cpp)
 if (NOT WIN32)
diff --git a/qt6/tests/check_utf8document.cpp b/qt6/tests/check_utf8document.cpp
new file mode 100644
index 00000000..ebeb22c6
--- /dev/null
+++ b/qt6/tests/check_utf8document.cpp
@@ -0,0 +1,57 @@
+#include <QtTest/QtTest>
+
+#include "PDFDoc.h"
+#include "GlobalParams.h"
+
+#include "Outline.h"
+#include "poppler-private.h"
+
+class TestUtf8Document : public QObject
+{
+    Q_OBJECT
+public:
+    explicit TestUtf8Document(QObject *parent = nullptr) : QObject(parent) { }
+private Q_SLOTS:
+    void checkStrings();
+};
+
+inline QString outlineItemTitle(OutlineItem *item)
+{
+    if (!item) {
+        return {};
+    }
+    return QString::fromUcs4(item->getTitle(), item->getTitleLength());
+}
+
+void TestUtf8Document::checkStrings()
+{
+
+    globalParams = std::make_unique<GlobalParams>();
+    auto doc = std::make_unique<PDFDoc>(std::make_unique<GooString>(TESTDATADIR "/unittestcases/pdf20-utf8-test.pdf"));
+    QVERIFY(doc);
+    QVERIFY(doc->isOk());
+
+    QVERIFY(doc->getOptContentConfig() && doc->getOptContentConfig()->hasOCGs());
+
+    QCOMPARE(Poppler::UnicodeParsedString(doc->getDocInfoTitle().get()), QStringLiteral("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀"));
+
+    QSet<QString> expectedNames { QStringLiteral("گچپژ"), QStringLiteral("Layer 1") };
+    QSet<QString> foundNames;
+
+    for (auto &[ref, group] : doc->getOptContentConfig()->getOCGs()) {
+        foundNames.insert(Poppler::UnicodeParsedString(group->getName()));
+    }
+    QCOMPARE(expectedNames, foundNames);
+
+    auto outlineItems = doc->getOutline()->getItems();
+    QVERIFY(outlineItems);
+    QCOMPARE(outlineItems->size(), 3);
+
+    QCOMPARE(outlineItemTitle(outlineItems->at(0)), QStringLiteral("PDF 2.0 with UTF-8 test file"));
+    QCOMPARE(outlineItemTitle(outlineItems->at(1)), QStringLiteral("\u202A\u202Atest\u202A"));
+    QCOMPARE(outlineItemTitle(outlineItems->at(2)), QStringLiteral("🌈️\n" /*emoji rainbow flag*/));
+}
+
+QTEST_GUILESS_MAIN(TestUtf8Document)
+
+#include "check_utf8document.moc"


More information about the poppler mailing list