[poppler] 2 commits - poppler/UTF.cc qt5/tests

Fri Jul 10 21:00:44 UTC 2020

poppler/UTF.cc                     |   17 ++++++++++++++++-
 qt5/tests/check_utf_conversion.cpp |   32 ++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

New commits:
commit 9a880ecd7d865a12b0f91f56285907bbb409f32f
Author: Nelson Benítez León <nbenitezl at gmail.com>
Date:   Thu Jul 9 01:36:24 2020 -0400

    Add test for UTF16LE string support
    
    Issue #941

diff --git a/qt5/tests/check_utf_conversion.cpp b/qt5/tests/check_utf_conversion.cpp
index f28829f4..1f04c2a5 100644
--- a/qt5/tests/check_utf_conversion.cpp
+++ b/qt5/tests/check_utf_conversion.cpp
@@ -4,6 +4,7 @@
 #include <poppler-private.h>
 
 #include <cstring>
+#include <cstdint> // for uint16_t
 
 #include "GlobalParams.h"
 #include "UnicodeTypeTable.h"
@@ -18,6 +19,7 @@ private slots:
     void testUTF_data();
     void testUTF();
     void testUnicodeToAscii7();
+    void testUnicodeLittleEndian();
 };
 
 static bool compare(const char *a, const char *b)
@@ -143,5 +145,35 @@ void TestUTFConversion::testUnicodeToAscii7()
     free(out_ascii_idx);
 }
 
+// Checks that TextStringToUCS4() decodes UTF16-LE and UTF16-BE input to the same code points.
+void TestUTFConversion::testUnicodeLittleEndian()
+{
+    // Explicit byte sequences, so the Byte Order Marks do not depend on the host's endianness
+    const char UTF16LE_hi[] = { '\xFF', '\xFE', 'H', '\0', 'I', '\0', '!', '\0' }; // UTF16-LE "HI!"
+    GooString GooUTF16LE(UTF16LE_hi, static_cast<int>(sizeof(UTF16LE_hi)));
+    const char UTF16BE_hi[] = { '\xFE', '\xFF', '\0', 'H', '\0', 'I', '\0', '!' }; // UTF16-BE "HI!"
+    GooString GooUTF16BE(UTF16BE_hi, static_cast<int>(sizeof(UTF16BE_hi)));
+
+    // QVERIFY/QCOMPARE rather than Q_ASSERT, which is compiled out when QT_NO_DEBUG is defined
+    QVERIFY(GooUTF16LE.cmp(&GooUTF16BE) != 0); // the raw byte strings must differ
+    // NOTE(review): the buffers returned through the out-parameters are never released here; confirm ownership
+    Unicode *UCS4fromLE, *UCS4fromBE;
+    const int len1 = TextStringToUCS4(&GooUTF16LE, &UCS4fromLE);
+    const int len2 = TextStringToUCS4(&GooUTF16BE, &UCS4fromBE);
+
+    // 3 because TextStringToUCS4() strips the leading Byte Order Mark (BOM) from each string
+    QCOMPARE(len1, len2);
+    QCOMPARE(len1, 3);
+    // After conversion both buffers must hold the same code points
+    for (int i = 0; i < len1; i++) {
+        QCOMPARE(UCS4fromLE[i], UCS4fromBE[i]);
+    }
+
+    // Final verification: both decoded strings spell "HI!"
+    QVERIFY(*UCS4fromLE == *UCS4fromBE);
+    QVERIFY(compare(UCS4fromLE, "HI!", 3));
+    QVERIFY(compare(UCS4fromBE, "HI!", 3));
+}
+
 QTEST_GUILESS_MAIN(TestUTFConversion)
 #include "check_utf_conversion.moc"
commit 232cba307e8be35022426ba85f34198af7406899
Author: Nelson Benítez León <nbenitezl at gmail.com>
Date:   Thu Jul 9 01:37:20 2020 -0400

    Make TextStringToUCS4() support UTF16-LE too
    
    UTF16-LE strings can 'de facto' appear in PDFs
    (e.g. the title of Outline items) and Acrobat displays
    them fine, so let's support them so we don't
    show an ugly 'ÿþ' at the start of the text (Okular)
    or even no text at all (Evince).
    
    Issue #941
    
    Evince issue:
    https://gitlab.gnome.org/GNOME/evince/-/issues/1444

diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 112986af..d231bde1 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -90,6 +90,7 @@ int TextStringToUCS4(const GooString *textStr, Unicode **ucs4)
     int i, len;
     const char *s;
     Unicode *u;
+    bool isUnicode, isUnicodeLE;
 
     len = textStr->getLength();
     s = textStr->c_str();
@@ -99,12 +100,26 @@ int TextStringToUCS4(const GooString *textStr, Unicode **ucs4)
     }
 
     if (textStr->hasUnicodeMarker()) {
+        isUnicode = true;
+        isUnicodeLE = false;
+    } else if (textStr->hasUnicodeMarkerLE()) {
+        isUnicode = false;
+        isUnicodeLE = true;
+    } else {
+        isUnicode = false;
+        isUnicodeLE = false;
+    }
+
+    if (isUnicode || isUnicodeLE) {
         Unicode *utf16;
         len = len / 2 - 1;
         if (len > 0) {
             utf16 = new Unicode[len];
             for (i = 0; i < len; i++) {
-                utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
+                if (isUnicode) // big-endian: the first byte of each pair is the high byte
+                    utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
+                else // UnicodeLE: the second byte of each pair is the high byte, so shift it up
+                    utf16[i] = (s[2 + i * 2] & 0xff) | ((s[3 + i * 2] & 0xff) << 8);
             }
             len = UTF16toUCS4(utf16, len, &u);
             delete[] utf16;