[poppler] 2 commits - poppler/UTF.cc qt5/tests
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Fri Jul 10 21:00:44 UTC 2020
poppler/UTF.cc | 17 ++++++++++++++++-
qt5/tests/check_utf_conversion.cpp | 32 ++++++++++++++++++++++++++++++++
2 files changed, 48 insertions(+), 1 deletion(-)
New commits:
commit 9a880ecd7d865a12b0f91f56285907bbb409f32f
Author: Nelson Benítez León <nbenitezl at gmail.com>
Date: Thu Jul 9 01:36:24 2020 -0400
Add test for UTF16LE string support
Issue #941
diff --git a/qt5/tests/check_utf_conversion.cpp b/qt5/tests/check_utf_conversion.cpp
index f28829f4..1f04c2a5 100644
--- a/qt5/tests/check_utf_conversion.cpp
+++ b/qt5/tests/check_utf_conversion.cpp
@@ -4,6 +4,7 @@
#include <poppler-private.h>
#include <cstring>
+#include <cstdint> // for uint16_t
#include "GlobalParams.h"
#include "UnicodeTypeTable.h"
@@ -18,6 +19,7 @@ private slots:
void testUTF_data();
void testUTF();
void testUnicodeToAscii7();
+ void testUnicodeLittleEndian();
};
static bool compare(const char *a, const char *b)
@@ -143,5 +145,35 @@ void TestUTFConversion::testUnicodeToAscii7()
free(out_ascii_idx);
}
+// Verifies that TextStringToUCS4() decodes the BOM-prefixed UTF-16LE and UTF-16BE
+// encodings of "HI!" to the same three code points.
+void TestUTFConversion::testUnicodeLittleEndian()
+{
+    // The encoded text is spelled out byte by byte: a uint16_t array would be stored in
+    // host byte order, so an "LE"/"BE" label on it would only be right on some hosts.
+    const char UTF16LE_hi[] = { '\xFF', '\xFE', 'H', '\0', 'I', '\0', '!', '\0' }; // UTF16-LE "HI!"
+    GooString GooUTF16LE(UTF16LE_hi, static_cast<int>(sizeof(UTF16LE_hi)));
+
+    const char UTF16BE_hi[] = { '\xFE', '\xFF', '\0', 'H', '\0', 'I', '\0', '!' }; // UTF16-BE "HI!"
+    GooString GooUTF16BE(UTF16BE_hi, static_cast<int>(sizeof(UTF16BE_hi)));
+
+    // Sanity check: the two encoded inputs really are different byte strings.
+    // QVERIFY/QCOMPARE are used throughout instead of Q_ASSERT, which is compiled out
+    // when QT_NO_DEBUG is defined and aborts the process instead of failing the test.
+    QVERIFY(GooUTF16LE.cmp(&GooUTF16BE) != 0);
+
+    // NOTE(review): the buffers returned through the out-parameters are never released
+    // in this test; confirm how TextStringToUCS4() allocates them and free accordingly.
+    Unicode *UCS4fromLE = nullptr;
+    Unicode *UCS4fromBE = nullptr;
+    const int len1 = TextStringToUCS4(&GooUTF16LE, &UCS4fromLE);
+    const int len2 = TextStringToUCS4(&GooUTF16BE, &UCS4fromBE);
+
+    // 3, because the leading Byte Order Mark (BOM) of each string is not part of the output.
+    // These checks also guard the pointer dereferences below in every build type.
+    QCOMPARE(len1, len2);
+    QCOMPARE(len1, 3);
+
+    // After conversion both results must hold identical code points
+    for (int i = 0; i < len1; i++) {
+        QCOMPARE(UCS4fromLE[i], UCS4fromBE[i]);
+    }
+
+    // Final verification: both decoded strings spell "HI!"
+    QVERIFY(*UCS4fromLE == *UCS4fromBE);
+    QVERIFY(compare(UCS4fromLE, "HI!", 3));
+    QVERIFY(compare(UCS4fromBE, "HI!", 3));
+}
+
QTEST_GUILESS_MAIN(TestUTFConversion)
#include "check_utf_conversion.moc"
commit 232cba307e8be35022426ba85f34198af7406899
Author: Nelson Benítez León <nbenitezl at gmail.com>
Date: Thu Jul 9 01:37:20 2020 -0400
Make TextStringToUCS4() support UTF16-LE too
UTF16-LE strings can 'de facto' appear in PDFs
(e.g. titles of Outline items) and Acrobat displays
them fine, so let's support that so we don't
show an ugly 'ÿþ' at the start of the text (Okular)
or even no text at all (Evince).
Issue #941
Evince issue:
https://gitlab.gnome.org/GNOME/evince/-/issues/1444
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 112986af..d231bde1 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -90,6 +90,7 @@ int TextStringToUCS4(const GooString *textStr, Unicode **ucs4)
int i, len;
const char *s;
Unicode *u;
+ bool isUnicode, isUnicodeLE;
len = textStr->getLength();
s = textStr->c_str();
@@ -99,12 +100,26 @@ int TextStringToUCS4(const GooString *textStr, Unicode **ucs4)
}
if (textStr->hasUnicodeMarker()) {
+ isUnicode = true;
+ isUnicodeLE = false;
+ } else if (textStr->hasUnicodeMarkerLE()) {
+ isUnicode = false;
+ isUnicodeLE = true;
+ } else {
+ isUnicode = false;
+ isUnicodeLE = false;
+ }
+
+ if (isUnicode || isUnicodeLE) {
Unicode *utf16;
len = len / 2 - 1;
if (len > 0) {
utf16 = new Unicode[len];
for (i = 0; i < len; i++) {
- utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
+ if (isUnicode) // big-endian: first byte of the pair is the high byte
+ utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
+ else // UnicodeLE: second byte of the pair is the high byte, so it is shifted left (">> 8" on a masked byte is always 0)
+ utf16[i] = (s[2 + i * 2] & 0xff) | (s[3 + i * 2] & 0xff) << 8;
}
len = UTF16toUCS4(utf16, len, &u);
delete[] utf16;
More information about the poppler
mailing list