[poppler] 4 commits - CMakeLists.txt poppler/Error.cc poppler/GlobalParams.cc poppler/PDFDoc.cc poppler/UnicodeMapFuncs.h poppler/UTF8.h poppler/UTF.cc poppler/UTF.h qt5/tests test/perf-test.cc utils/CMakeLists.txt utils/JSInfo.cc utils/pdfdetach.cc utils/pdffonts.cc utils/pdfimages.cc utils/pdfinfo.cc utils/pdfseparate.cc utils/pdfsig.cc utils/pdftocairo.cc utils/pdftocairo-win32.cc utils/pdftohtml.cc utils/pdftoppm.cc utils/pdftops.cc utils/pdftotext.cc utils/printencodings.cc utils/Win32Console.cc utils/Win32Console.h
Adrian Johnson
ajohnson at kemper.freedesktop.org
Wed Nov 15 08:28:30 UTC 2017
CMakeLists.txt | 2
poppler/Error.cc | 1
poppler/GlobalParams.cc | 6
poppler/PDFDoc.cc | 10 +
poppler/UTF.cc | 287 ++++++++++++++++++++++++++++++++++++-
poppler/UTF.h | 39 +++++
poppler/UnicodeMapFuncs.h | 6
qt5/tests/CMakeLists.txt | 1
qt5/tests/check_utf_conversion.cpp | 87 +++++++++++
test/perf-test.cc | 15 -
utils/CMakeLists.txt | 1
utils/JSInfo.cc | 1
utils/Win32Console.cc | 167 +++++++++++++++++++++
utils/Win32Console.h | 63 ++++++++
utils/pdfdetach.cc | 2
utils/pdffonts.cc | 2
utils/pdfimages.cc | 2
utils/pdfinfo.cc | 2
utils/pdfseparate.cc | 2
utils/pdfsig.cc | 2
utils/pdftocairo-win32.cc | 1
utils/pdftocairo.cc | 2
utils/pdftohtml.cc | 2
utils/pdftoppm.cc | 2
utils/pdftops.cc | 2
utils/pdftotext.cc | 2
utils/printencodings.cc | 9 +
27 files changed, 697 insertions(+), 21 deletions(-)
New commits:
commit a6dd3f957f5979fa34a05ba963862de7d0d9df61
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Sun Nov 12 10:33:07 2017 +1030
Support unicode on windows console
The Win32Console class should be used in programs that require Unicode
support for command line arguments and stdio output on Windows. On
Windows it gets the command line arguments from GetCommandLineW and
converts them to UTF-8, and redefines the stdio output functions to
convert UTF-8 output into calls to WriteConsoleW. On other platforms
this class is a no-op.
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 09ee0a21..147d1f45 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -81,6 +81,7 @@
#endif
#include "PDFDoc.h"
#include "Hints.h"
+#include "UTF.h"
#ifdef MULTITHREADED
# define pdfdocLocker() MutexLocker locker(&mutex)
@@ -152,7 +153,13 @@ PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword,
#endif
// try to open file
- file = GooFile::open(fileName);
+#ifdef _WIN32
+ wchar_t *wFileName = (wchar_t*)utf8ToUtf16(fileName->getCString());
+ file = GooFile::open(wFileName);
+ gfree(wFileName);
+#else
+ file = GooFile::open(fileName);
+#endif
if (file == NULL) {
// fopen() has failed.
// Keep a copy of the errno returned by fopen so that it can be
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index f7b02d14..90771943 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -1,6 +1,6 @@
//========================================================================
//
-// UTF.h
+// UTF.cc
//
// Copyright 2001-2003 Glyph & Cog, LLC
//
@@ -27,6 +27,7 @@
#include "goo/gmem.h"
#include "PDFDocEncoding.h"
#include "UTF.h"
+#include "UnicodeMapFuncs.h"
#include <algorithm>
bool UnicodeIsValid(Unicode ucs4)
@@ -130,3 +131,287 @@ bool UnicodeIsWhitespace(Unicode ucs4)
Unicode const *i = std::lower_bound(spaces, end, ucs4);
return (i != end && *i == ucs4);
}
+
+//
+// decodeUtf8() and decodeUtf8Table are:
+//
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern at hoehrmann.de>
+//
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+//
+// DFA state reported once a complete code point has been decoded; also
+// the initial state of the decoder.
+static const uint32_t UTF8_ACCEPT = 0;
+// DFA state reported when a byte is not allowed at its position.
+static const uint32_t UTF8_REJECT = 12;
+// Largest code point accepted by the conversion functions in this file.
+static const uint32_t UCS4_MAX = 0x10FFFF;
+// Code point substituted for input that cannot be decoded.
+static const Unicode REPLACEMENT_CHAR = 0xFFFD;
+
+// Lookup table used by decodeUtf8(): entries 0..255 are indexed by the
+// input byte and give its character class; entries from 256 onwards are
+// indexed by (256 + state + class) and give the next state.
+static const uint8_t decodeUtf8Table[] = {
+ // The first part of the table maps bytes to character classes
+ // to reduce the size of the transition table and create bitmasks.
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff
+
+ // The second part is a transition table that maps a combination
+ // of a state of the automaton and a character class to a state.
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+// Decode utf8 state machine for fast UTF-8 decoding. Initialise state
+// to 0 and call decodeUtf8() for each byte of UTF-8. Return value
+// (and state) is UTF8_ACCEPT when it has found a valid codepoint
+// (codepoint returned in codep), UTF8_REJECT when the byte is not
+// allowed to occur at its position, and some other positive value if
+// more bytes have to be read. Reset state to 0 to recover from
+// errors.
+//
+// state - in/out: current automaton state, updated on every call.
+// codep - in/out: code point accumulated so far; only meaningful once
+//         UTF8_ACCEPT is returned.
+// byte  - next input byte.
+inline uint32_t decodeUtf8(uint32_t* state, uint32_t* codep, char byte)
+{
+ // Go through unsigned char so bytes >= 0x80 index the table correctly
+ // even where plain char is signed.
+ uint32_t b = (unsigned char)byte;
+ uint32_t type = decodeUtf8Table[b];
+
+ // In the middle of a sequence: shift in the low six bits of this byte.
+ // At the start of a sequence: the character class doubles as a shift
+ // count that masks off the leading marker bits of the first byte.
+ *codep = (*state != UTF8_ACCEPT) ?
+ (b & 0x3fu) | (*codep << 6) :
+ (0xff >> type) & (b);
+
+ // The transition table follows the 256 character-class entries.
+ *state = decodeUtf8Table[256 + *state + type];
+ return *state;
+}
+
+// Returns how many UTF-16 code units are needed to hold the conversion
+// of the null terminated UTF-8 string utf8, not counting the terminating
+// null. Every byte the decoder rejects, and an incomplete sequence at the
+// end of the string, is counted as one code unit because the conversion
+// functions emit REPLACEMENT_CHAR in those places.
+int utf8CountUtf16CodeUnits(const char *utf8)
+{
+  uint32_t dfaState = 0;
+  uint32_t cp;
+  int units = 0;
+
+  for (const char *cursor = utf8; *cursor; ++cursor) {
+    const uint32_t result = decodeUtf8(&dfaState, &cp, *cursor);
+
+    if (result == UTF8_REJECT) {
+      // invalid byte: one REPLACEMENT_CHAR, then restart the decoder
+      ++units;
+      dfaState = 0;
+      continue;
+    }
+    if (result != UTF8_ACCEPT)
+      continue; // sequence still incomplete
+
+    // Supplementary plane code points need a surrogate pair; everything
+    // else (including out-of-range values replaced by REPLACEMENT_CHAR)
+    // takes a single code unit.
+    const bool needsSurrogatePair = (cp >= 0x10000 && cp <= UCS4_MAX);
+    units += needsSurrogatePair ? 2 : 1;
+  }
+
+  // a truncated sequence at the end is replaced with REPLACEMENT_CHAR
+  const bool finished = (dfaState == UTF8_ACCEPT || dfaState == UTF8_REJECT);
+  if (!finished)
+    ++units;
+
+  return units;
+}
+
+
+// Convert UTF-8 to UTF-16
+//  utf8 - UTF-8 string to convert. If not null terminated, set maxUtf8 to num
+//         bytes to convert
+//  utf16 - output buffer to write UTF-16 to. Output will always be null
+//          terminated, provided maxUtf16 >= 1.
+//  maxUtf16 - maximum size of output buffer including space for null.
+//  maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when
+//            either this count is reached or a null is encountered.
+// Invalid bytes and a truncated trailing sequence are each replaced with
+// REPLACEMENT_CHAR, matching what utf8CountUtf16CodeUnits() counts.
+// Returns number of UTF-16 code units written (excluding NULL).
+int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8)
+{
+  // Without room for even the terminator there is nothing we can safely
+  // write; the old code indexed utf16[maxUtf16 - 1] with a negative index.
+  if (maxUtf16 < 1)
+    return 0;
+
+  const int limit = maxUtf16 - 1; // last slot is reserved for the null
+  uint16_t *p = utf16;
+  uint32_t codepoint;
+  uint32_t state = 0;
+  int nIn = 0;
+  int nOut = 0;
+
+  while (*utf8 && nIn < maxUtf8 && nOut < limit) {
+    decodeUtf8(&state, &codepoint, *utf8);
+    if (state == UTF8_ACCEPT) {
+      if (codepoint < 0x10000) {
+        *p++ = (uint16_t)codepoint;
+        nOut++;
+      } else if (codepoint <= UCS4_MAX) {
+        // Never emit half of a surrogate pair: stop if both code units
+        // do not fit in front of the terminator.
+        if (limit - nOut < 2)
+          break;
+        *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10));
+        *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF));
+        nOut += 2;
+      } else {
+        *p++ = (uint16_t)REPLACEMENT_CHAR;
+        nOut++;
+      }
+    } else if (state == UTF8_REJECT) {
+      *p++ = (uint16_t)REPLACEMENT_CHAR; // invalid byte for this position
+      nOut++;
+      // The reject state is absorbing: without this reset every following
+      // byte would also be rejected, and the output would be longer than
+      // utf8CountUtf16CodeUnits() predicted.
+      state = 0;
+    }
+    utf8++;
+    nIn++;
+  }
+
+  // replace a trailing sequence that is too short to be valid UTF-8 with
+  // a replacement char
+  if (state != UTF8_ACCEPT && nOut < limit) {
+    *p++ = (uint16_t)REPLACEMENT_CHAR;
+    nOut++;
+  }
+
+  utf16[nOut] = 0;
+  return nOut;
+}
+
+// Allocate utf16 string and convert utf8 into it.
+//  utf8 - null terminated UTF-8 string to convert.
+//  len - if not null, receives the number of UTF-16 code units computed
+//        for the result (excluding the terminating null).
+// Returns a buffer obtained from gmallocn(); the caller releases it.
+uint16_t *utf8ToUtf16(const char *utf8, int *len)
+{
+  int n = utf8CountUtf16CodeUnits(utf8);
+  if (len)
+    *len = n;
+  uint16_t *utf16 = (uint16_t*)gmallocn(n + 1, sizeof(uint16_t));
+  // Pass the real buffer size instead of relying on the INT_MAX default,
+  // so a disagreement between counting and converting can only truncate
+  // the result and never write past the allocation.
+  utf8ToUtf16(utf8, utf16, n + 1);
+  return utf16;
+}
+
+static const uint32_t UTF16_ACCEPT = 0;
+static const uint32_t UTF16_REJECT = static_cast<uint32_t>(-1);
+
+// UTF-16 decoder step. Initialise state to 0 and call once per code unit.
+// Returns (and stores in *state) UTF16_ACCEPT when a valid code point
+// has been found (returned in *codePoint), UTF16_REJECT when the code
+// unit is invalid for this state, and some other value (the pending
+// high surrogate) if another code unit needs to be read. Reset state
+// to 0 to recover from errors.
+inline uint32_t decodeUtf16(uint32_t* state, uint32_t* codePoint, uint16_t codeUnit)
+{
+  const bool isHighSurrogate = (codeUnit >= 0xd800 && codeUnit < 0xdc00);
+  const bool isLowSurrogate = (codeUnit >= 0xdc00 && codeUnit < 0xe000);
+
+  if (*state == 0) {
+    if (isHighSurrogate) {
+      // first half of a surrogate pair: remember it and ask for more
+      *state = codeUnit;
+    } else if (isLowSurrogate) {
+      // low surrogate without a preceding high surrogate. The result is
+      // recorded in *state because callers inspect the state, not just
+      // the return value.
+      *state = UTF16_REJECT;
+    } else {
+      *codePoint = codeUnit;
+      *state = UTF16_ACCEPT;
+    }
+  } else if (isLowSurrogate) {
+    *codePoint = (((*state & 0x3ff) << 10) | (codeUnit & 0x3ff)) + 0x10000;
+    *state = UTF16_ACCEPT;
+  } else {
+    // high surrogate not followed by a low surrogate; previously the
+    // pending state was left in place and the decoder never recovered
+    *state = UTF16_REJECT;
+  }
+  return *state;
+}
+
+// Count number of UTF-8 bytes required to convert a null terminated
+// UTF-16 string to UTF-8 (excluding terminating NULL). Invalid code units
+// and an unpaired high surrogate at the end of the string are counted as
+// the three bytes of REPLACEMENT_CHAR, which is what utf16ToUtf8() writes
+// for them.
+int utf16CountUtf8Bytes(const uint16_t *utf16)
+{
+  uint32_t codepoint = 0;
+  uint32_t state = 0;
+  int count = 0;
+
+  while (*utf16) {
+    // Use the return value: it reports a reject even if the decoder
+    // leaves *state untouched in that case.
+    const uint32_t result = decodeUtf16(&state, &codepoint, *utf16);
+    if (result == UTF16_ACCEPT) {
+      if (codepoint < 0x80)
+        count++;
+      else if (codepoint < 0x800)
+        count += 2;
+      else if (codepoint < 0x10000)
+        count += 3;
+      else if (codepoint <= UCS4_MAX)
+        count += 4;
+      else
+        count += 3; // replace with REPLACEMENT_CHAR
+      state = 0;
+    } else if (result == UTF16_REJECT) {
+      count += 3; // replace with REPLACEMENT_CHAR
+      state = 0;
+    }
+    utf16++;
+  }
+  // A high surrogate still pending at the end of the string becomes a
+  // REPLACEMENT_CHAR, i.e. three UTF-8 bytes. (This used to test the
+  // UTF-8 decoder constants and add only one byte.)
+  if (state != UTF16_ACCEPT && state != UTF16_REJECT)
+    count += 3;
+
+  return count;
+}
+
+// Convert UTF-16 to UTF-8
+//  utf16 - UTF-16 string to convert. If not null terminated, set maxUtf16 to
+//          num code units to convert
+//  utf8 - output buffer to write UTF-8 to. Output will always be null
+//         terminated, provided maxUtf8 >= 1.
+//  maxUtf8 - maximum size of output buffer including space for null.
+//  maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops
+//             when either this count is reached or a null is encountered.
+// Invalid code units and an unpaired trailing high surrogate are replaced
+// with REPLACEMENT_CHAR.
+// Returns number of UTF-8 bytes written (excluding NULL).
+int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16)
+{
+  // Without room for even the terminator there is nothing we can safely
+  // write; the old code indexed utf8[maxUtf8 - 1] with a negative index.
+  if (maxUtf8 < 1)
+    return 0;
+
+  uint32_t codepoint = 0;
+  uint32_t state = 0;
+  int nIn = 0;
+  int nOut = 0;
+  char *p = utf8;
+  while (*utf16 && nIn < maxUtf16 && nOut < maxUtf8 - 1) {
+    // Use the return value: it reports a reject even if the decoder
+    // leaves *state untouched in that case.
+    const uint32_t result = decodeUtf16(&state, &codepoint, *utf16);
+    if (result == UTF16_ACCEPT || result == UTF16_REJECT) {
+      if (result == UTF16_REJECT || codepoint > UCS4_MAX) {
+        codepoint = REPLACEMENT_CHAR;
+      }
+      state = 0;
+
+      // Keep one byte back for the terminating null so that a multi-byte
+      // character is never cut in half by the terminator.
+      int bufSize = maxUtf8 - 1 - nOut;
+      int count = mapUTF8(codepoint, p, bufSize);
+      // NOTE(review): assumes mapUTF8() returns 0 when the character does
+      // not fit in bufSize - confirm against UnicodeMapFuncs.h. Stopping
+      // here avoids dropping one character and then continuing with
+      // shorter ones.
+      if (count == 0)
+        break;
+      p += count;
+      nOut += count;
+    }
+    utf16++;
+    nIn++;
+  }
+  // replace an unpaired high surrogate at the end of the input with a
+  // replacement char
+  if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) {
+    int bufSize = maxUtf8 - 1 - nOut;
+    int count = mapUTF8(REPLACEMENT_CHAR, p, bufSize);
+    p += count;
+    nOut += count; // the bytes written are the only thing to count here
+  }
+  if (nOut > maxUtf8 - 1)
+    nOut = maxUtf8 - 1;
+  utf8[nOut] = 0;
+  return nOut;
+}
+
+// Allocate utf8 string and convert utf16 into it.
+//  utf16 - null terminated UTF-16 string to convert.
+//  len - if not null, receives the number of UTF-8 bytes computed for the
+//        result (excluding the terminating null).
+// Returns a buffer obtained from gmalloc(); the caller releases it.
+char *utf16ToUtf8(const uint16_t *utf16, int *len)
+{
+  int n = utf16CountUtf8Bytes(utf16);
+  if (len)
+    *len = n;
+  char *utf8 = (char*)gmalloc(n + 1);
+  // Pass the real buffer size instead of relying on the INT_MAX default,
+  // so a disagreement between counting and converting can only truncate
+  // the result and never write past the allocation.
+  utf16ToUtf8(utf16, utf8, n + 1);
+  return utf8;
+}
diff --git a/poppler/UTF.h b/poppler/UTF.h
index c82e165a..bddb926d 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -16,6 +16,9 @@
#pragma implementation
#endif
+#include <cstdint>
+#include <climits>
+
#include "goo/GooString.h"
#include "CharTypes.h"
@@ -39,4 +42,40 @@ bool UnicodeIsValid(Unicode ucs4);
// is a unicode whitespace character
bool UnicodeIsWhitespace(Unicode ucs4);
+// Count number of UTF-16 code units required to convert a UTF-8 string
+// (excluding terminating NULL). Each invalid byte is counted as a
+// code point since the UTF-8 conversion functions will replace it with
+// REPLACEMENT_CHAR.
+int utf8CountUtf16CodeUnits(const char *utf8);
+
+// Convert UTF-8 to UTF-16
+// utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num
+// bytes to convert
+// utf16 - output buffer to write UTF-16 to. Output will always be null terminated.
+// maxUtf16 - maximum size of output buffer including space for null.
+// maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when
+// either this count is reached or a null is encountered.
+// Returns number of UTF-16 code units written (excluding NULL).
+int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16 = INT_MAX, int maxUtf8 = INT_MAX);
+
+// Allocate utf16 string and convert utf8 into it.
+uint16_t *utf8ToUtf16(const char *utf8, int *len = nullptr);
+
+// Count number of UTF-8 bytes required to convert a UTF-16 string to
+// UTF-8 (excluding terminating NULL).
+int utf16CountUtf8Bytes(const uint16_t *utf16);
+
+// Convert UTF-16 to UTF-8
+// utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num
+// code units to convert
+// utf8 - output buffer to write UTF-8 to. Output will always be null terminated.
+// maxUtf8 - maximum size of output buffer including space for null.
+// maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when
+// either this count is reached or a null is encountered.
+// Returns number of UTF-8 bytes written (excluding NULL).
+int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8 = INT_MAX, int maxUtf16 = INT_MAX);
+
+// Allocate utf8 string and convert utf16 into it.
+char *utf16ToUtf8(const uint16_t *utf16, int *len = nullptr);
+
#endif
diff --git a/qt5/tests/CMakeLists.txt b/qt5/tests/CMakeLists.txt
index 01a1a970..1e67e6ec 100644
--- a/qt5/tests/CMakeLists.txt
+++ b/qt5/tests/CMakeLists.txt
@@ -69,6 +69,7 @@ qt5_add_qtest(check_qt5_actualtext check_actualtext.cpp)
qt5_add_qtest(check_qt5_lexer check_lexer.cpp)
qt5_add_qtest(check_qt5_pagelabelinfo check_pagelabelinfo.cpp)
qt5_add_qtest(check_qt5_goostring check_goostring.cpp)
+qt5_add_qtest(check_qt5_utf_conversion check_utf_conversion.cpp)
if (NOT WIN32)
qt5_add_qtest(check_qt5_strings check_strings.cpp)
endif ()
diff --git a/qt5/tests/check_utf_conversion.cpp b/qt5/tests/check_utf_conversion.cpp
new file mode 100644
index 00000000..b8eb03a5
--- /dev/null
+++ b/qt5/tests/check_utf_conversion.cpp
@@ -0,0 +1,87 @@
+#include <QtCore/QScopedPointer>
+#include <QtTest/QtTest>
+
+#include <cstring>
+#include "UTF.h"
+
+// Qt test case that round-trips strings through the UTF-8 <-> UTF-16
+// conversion functions declared in UTF.h.
+class TestUTFConversion : public QObject
+{
+ Q_OBJECT
+private slots:
+ // Supplies the rows (one QString column "s") consumed by testUTF().
+ void testUTF_data();
+ // Runs the conversion checks on the row fetched from testUTF_data().
+ void testUTF();
+};
+
+// True when the two null terminated byte strings have identical contents.
+static bool compare(const char *a, const char *b)
+{
+  const int difference = strcmp(a, b);
+  return difference == 0;
+}
+
+// True when the two null terminated UTF-16 strings have identical
+// contents (same code units and same length).
+static bool compare(const uint16_t *a, const uint16_t *b)
+{
+  for (;; ++a, ++b) {
+    if (*a != *b)
+      return false; // differing unit, or one string ended first
+    if (*a == 0)
+      return true;  // both ended at the same position
+  }
+}
+
+// Test data for testUTF(): each row is a single QString built from a
+// UTF-8 literal. The rows cover the empty string, ASCII, several
+// non-Latin scripts and a string of characters outside the BMP.
+void TestUTFConversion::testUTF_data()
+{
+ QTest::addColumn<QString>("s");
+
+ QTest::newRow("<empty>") << QString::fromUtf8("");
+ QTest::newRow("a") << QString::fromUtf8("a");
+ QTest::newRow("abc") << QString::fromUtf8("abc");
+ QTest::newRow("Latin") << QString::fromUtf8("Vitrum edere possum; mihi non nocet");
+ QTest::newRow("Greek") << QString::fromUtf8("Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα");
+ QTest::newRow("Icelandic") << QString::fromUtf8("Ég get etið gler án þess að meiða mig");
+ QTest::newRow("Russian") << QString::fromUtf8("Я могу есть стекло, оно мне не вредит.");
+ QTest::newRow("Sanskrit") << QString::fromUtf8("काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥");
+ QTest::newRow("Arabic") << QString::fromUtf8("أنا قادر على أكل الزجاج و هذا لا يؤلمني");
+ QTest::newRow("Chinese") << QString::fromUtf8("我能吞下玻璃而不伤身体。");
+ QTest::newRow("Thai") << QString::fromUtf8("ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ");
+ // these characters each need a surrogate pair in UTF-16
+ QTest::newRow("non BMP") << QString::fromUtf8("𝓹𝓸𝓹𝓹𝓵𝓮𝓻");
+ }
+
+// Converts the current data row UTF-8 -> UTF-16 and back again, checking
+// the counting functions, the fixed-buffer conversions and the
+// allocating conversions against Qt's own encodings of the same string.
+void TestUTFConversion::testUTF()
+{
+  // Several rows (e.g. Sanskrit, Thai) are longer than 100 UTF-8 bytes,
+  // so the buffers are sized generously and their size is passed to the
+  // conversion functions so that they can never be overrun.
+  static const int bufSize = 1000;
+  char utf8Buf[bufSize];
+  char *utf8String;
+  uint16_t utf16Buf[bufSize];
+  uint16_t *utf16String;
+  int len;
+
+  QFETCH(QString, s);
+  char *str = strdup(s.toUtf8().constData());
+  QVERIFY( str != nullptr );
+
+  // UTF-8 to UTF-16
+
+  // QString size() returns number of code units, not code points
+  QCOMPARE( utf8CountUtf16CodeUnits(str), s.size() );
+
+  len = utf8ToUtf16(str, utf16Buf, bufSize);
+  QVERIFY( compare(utf16Buf, s.utf16()) );
+  QCOMPARE( len, s.size() );
+
+  utf16String = utf8ToUtf16(str);
+  QVERIFY( compare(utf16String, s.utf16()) );
+  free (utf16String);
+
+  // UTF-16 to UTF-8
+
+  QCOMPARE( utf16CountUtf8Bytes(s.utf16()), (int)strlen(str) );
+
+  len = utf16ToUtf8(s.utf16(), utf8Buf, bufSize);
+  QVERIFY( compare(utf8Buf, str) );
+  QCOMPARE( len, (int)strlen(str) );
+
+  utf8String = utf16ToUtf8(s.utf16() );
+  QVERIFY( compare(utf8String, str) );
+  free (utf8String);
+
+  free(str);
+}
+
+QTEST_GUILESS_MAIN(TestUTFConversion)
+#include "check_utf_conversion.moc"
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
index 422de4c8..34d96475 100644
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@@ -1,6 +1,7 @@
set(common_srcs
parseargs.cc
+ Win32Console.cc
)
set(common_libs
poppler
diff --git a/utils/JSInfo.cc b/utils/JSInfo.cc
index f132bb5b..34f31d58 100644
--- a/utils/JSInfo.cc
+++ b/utils/JSInfo.cc
@@ -24,6 +24,7 @@
#include "Form.h"
#include "UnicodeMap.h"
#include "UTF.h"
+#include "Win32Console.h"
JSInfo::JSInfo(PDFDoc *docA, int firstPage) {
doc = docA;
diff --git a/utils/Win32Console.cc b/utils/Win32Console.cc
new file mode 100644
index 00000000..4db92de6
--- /dev/null
+++ b/utils/Win32Console.cc
@@ -0,0 +1,167 @@
+//========================================================================
+//
+// Win32Console.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2017 Adrian Johnson <ajohnson at redneon.com>
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
+#ifdef _WIN32
+
+#include "goo/gmem.h"
+#include "UTF.h"
+
+#define WIN32_CONSOLE_IMPL
+#include "Win32Console.h"
+
+#include <windows.h>
+#include <shellapi.h>
+
+// Capacity, in elements, of both buf and wbuf.
+static const int BUF_SIZE = 4096;
+// Number of bytes of pending output currently held in buf.
+static int bufLen = 0;
+// Pending UTF-8 output collected by win32_fprintf()/win32_fwrite() until
+// flush() writes it to the console.
+static char buf[BUF_SIZE];
+// Scratch buffer receiving the UTF-16 conversion of buf in flush().
+static wchar_t wbuf[BUF_SIZE];
+// Whether stdout / stderr are attached to the console; both are
+// re-evaluated by the Win32Console constructor.
+static bool stdoutIsConsole = true;
+static bool stderrIsConsole = true;
+// Handle passed to WriteConsoleW(); set by the Win32Console constructor
+// when stdout or stderr is a console.
+static HANDLE consoleHandle = 0;
+
+// Writes buffered output to the console.
+//  all = true:  everything in the buffer is written.
+//  all = false: only the part up to and including the last newline is
+//               written, unless the buffer is more than half full, in
+//               which case everything is written so that later writes
+//               have room.
+// Whatever is not written is moved to the front of the buffer.
+static void flush(bool all = false)
+{
+  int pending = bufLen; // number of bytes of buf to write now
+
+  if (!all && bufLen <= BUF_SIZE/2) {
+    // back up to just after the last '\n'; ends at 0 if there is none
+    while (pending > 0 && buf[pending-1] != '\n')
+      --pending;
+  }
+
+  if (pending <= 0)
+    return;
+
+  DWORD wlen = utf8ToUtf16(buf, (uint16_t*)wbuf, BUF_SIZE, pending);
+  WriteConsoleW(consoleHandle, wbuf, wlen, &wlen, nullptr);
+
+  const int leftover = bufLen - pending;
+  if (leftover > 0)
+    memmove(buf, buf + pending, leftover);
+  bufLen = (leftover > 0) ? leftover : 0;
+}
+
+// True when stream is stdout or stderr and that stream is attached to
+// the console (as opposed to being redirected).
+static inline bool streamIsConsole(FILE *stream)
+{
+  if (stream == stdout)
+    return stdoutIsConsole;
+  if (stream == stderr)
+    return stderrIsConsole;
+  return false;
+}
+
+// fprintf() replacement used through the macros in Win32Console.h.
+//  stream - destination stream.
+//  ...    - the printf style format string followed by its arguments
+//           (the format is fetched with va_arg because the macros pass
+//           it as part of __VA_ARGS__).
+// Output for a console stream is appended to the UTF-8 buffer and
+// flushed via flush(); output for any other stream goes straight to
+// vfprintf().
+// Returns the value reported by vsnprintf()/vfprintf().
+int win32_fprintf(FILE *stream, ...)
+{
+  va_list args;
+  int ret = 0;
+
+  va_start(args, stream);
+  const char *format = va_arg(args, const char *);
+  if (streamIsConsole(stream)) {
+    // Space left in buf, including the byte for the terminating null.
+    // Must be computed before bufLen is updated.
+    const int avail = BUF_SIZE - bufLen;
+    ret = vsnprintf(buf + bufLen, avail, format, args);
+    if (ret < 0) {
+      // formatting failed: discard whatever was written rather than
+      // moving bufLen backwards
+      buf[bufLen] = 0;
+    } else if (ret >= avail) {
+      // output was truncated
+      buf[BUF_SIZE - 1] = 0;
+      bufLen = BUF_SIZE - 1;
+    } else {
+      bufLen += ret;
+    }
+    flush();
+  } else {
+    ret = vfprintf(stream, format, args);
+  }
+  va_end(args);
+
+  return ret;
+}
+
+// fwrite() replacement used through the macro in Win32Console.h.
+// Data for a console stream is treated as UTF-8 text: it is appended to
+// the output buffer in chunks and flushed via flush(). Data for any other
+// stream is passed to fwrite() unchanged.
+// Returns the number of items written, like fwrite().
+// Note: a chunk boundary may fall inside a multi-byte UTF-8 sequence when
+// a single call supplies more data than the buffer can hold.
+size_t win32_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+  if (!streamIsConsole(stream))
+    return fwrite(ptr, size, nmemb, stream);
+
+  const char *src = (const char *)ptr;
+  size_t remaining = size * nmemb;
+
+  while (remaining > 0) {
+    // one byte is always kept for the terminating null
+    size_t space = (size_t)(BUF_SIZE - 1 - bufLen);
+    if (space == 0) {
+      flush(true);
+      continue;
+    }
+    size_t chunk = (remaining < space) ? remaining : space;
+    memcpy(buf + bufLen, src, chunk);
+    bufLen += (int)chunk;
+    buf[bufLen] = 0;
+    src += chunk;
+    remaining -= chunk;
+    // When the chunk filled the buffer it is now more than half full, so
+    // this writes everything and makes room for the next chunk.
+    flush();
+  }
+
+  // Everything was accepted; previously 0 was returned here, which
+  // callers would interpret as a failed write.
+  return (size == 0) ? 0 : nmemb;
+}
+
+
+// Replaces *argc / *argv with a UTF-8 copy of the process command line
+// (taken from GetCommandLineW) and works out whether stdout and stderr
+// are attached to the console. If the command line cannot be split,
+// *argc and *argv are left untouched.
+Win32Console::Win32Console(int *argc, char **argv[])
+{
+  argList = nullptr;
+  privateArgList = nullptr;
+
+  LPWSTR *wideArgs = CommandLineToArgvW(GetCommandLineW(), &numArgs);
+  if (wideArgs != nullptr) {
+    argList = new char*[numArgs];
+    privateArgList = new char*[numArgs];
+    int idx = 0;
+    while (idx < numArgs) {
+      char *converted = utf16ToUtf8((uint16_t*)(wideArgs[idx]));
+      argList[idx] = converted;
+      // parseArgs will rearrange the argv list, so a second list in the
+      // original order is kept for freeing the strings later
+      privateArgList[idx] = converted;
+      ++idx;
+    }
+    LocalFree(wideArgs);
+    *argc = numArgs;
+    *argv = argList;
+  }
+
+  // start with empty output buffers
+  bufLen = 0;
+  buf[0] = 0;
+  wbuf[0] = 0;
+
+  // check if stdout or stderr redirected
+  // GetFileType() returns CHAR for console and special devices COMx, PRN, CON, NUL etc
+  // fgetpos() succeeds on all CHAR devices except console and CON.
+  fpos_t ignoredPos;
+
+  const bool stdoutIsCharDevice =
+      GetFileType(GetStdHandle(STD_OUTPUT_HANDLE)) == FILE_TYPE_CHAR;
+  stdoutIsConsole = stdoutIsCharDevice && (fgetpos(stdout, &ignoredPos) != 0);
+
+  const bool stderrIsCharDevice =
+      GetFileType(GetStdHandle(STD_ERROR_HANDLE)) == FILE_TYPE_CHAR;
+  stderrIsConsole = stderrIsCharDevice && (fgetpos(stderr, &ignoredPos) != 0);
+
+  // Need a handle to the console. Doesn't matter if we use stdout or stderr as
+  // long as the handle output is to the console.
+  if (stdoutIsConsole) {
+    consoleHandle = GetStdHandle(STD_OUTPUT_HANDLE);
+  } else if (stderrIsConsole) {
+    consoleHandle = GetStdHandle(STD_ERROR_HANDLE);
+  }
+}
+
+// Writes out any buffered console output and releases the UTF-8 argument
+// strings created by the constructor.
+Win32Console::~Win32Console()
+{
+  flush(true);
+
+  if (!argList)
+    return;
+
+  // free through the private list: argList may have been rearranged
+  int remaining = numArgs;
+  while (remaining-- > 0)
+    gfree(privateArgList[remaining]);
+
+  delete[] argList;
+  delete[] privateArgList;
+}
+
+#endif // _WIN32
diff --git a/utils/Win32Console.h b/utils/Win32Console.h
new file mode 100644
index 00000000..46381000
--- /dev/null
+++ b/utils/Win32Console.h
@@ -0,0 +1,63 @@
+//========================================================================
+//
+// Win32Console.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2017 Adrian Johnson <ajohnson at redneon.com>
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
+// UTF-8 Support for win32 console
+//
+// Converts argc/argv to UTF-8. Supports UTF-8 stdout/stderr to win32 console.
+// On other platforms this class is a no-op.
+
+#ifdef _WIN32
+
+// Ensure stdio.h is included before redefining stdio functions. We need to provide
+// our own declarations for the redefined functions because win32 stdio.h functions
+// have DLL export decorations.
+#include <stdio.h>
+
+// Route the stdio output functions through win32_fprintf()/win32_fwrite().
+// Each macro takes the same arguments as the function it replaces.
+#ifndef WIN32_CONSOLE_IMPL // don't redefine in Win32Console.cc so we can call original functions
+#define printf(...) win32_fprintf(stdout, __VA_ARGS__)
+#define fprintf(stream, ...) win32_fprintf(stream, __VA_ARGS__)
+#define puts(s) win32_fprintf(stdout, "%s\n", s)
+#define fputs(s, stream) win32_fprintf(stream, "%s", s)
+// putc() takes (c, stream) like fputc(); only putchar() is stdout-only
+#define putc(c, stream) win32_fprintf(stream, "%c", c)
+#define putchar(c) win32_fprintf(stdout, "%c", c)
+#define fputc(c, stream) win32_fprintf(stream, "%c", c)
+#define fwrite(ptr, size, nmemb, stream) win32_fwrite(ptr, size, nmemb, stream)
+#endif
+
+extern "C" {
+ int win32_fprintf(FILE *stream, ...);
+ size_t win32_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
+}
+
+// Create one instance at the start of main(), passing the addresses of
+// argc and argv. The constructor swaps in UTF-8 arguments; the destructor
+// flushes pending console output and frees the argument strings.
+class Win32Console
+{
+public:
+  Win32Console(int *argc, char **argv[]);
+  ~Win32Console();
+
+  // The object owns the argument arrays and frees them in its
+  // destructor, so a copy would free them a second time.
+  Win32Console(const Win32Console &) = delete;
+  Win32Console &operator=(const Win32Console &) = delete;
+private:
+  int numArgs;           // number of entries in the two lists below
+  char **argList;        // list handed to the caller through *argv
+  char **privateArgList; // same strings in original order, used for freeing
+};
+
+#else
+
+// On other platforms this class is a no-op.
+
+// Stand-in used on platforms other than Windows: accepts the same
+// constructor arguments and leaves them untouched.
+class Win32Console
+{
+public:
+  Win32Console(int *, char ***) { }
+};
+
+#endif // _WIN32
diff --git a/utils/pdfdetach.cc b/utils/pdfdetach.cc
index 5bbdc1e1..a39f817e 100644
--- a/utils/pdfdetach.cc
+++ b/utils/pdfdetach.cc
@@ -40,6 +40,7 @@
#include "UnicodeMap.h"
#include "PDFDocEncoding.h"
#include "Error.h"
+#include "Win32Console.h"
static GBool doList = gFalse;
static int saveNum = 0;
@@ -99,6 +100,7 @@ int main(int argc, char *argv[]) {
Unicode u;
GBool isUnicode;
+ Win32Console win32Console(&argc, &argv);
exitCode = 99;
// parse args
diff --git a/utils/pdffonts.cc b/utils/pdffonts.cc
index 535bf8fb..2867d51f 100644
--- a/utils/pdffonts.cc
+++ b/utils/pdffonts.cc
@@ -39,6 +39,7 @@
#include "PDFDoc.h"
#include "PDFDocFactory.h"
#include "FontInfo.h"
+#include "Win32Console.h"
static const char *fontTypeNames[] = {
"unknown",
@@ -94,6 +95,7 @@ int main(int argc, char *argv[]) {
GBool ok;
int exitCode;
+ Win32Console win32Console(&argc, &argv);
exitCode = 99;
// parse args
diff --git a/utils/pdfimages.cc b/utils/pdfimages.cc
index d11b0147..525a80ba 100644
--- a/utils/pdfimages.cc
+++ b/utils/pdfimages.cc
@@ -47,6 +47,7 @@
#include "PDFDocFactory.h"
#include "ImageOutputDev.h"
#include "Error.h"
+#include "Win32Console.h"
static int firstPage = 1;
static int lastPage = 0;
@@ -120,6 +121,7 @@ int main(int argc, char *argv[]) {
GBool ok;
int exitCode;
+ Win32Console win32Console(&argc, &argv);
exitCode = 99;
// parse args
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index b29e97a0..a3099cf6 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -61,6 +61,7 @@
#include "JSInfo.h"
#include "StructTreeRoot.h"
#include "StructElement.h"
+#include "Win32Console.h"
static int firstPage = 1;
@@ -616,6 +617,7 @@ int main(int argc, char *argv[]) {
exitCode = 99;
// parse args
+ Win32Console win32console(&argc, &argv);
ok = parseArgs(argDesc, &argc, argv);
if (!ok || (argc != 2 && !printEnc) || printVersion || printHelp) {
fprintf(stderr, "pdfinfo version %s\n", PACKAGE_VERSION);
diff --git a/utils/pdfseparate.cc b/utils/pdfseparate.cc
index 71e35b00..14fcf3f6 100644
--- a/utils/pdfseparate.cc
+++ b/utils/pdfseparate.cc
@@ -23,6 +23,7 @@
#include "PDFDoc.h"
#include "ErrorCodes.h"
#include "GlobalParams.h"
+#include "Win32Console.h"
#include <ctype.h>
static int firstPage = 0;
@@ -150,6 +151,7 @@ main (int argc, char *argv[])
exitCode = 99;
// parse args
+ Win32Console win32console(&argc, &argv);
ok = parseArgs (argDesc, &argc, argv);
if (!ok || argc != 3 || printVersion || printHelp)
{
diff --git a/utils/pdfsig.cc b/utils/pdfsig.cc
index eb8acd79..e31048f4 100644
--- a/utils/pdfsig.cc
+++ b/utils/pdfsig.cc
@@ -29,6 +29,7 @@
#include "Error.h"
#include "GlobalParams.h"
#include "SignatureInfo.h"
+#include "Win32Console.h"
static const char * getReadableSigState(SignatureValidationStatus sig_vs)
{
@@ -116,6 +117,7 @@ int main(int argc, char *argv[])
std::vector<FormWidgetSignature*> sig_widgets;
globalParams = new GlobalParams();
+ Win32Console win32Console(&argc, &argv);
int exitCode = 99;
GBool ok;
diff --git a/utils/pdftocairo-win32.cc b/utils/pdftocairo-win32.cc
index 9e52b6c8..a90b3343 100644
--- a/utils/pdftocairo-win32.cc
+++ b/utils/pdftocairo-win32.cc
@@ -18,6 +18,7 @@
#include "parseargs.h"
#include "pdftocairo-win32.h"
+#include "Win32Console.h"
#include <dlgs.h>
#include <commctrl.h>
diff --git a/utils/pdftocairo.cc b/utils/pdftocairo.cc
index 46f3e1e2..7a5ef981 100644
--- a/utils/pdftocairo.cc
+++ b/utils/pdftocairo.cc
@@ -55,6 +55,7 @@
#include "PDFDoc.h"
#include "PDFDocFactory.h"
#include "CairoOutputDev.h"
+#include "Win32Console.h"
#ifdef USE_CMS
#ifdef USE_LCMS1
#include <lcms.h>
@@ -943,6 +944,7 @@ int main(int argc, char *argv[]) {
int num_outputs;
// parse args
+ Win32Console win32Console(&argc, &argv);
if (!parseArgs(argDesc, &argc, argv)) {
printUsage("pdftocairo", 0, argDesc);
exit(99);
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index 8e59b8b8..05a5b7e5 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -63,6 +63,7 @@
#include "Error.h"
#include "DateInfo.h"
#include "goo/gfile.h"
+#include "Win32Console.h"
static int firstPage = 1;
static int lastPage = 0;
@@ -190,6 +191,7 @@ int main(int argc, char *argv[]) {
Object info;
int exit_status = EXIT_FAILURE;
+ Win32Console win32Console(&argc, &argv);
// parse args
ok = parseArgs(argDesc, &argc, argv);
if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
diff --git a/utils/pdftoppm.cc b/utils/pdftoppm.cc
index 5cd9f53c..5677a1dd 100644
--- a/utils/pdftoppm.cc
+++ b/utils/pdftoppm.cc
@@ -51,6 +51,7 @@
#include "splash/SplashBitmap.h"
#include "splash/Splash.h"
#include "SplashOutputDev.h"
+#include "Win32Console.h"
// Uncomment to build pdftoppm with pthreads
// You may also have to change the buildsystem to
@@ -394,6 +395,7 @@ int main(int argc, char *argv[]) {
int pg, pg_num_len;
double pg_w, pg_h, tmp;
+ Win32Console win32Console(&argc, &argv);
exitCode = 99;
// parse args
diff --git a/utils/pdftops.cc b/utils/pdftops.cc
index e415fcae..8dd85ba1 100644
--- a/utils/pdftops.cc
+++ b/utils/pdftops.cc
@@ -51,6 +51,7 @@
#include "PDFDocFactory.h"
#include "PSOutputDev.h"
#include "Error.h"
+#include "Win32Console.h"
static GBool setPSPaperSize(char *size, int &psPaperWidth, int &psPaperHeight) {
if (!strcmp(size, "match")) {
@@ -218,6 +219,7 @@ int main(int argc, char *argv[]) {
GBool rasterAntialias = gFalse;
std::vector<int> pages;
+ Win32Console win32Console(&argc, &argv);
exitCode = 99;
// parse args
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index ebf9a2b2..5c3eaaa5 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -59,6 +59,7 @@
#include <string>
#include <sstream>
#include <iomanip>
+#include "Win32Console.h"
static void printInfoString(FILE *f, Dict *infoDict, const char *key,
const char *text1, const char *text2, UnicodeMap *uMap);
@@ -180,6 +181,7 @@ int main(int argc, char *argv[]) {
char *p;
int exitCode;
+ Win32Console win32Console(&argc, &argv);
exitCode = 99;
// parse args
commit 49107ffcd4d3c9b18fc950d37bede08f89bcfcda
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Sun Nov 12 10:33:07 2017 +1030
Fix some mingw warnings
- Include poppler-config.h for mingw PRINTF_FORMAT
- Only redefine strcasecmp for MSVC
- Recent versions of MSVC have snprintf and vsnprintf
diff --git a/poppler/Error.cc b/poppler/Error.cc
index 99ca3ad3..71bd3ce9 100644
--- a/poppler/Error.cc
+++ b/poppler/Error.cc
@@ -25,6 +25,7 @@
//========================================================================
#include <config.h>
+#include <poppler-config.h>
#ifdef USE_GCC_PRAGMAS
#pragma implementation
diff --git a/poppler/GlobalParams.cc b/poppler/GlobalParams.cc
index 09ad37a2..8069b3e2 100644
--- a/poppler/GlobalParams.cc
+++ b/poppler/GlobalParams.cc
@@ -82,7 +82,7 @@
#include <fontconfig/fontconfig.h>
#endif
-#ifdef _WIN32
+#ifdef _MSC_VER
# define strcasecmp stricmp
#else
# include <strings.h>
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index a2468a5f..09ee0a21 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -42,6 +42,7 @@
//========================================================================
#include <config.h>
+#include <poppler-config.h>
#ifdef USE_GCC_PRAGMAS
#pragma implementation
diff --git a/test/perf-test.cc b/test/perf-test.cc
index bd455038..78083275 100644
--- a/test/perf-test.cc
+++ b/test/perf-test.cc
@@ -735,15 +735,6 @@ static void StrList_Destroy(StrList **root)
*root = NULL;
}
-#ifndef _WIN32
-/*static void OutputDebugString(const char *txt)
-{
- // do nothing
-}*/
-#define _snprintf snprintf
-#define _vsnprintf vsnprintf
-#endif
-
static void my_error(void *, ErrorCategory, Goffset pos, char *msg) {
#if 0
char buf[4096], *p = buf;
@@ -762,7 +753,7 @@ static void my_error(void *, ErrorCategory, Goffset pos, char *msg) {
}
p = buf;
- p += _vsnprintf(p, sizeof(buf) - 1, msg, args);
+ p += vsnprintf(p, sizeof(buf) - 1, msg, args);
while ( p > buf && isspace(p[-1]) )
*--p = '\0';
*p++ = '\r';
@@ -785,7 +776,7 @@ static void my_error(void *, ErrorCategory, Goffset pos, char *msg) {
#if 0
p = buf;
va_start(args, msg);
- p += _vsnprintf(p, sizeof(buf) - 3, msg, args);
+ p += vsnprintf(p, sizeof(buf) - 3, msg, args);
while ( p > buf && isspace(p[-1]) )
*--p = '\0';
*p++ = '\r';
@@ -807,7 +798,7 @@ static void LogInfo(const char *fmt, ...)
p = buf;
va_start(args, fmt);
- p += _vsnprintf(p, sizeof(buf) - 1, fmt, args);
+ p += vsnprintf(p, sizeof(buf) - 1, fmt, args);
*p = '\0';
fprintf(gOutFile, "%s", buf);
va_end(args);
commit 4f687665c39da743e802fc71ba05fb5966095293
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Sun Nov 12 10:33:07 2017 +1030
Sort encoding list
This makes it easier to find encodings listed by -listenc.
diff --git a/utils/printencodings.cc b/utils/printencodings.cc
index dec6f98f..2c4b9f27 100644
--- a/utils/printencodings.cc
+++ b/utils/printencodings.cc
@@ -22,9 +22,18 @@
#include "goo/GooList.h"
#include "goo/GooString.h"
+
+static int cmpGooString(const void *ptr1, const void *ptr2)
+{
+ GooString *s1 = *((GooString **)ptr1);
+ GooString *s2 = *((GooString **)ptr2);
+ return s1->cmp(s2);
+}
+
void printEncodings()
{
GooList *encNames = globalParams->getEncodingNames();
+ encNames->sort(cmpGooString);
printf("Available encodings are:\n");
for (int i = 0; i < encNames->getLength(); ++i) {
GooString *enc = (GooString*)encNames->get(i);
commit 5c394f71f03d27507db3446ad34f299393fa3621
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Sun Nov 12 10:33:07 2017 +1030
Move UTF8.h to UnicodeMapFuncs.h and rename UCS2 to UTF16
UTF8.h is not exclusively UTF-8 code. Renaming to UnicodeMapFuncs.h
identifies the file as containing maps for UnicodeMap and is
consistent with the name UnicodeMapTables.h.
The mapUCS2 code was changed to support UTF-16 in 979ef1ca without
changing the name.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f181fdec..3a34168f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -610,6 +610,7 @@ if(ENABLE_XPDF_HEADERS)
poppler/StructElement.h
poppler/StructTreeRoot.h
poppler/UnicodeMap.h
+ poppler/UnicodeMapFuncs.h
poppler/UnicodeMapTables.h
poppler/UnicodeTypeTable.h
poppler/UnicodeCClassTables.h
@@ -627,7 +628,6 @@ if(ENABLE_XPDF_HEADERS)
poppler/StdinCachedFile.h
poppler/StdinPDFDocBuilder.h
poppler/UTF.h
- poppler/UTF8.h
poppler/XpdfPluginAPI.h
poppler/Sound.h
${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h
diff --git a/poppler/GlobalParams.cc b/poppler/GlobalParams.cc
index 26669930..09ad37a2 100644
--- a/poppler/GlobalParams.cc
+++ b/poppler/GlobalParams.cc
@@ -110,7 +110,7 @@
#include "NameToUnicodeTable.h"
#include "UnicodeMapTables.h"
-#include "UTF8.h"
+#include "UnicodeMapFuncs.h"
#ifdef ENABLE_PLUGINS
# ifdef _WIN32
@@ -629,7 +629,7 @@ GlobalParams::GlobalParams(const char *customPopplerDataDir)
residentUnicodeMaps->add(map->getEncodingName(), map);
map = new UnicodeMap("UTF-8", gTrue, &mapUTF8);
residentUnicodeMaps->add(map->getEncodingName(), map);
- map = new UnicodeMap("UCS-2", gTrue, &mapUCS2);
+ map = new UnicodeMap("UTF-16", gTrue, &mapUTF16);
residentUnicodeMaps->add(map->getEncodingName(), map);
scanEncodingDirs();
diff --git a/poppler/UTF8.h b/poppler/UnicodeMapFuncs.h
similarity index 95%
rename from poppler/UTF8.h
rename to poppler/UnicodeMapFuncs.h
index 34a07d40..25c21c4e 100644
--- a/poppler/UTF8.h
+++ b/poppler/UnicodeMapFuncs.h
@@ -1,6 +1,6 @@
//========================================================================
//
-// UTF8.h
+// UnicodeMapFuncs.h
//
// Copyright 2001-2003 Glyph & Cog, LLC
//
@@ -21,7 +21,7 @@
//========================================================================
static int mapUTF8(Unicode u, char *buf, int bufSize) {
- if (u <= 0x0000007f) {
+ if (u <= 0x0000007f) {
if (bufSize < 1) {
return 0;
}
@@ -56,7 +56,7 @@ static int mapUTF8(Unicode u, char *buf, int bufSize) {
}
}
-static int mapUCS2(Unicode u, char *buf, int bufSize) {
+static int mapUTF16(Unicode u, char *buf, int bufSize) {
if (u <= 0xffff) {
if (bufSize < 2) {
return 0;
More information about the poppler
mailing list