[poppler] 2 commits - poppler/UTF.cc poppler/UTF.h utils/pdfinfo.1 utils/pdfinfo.cc
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Sep 15 22:02:37 UTC 2021
poppler/UTF.cc | 436 ++++++++-----------------------------------------------
poppler/UTF.h | 10 +
utils/pdfinfo.1 | 3
utils/pdfinfo.cc | 70 ++++++++
4 files changed, 150 insertions(+), 369 deletions(-)
New commits:
commit b10e715b6a12d63922e428512d2d14682fd1cefc
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Thu Sep 16 06:59:14 2021 +0930
Ignore custom metadata that is not a string
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 73bbbc76..b46d1aa3 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -679,22 +679,24 @@ static void printCustomInfo(PDFDoc *doc, const UnicodeMap *uMap)
printInfoDate(info.getDict(), "ModDate", "ModDate: ", uMap);
}
} else {
- // print key
- Unicode *u;
- int len = utf8ToUCS4(key.c_str(), &u);
- printUCS4String(u, len, uMap);
- fputs(":", stdout);
- while (len < 15) {
- fputs(" ", stdout);
- len++;
- }
- gfree(u);
-
- // print value
Object obj = dict->lookup(key.c_str());
- GooString val_str(obj.getString());
- printTextString(&val_str, uMap);
- fputc('\n', stdout);
+ if (obj.isString()) {
+ // print key
+ Unicode *u;
+ int len = utf8ToUCS4(key.c_str(), &u);
+ printUCS4String(u, len, uMap);
+ fputs(":", stdout);
+ while (len < 15) {
+ fputs(" ", stdout);
+ len++;
+ }
+ gfree(u);
+
+ // print value
+ GooString val_str(obj.getString());
+ printTextString(&val_str, uMap);
+ fputc('\n', stdout);
+ }
}
}
}
commit 2bcf030e294cddf47abb63d53944b5e932848917
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Wed Sep 15 22:31:10 2021 +0930
pdfinfo: Add -custom option to print custom metadata
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 9097b312..b78fd2ff 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -176,377 +176,28 @@ static const uint32_t UTF8_REJECT = 12;
static const uint32_t UCS4_MAX = 0x10FFFF;
static const Unicode REPLACEMENT_CHAR = 0xFFFD;
+// clang-format off
static const uint8_t decodeUtf8Table[] = {
- // The first part of the table maps bytes to character classes
- // to reduce the size of the transition table and create bitmasks.
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0, // 00..1f
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0, // 20..3f
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0, // 40..5f
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0, // 60..7f
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9,
- 9, // 80..9f
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7,
- 7, // a0..bf
- 8,
- 8,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2, // c0..df
- 10,
- 3,
- 3,
- 3,
- 3,
- 3,
- 3,
- 3,
- 3,
- 3,
- 3,
- 3,
- 3,
- 4,
- 3,
- 3,
- 11,
- 6,
- 6,
- 6,
- 5,
- 8,
- 8,
- 8,
- 8,
- 8,
- 8,
- 8,
- 8,
- 8,
- 8,
- 8, // e0..ff
-
- // The second part is a transition table that maps a combination
- // of a state of the automaton and a character class to a state.
- 0,
- 12,
- 24,
- 36,
- 60,
- 96,
- 84,
- 12,
- 12,
- 12,
- 48,
- 72,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 0,
- 12,
- 12,
- 12,
- 12,
- 12,
- 0,
- 12,
- 0,
- 12,
- 12,
- 12,
- 24,
- 12,
- 12,
- 12,
- 12,
- 12,
- 24,
- 12,
- 24,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 24,
- 12,
- 12,
- 12,
- 12,
- 12,
- 24,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 24,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 36,
- 12,
- 36,
- 12,
- 12,
- 12,
- 36,
- 12,
- 12,
- 12,
- 12,
- 12,
- 36,
- 12,
- 36,
- 12,
- 12,
- 12,
- 36,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
- 12,
+ // The first part of the table maps bytes to character classes
+ // to reduce the size of the transition table and create bitmasks.
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff
+
+ // The second part is a transition table that maps a combination
+ // of a state of the automaton and a character class to a state.
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
};
+// clang-format on
// Decode utf8 state machine for fast UTF-8 decoding. Initialise state
// to 0 and call decodeUtf8() for each byte of UTF-8. Return value
@@ -566,6 +217,53 @@ inline uint32_t decodeUtf8(uint32_t *state, uint32_t *codep, char byte)
return *state;
}
+int utf8CountUCS4(const char *utf8)
+{
+ uint32_t codepoint;
+ uint32_t state = 0;
+ int count = 0;
+
+ while (*utf8) {
+ decodeUtf8(&state, &codepoint, *utf8);
+ if (state == UTF8_ACCEPT) {
+ count++;
+ } else if (state == UTF8_REJECT) {
+ count++; // replace with REPLACEMENT_CHAR
+ state = 0;
+ }
+ utf8++;
+ }
+ if (state != UTF8_ACCEPT && state != UTF8_REJECT)
+ count++; // replace with REPLACEMENT_CHAR
+
+ return count;
+}
+
+int utf8ToUCS4(const char *utf8, Unicode **ucs4_out)
+{
+ int len = utf8CountUCS4(utf8);
+ Unicode *u = (Unicode *)gmallocn(len, sizeof(Unicode));
+ int n = 0;
+ uint32_t codepoint;
+ uint32_t state = 0;
+
+ while (*utf8 && n < len) {
+ decodeUtf8(&state, &codepoint, *utf8);
+ if (state == UTF8_ACCEPT) {
+ u[n++] = codepoint;
+ } else if (state == UTF8_REJECT) {
+ u[n++] = REPLACEMENT_CHAR; // invalid byte for this position
+ state = 0;
+ }
+ utf8++;
+ }
+ if (state != UTF8_ACCEPT && state != UTF8_REJECT)
+ u[n] = REPLACEMENT_CHAR; // invalid byte for this position
+
+ *ucs4_out = u;
+ return len;
+}
+
// Count number of UTF-16 code units required to convert a UTF-8 string
// (excluding terminating NULL). Each invalid byte is counted as a
// code point since the UTF-8 conversion functions will replace it with
diff --git a/poppler/UTF.h b/poppler/UTF.h
index 2e4cfe7f..d22fd409 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -42,6 +42,16 @@ bool UnicodeIsValid(Unicode ucs4);
// is a unicode whitespace character
bool UnicodeIsWhitespace(Unicode ucs4);
+// Count number of UCS-4 characters required to convert a UTF-8 string to
+// UCS-4 (excluding terminating NULL).
+int POPPLER_PRIVATE_EXPORT utf8CountUCS4(const char *utf8);
+
+// Convert a UTF-8 string to a UCS-4 string.
+// utf8 - NUL-terminated UTF-8 bytes
+// ucs4_out - must not be NULL; receives a newly allocated UCS-4 string. Free with gfree.
+// returns number of UCS-4 characters
+int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char *utf8, Unicode **ucs4_out);
+
// Count number of UTF-16 code units required to convert a UTF-8 string
// (excluding terminating NULL). Each invalid byte is counted as a
// code point since the UTF-8 conversion functions will replace it with
diff --git a/utils/pdfinfo.1 b/utils/pdfinfo.1
index 41190842..f4225a9e 100644
--- a/utils/pdfinfo.1
+++ b/utils/pdfinfo.1
@@ -99,6 +99,9 @@ TrimBox, and ArtBox.
Prints document-level metadata. (This is the "Metadata" stream from
the PDF file's Catalog object.)
.TP
+.B \-custom
+Prints custom and standard metadata.
+.TP
.B \-js
Prints all JavaScript in the PDF.
.TP
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 655c3f5e..73bbbc76 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -43,6 +43,7 @@
#include <ctime>
#include <cmath>
#include <map>
+#include <set>
#include "parseargs.h"
#include "printencodings.h"
#include "goo/GooString.h"
@@ -73,6 +74,7 @@ static int firstPage = 1;
static int lastPage = 0;
static bool printBoxes = false;
static bool printMetadata = false;
+static bool printCustom = false;
static bool printJS = false;
static bool isoDates = false;
static bool rawDates = false;
@@ -90,6 +92,7 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to
{ "-l", argInt, &lastPage, 0, "last page to convert" },
{ "-box", argFlag, &printBoxes, 0, "print the page bounding boxes" },
{ "-meta", argFlag, &printMetadata, 0, "print the document metadata (XML)" },
+ { "-custom", argFlag, &printCustom, 0, "print both custom and standard metadata" },
{ "-js", argFlag, &printJS, 0, "print all JavaScript in the PDF" },
{ "-struct", argFlag, &printStructure, 0, "print the logical document structure (for tagged files)" },
{ "-struct-text", argFlag, &printStructureText, 0, "print text contents along with document structure (for tagged files)" },
@@ -119,6 +122,15 @@ static void printTextString(const GooString *s, const UnicodeMap *uMap)
gfree(u);
}
+static void printUCS4String(const Unicode *u, int len, const UnicodeMap *uMap)
+{
+ char buf[8];
+ for (int i = 0; i < len; i++) {
+ int n = uMap->mapUnicode(u[i], buf, sizeof(buf));
+ fwrite(buf, 1, n, stdout);
+ }
+}
+
static void printInfoString(Dict *infoDict, const char *key, const char *text, const UnicodeMap *uMap)
{
const GooString *s1;
@@ -634,6 +646,60 @@ static void printPdfSubtype(PDFDoc *doc, const UnicodeMap *uMap)
}
}
+static void printCustomInfo(PDFDoc *doc, const UnicodeMap *uMap)
+{
+ Object info = doc->getDocInfo();
+ if (info.isDict()) {
+ Dict *dict = info.getDict();
+
+ // Sort keys
+ std::set<std::string> keys;
+ for (int i = 0; i < dict->getLength(); i++) {
+ std::string key(dict->getKey(i));
+ if (key != "Trapped") {
+ keys.insert(key);
+ }
+ }
+
+ for (const std::string &key : keys) {
+ if (key == "CreationDate") {
+ if (isoDates) {
+ printISODate(info.getDict(), "CreationDate", "CreationDate: ", uMap);
+ } else if (rawDates) {
+ printInfoString(info.getDict(), "CreationDate", "CreationDate: ", uMap);
+ } else {
+ printInfoDate(info.getDict(), "CreationDate", "CreationDate: ", uMap);
+ }
+ } else if (key == "ModDate") {
+ if (isoDates) {
+ printISODate(info.getDict(), "ModDate", "ModDate: ", uMap);
+ } else if (rawDates) {
+ printInfoString(info.getDict(), "ModDate", "ModDate: ", uMap);
+ } else {
+ printInfoDate(info.getDict(), "ModDate", "ModDate: ", uMap);
+ }
+ } else {
+ // print key
+ Unicode *u;
+ int len = utf8ToUCS4(key.c_str(), &u);
+ printUCS4String(u, len, uMap);
+ fputs(":", stdout);
+ while (len < 15) {
+ fputs(" ", stdout);
+ len++;
+ }
+ gfree(u);
+
+ // print value
+ Object obj = dict->lookup(key.c_str());
+ GooString val_str(obj.getString());
+ printTextString(&val_str, uMap);
+ fputc('\n', stdout);
+ }
+ }
+ }
+}
+
static void printInfo(PDFDoc *doc, const UnicodeMap *uMap, long long filesize, bool multiPage)
{
Page *page;
@@ -908,6 +974,8 @@ int main(int argc, char *argv[])
fputc('\n', stdout);
delete metadata;
}
+ } else if (printCustom) {
+ printCustomInfo(doc.get(), uMap);
} else if (printJS) {
// print javascript
JSInfo jsInfo(doc.get(), firstPage - 1);
More information about the poppler mailing list