[poppler] 2 commits - poppler/UTF.cc poppler/UTF.h utils/pdfinfo.1 utils/pdfinfo.cc

Wed Sep 15 22:02:37 UTC 2021

 poppler/UTF.cc   |  436 ++++++++-----------------------------------------------
 poppler/UTF.h    |   10 +
 utils/pdfinfo.1  |    3 
 utils/pdfinfo.cc |   70 ++++++++
 4 files changed, 150 insertions(+), 369 deletions(-)

New commits:
commit b10e715b6a12d63922e428512d2d14682fd1cefc
Author: Adrian Johnson <ajohnson at redneon.com>
Date:   Thu Sep 16 06:59:14 2021 +0930

    Ignore custom metadata that is not a string

diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 73bbbc76..b46d1aa3 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -679,22 +679,24 @@ static void printCustomInfo(PDFDoc *doc, const UnicodeMap *uMap)
                     printInfoDate(info.getDict(), "ModDate", "ModDate:        ", uMap);
                 }
             } else {
-                // print key
-                Unicode *u;
-                int len = utf8ToUCS4(key.c_str(), &u);
-                printUCS4String(u, len, uMap);
-                fputs(":", stdout);
-                while (len < 15) {
-                    fputs(" ", stdout);
-                    len++;
-                }
-                gfree(u);
-
-                // print value
                 Object obj = dict->lookup(key.c_str());
-                GooString val_str(obj.getString());
-                printTextString(&val_str, uMap);
-                fputc('\n', stdout);
+                if (obj.isString()) {
+                    // print key
+                    Unicode *u;
+                    int len = utf8ToUCS4(key.c_str(), &u);
+                    printUCS4String(u, len, uMap);
+                    fputs(":", stdout);
+                    while (len < 15) {
+                        fputs(" ", stdout);
+                        len++;
+                    }
+                    gfree(u);
+
+                    // print value
+                    GooString val_str(obj.getString());
+                    printTextString(&val_str, uMap);
+                    fputc('\n', stdout);
+                }
             }
         }
     }
commit 2bcf030e294cddf47abb63d53944b5e932848917
Author: Adrian Johnson <ajohnson at redneon.com>
Date:   Wed Sep 15 22:31:10 2021 +0930

    pdfinfo: Add -custom option to print custom metadata

diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 9097b312..b78fd2ff 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -176,377 +176,28 @@ static const uint32_t UTF8_REJECT = 12;
 static const uint32_t UCS4_MAX = 0x10FFFF;
 static const Unicode REPLACEMENT_CHAR = 0xFFFD;
 
+// clang-format off
 static const uint8_t decodeUtf8Table[] = {
-    // The first part of the table maps bytes to character classes
-    // to reduce the size of the transition table and create bitmasks.
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0, // 00..1f
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0, // 20..3f
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0, // 40..5f
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0, // 60..7f
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    1,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9,
-    9, // 80..9f
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7,
-    7, // a0..bf
-    8,
-    8,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2, // c0..df
-    10,
-    3,
-    3,
-    3,
-    3,
-    3,
-    3,
-    3,
-    3,
-    3,
-    3,
-    3,
-    3,
-    4,
-    3,
-    3,
-    11,
-    6,
-    6,
-    6,
-    5,
-    8,
-    8,
-    8,
-    8,
-    8,
-    8,
-    8,
-    8,
-    8,
-    8,
-    8, // e0..ff
-
-    // The second part is a transition table that maps a combination
-    // of a state of the automaton and a character class to a state.
-    0,
-    12,
-    24,
-    36,
-    60,
-    96,
-    84,
-    12,
-    12,
-    12,
-    48,
-    72,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    0,
-    12,
-    12,
-    12,
-    12,
-    12,
-    0,
-    12,
-    0,
-    12,
-    12,
-    12,
-    24,
-    12,
-    12,
-    12,
-    12,
-    12,
-    24,
-    12,
-    24,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    24,
-    12,
-    12,
-    12,
-    12,
-    12,
-    24,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    24,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    36,
-    12,
-    36,
-    12,
-    12,
-    12,
-    36,
-    12,
-    12,
-    12,
-    12,
-    12,
-    36,
-    12,
-    36,
-    12,
-    12,
-    12,
-    36,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
+  // The first part of the table maps bytes to character classes
+  // to reduce the size of the transition table and create bitmasks.
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff
+
+  // The second part is a transition table that maps a combination
+  // of a state of the automaton and a character class to a state.
+   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12,
 };
+// clang-format on
 
 // Decode utf8 state machine for fast UTF-8 decoding. Initialise state
 // to 0 and call decodeUtf8() for each byte of UTF-8. Return value
@@ -566,6 +217,53 @@ inline uint32_t decodeUtf8(uint32_t *state, uint32_t *codep, char byte)
     return *state;
 }
 
+int utf8CountUCS4(const char *utf8)
+{
+    uint32_t codepoint;
+    uint32_t state = 0;
+    int count = 0;
+
+    while (*utf8) {
+        decodeUtf8(&state, &codepoint, *utf8);
+        if (state == UTF8_ACCEPT) {
+            count++;
+        } else if (state == UTF8_REJECT) {
+            count++; // replace with REPLACEMENT_CHAR
+            state = 0;
+        }
+        utf8++;
+    }
+    if (state != UTF8_ACCEPT && state != UTF8_REJECT)
+        count++; // replace with REPLACEMENT_CHAR
+
+    return count;
+}
+
+int utf8ToUCS4(const char *utf8, Unicode **ucs4_out)
+{
+    int len = utf8CountUCS4(utf8);
+    Unicode *u = (Unicode *)gmallocn(len, sizeof(Unicode));
+    int n = 0;
+    uint32_t codepoint;
+    uint32_t state = 0;
+
+    while (*utf8 && n < len) {
+        decodeUtf8(&state, &codepoint, *utf8);
+        if (state == UTF8_ACCEPT) {
+            u[n++] = codepoint;
+        } else if (state == UTF8_REJECT) {
+            u[n++] = REPLACEMENT_CHAR; // invalid byte for this position
+            state = 0;
+        }
+        utf8++;
+    }
+    if (state != UTF8_ACCEPT && state != UTF8_REJECT)
+        u[n] = REPLACEMENT_CHAR; // input ended in the middle of a multi-byte sequence
+
+    *ucs4_out = u;
+    return len;
+}
+
 // Count number of UTF-16 code units required to convert a UTF-8 string
 // (excluding terminating NULL). Each invalid byte is counted as a
 // code point since the UTF-8 conversion functions will replace it with
diff --git a/poppler/UTF.h b/poppler/UTF.h
index 2e4cfe7f..d22fd409 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -42,6 +42,16 @@ bool UnicodeIsValid(Unicode ucs4);
 // is a unicode whitespace character
 bool UnicodeIsWhitespace(Unicode ucs4);
 
+// Count number of UCS-4 characters required to convert a UTF-8 string to
+// UCS-4 (excluding terminating NULL).
+int POPPLER_PRIVATE_EXPORT utf8CountUCS4(const char *utf8);
+
+// Convert a UTF-8 string to UCS-4. Invalid input is replaced with U+FFFD.
+//   utf8      - NUL-terminated UTF-8 bytes
+//   ucs4_out  - must not be NULL; receives the allocated UCS-4 string. Free with gfree.
+//   returns number of UCS-4 characters
+int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char *utf8, Unicode **ucs4_out);
+
 // Count number of UTF-16 code units required to convert a UTF-8 string
 // (excluding terminating NULL). Each invalid byte is counted as a
 // code point since the UTF-8 conversion functions will replace it with
diff --git a/utils/pdfinfo.1 b/utils/pdfinfo.1
index 41190842..f4225a9e 100644
--- a/utils/pdfinfo.1
+++ b/utils/pdfinfo.1
@@ -99,6 +99,9 @@ TrimBox, and ArtBox.
 Prints document-level metadata.  (This is the "Metadata" stream from
 the PDF file's Catalog object.)
 .TP
+.B \-custom
+Prints custom and standard metadata.
+.TP
 .B \-js
 Prints all JavaScript in the PDF.
 .TP
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 655c3f5e..73bbbc76 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -43,6 +43,7 @@
 #include <ctime>
 #include <cmath>
 #include <map>
+#include <set>
 #include "parseargs.h"
 #include "printencodings.h"
 #include "goo/GooString.h"
@@ -73,6 +74,7 @@ static int firstPage = 1;
 static int lastPage = 0;
 static bool printBoxes = false;
 static bool printMetadata = false;
+static bool printCustom = false;
 static bool printJS = false;
 static bool isoDates = false;
 static bool rawDates = false;
@@ -90,6 +92,7 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to
                                    { "-l", argInt, &lastPage, 0, "last page to convert" },
                                    { "-box", argFlag, &printBoxes, 0, "print the page bounding boxes" },
                                    { "-meta", argFlag, &printMetadata, 0, "print the document metadata (XML)" },
+                                   { "-custom", argFlag, &printCustom, 0, "print both custom and standard metadata" },
                                    { "-js", argFlag, &printJS, 0, "print all JavaScript in the PDF" },
                                    { "-struct", argFlag, &printStructure, 0, "print the logical document structure (for tagged files)" },
                                    { "-struct-text", argFlag, &printStructureText, 0, "print text contents along with document structure (for tagged files)" },
@@ -119,6 +122,15 @@ static void printTextString(const GooString *s, const UnicodeMap *uMap)
     gfree(u);
 }
 
+static void printUCS4String(const Unicode *u, int len, const UnicodeMap *uMap)
+{
+    char buf[8];
+    for (int i = 0; i < len; i++) {
+        int n = uMap->mapUnicode(u[i], buf, sizeof(buf));
+        fwrite(buf, 1, n, stdout);
+    }
+}
+
 static void printInfoString(Dict *infoDict, const char *key, const char *text, const UnicodeMap *uMap)
 {
     const GooString *s1;
@@ -634,6 +646,60 @@ static void printPdfSubtype(PDFDoc *doc, const UnicodeMap *uMap)
     }
 }
 
+static void printCustomInfo(PDFDoc *doc, const UnicodeMap *uMap)
+{
+    Object info = doc->getDocInfo();
+    if (info.isDict()) {
+        Dict *dict = info.getDict();
+
+        // Sort keys
+        std::set<std::string> keys;
+        for (int i = 0; i < dict->getLength(); i++) {
+            std::string key(dict->getKey(i));
+            if (key != "Trapped") {
+                keys.insert(key);
+            }
+        }
+
+        for (const std::string &key : keys) {
+            if (key == "CreationDate") {
+                if (isoDates) {
+                    printISODate(info.getDict(), "CreationDate", "CreationDate:   ", uMap);
+                } else if (rawDates) {
+                    printInfoString(info.getDict(), "CreationDate", "CreationDate:   ", uMap);
+                } else {
+                    printInfoDate(info.getDict(), "CreationDate", "CreationDate:   ", uMap);
+                }
+            } else if (key == "ModDate") {
+                if (isoDates) {
+                    printISODate(info.getDict(), "ModDate", "ModDate:        ", uMap);
+                } else if (rawDates) {
+                    printInfoString(info.getDict(), "ModDate", "ModDate:        ", uMap);
+                } else {
+                    printInfoDate(info.getDict(), "ModDate", "ModDate:        ", uMap);
+                }
+            } else {
+                // print key
+                Unicode *u;
+                int len = utf8ToUCS4(key.c_str(), &u);
+                printUCS4String(u, len, uMap);
+                fputs(":", stdout);
+                while (len < 15) {
+                    fputs(" ", stdout);
+                    len++;
+                }
+                gfree(u);
+
+                // print value
+                Object obj = dict->lookup(key.c_str());
+                GooString val_str(obj.getString());
+                printTextString(&val_str, uMap);
+                fputc('\n', stdout);
+            }
+        }
+    }
+}
+
 static void printInfo(PDFDoc *doc, const UnicodeMap *uMap, long long filesize, bool multiPage)
 {
     Page *page;
@@ -908,6 +974,8 @@ int main(int argc, char *argv[])
             fputc('\n', stdout);
             delete metadata;
         }
+    } else if (printCustom) {
+        printCustomInfo(doc.get(), uMap);
     } else if (printJS) {
         // print javascript
         JSInfo jsInfo(doc.get(), firstPage - 1);