[poppler] utils/pdfinfo.1 utils/pdfinfo.cc

Fri Oct 1 09:24:05 UTC 2021

utils/pdfinfo.1  |    6 ++++++
 utils/pdfinfo.cc |   24 ++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

New commits:
commit c498cfe5a6292f2c696178b69e6fb275f1a4a4da
Author: Adrian Johnson <ajohnson at redneon.com>
Date:   Fri Oct 1 09:24:03 2021 +0000

    pdfinfo: add -url option to print all URLs in a PDF

diff --git a/utils/pdfinfo.1 b/utils/pdfinfo.1
index abd34a8c..2a17bbd5 100644
--- a/utils/pdfinfo.1
+++ b/utils/pdfinfo.1
@@ -126,6 +126,12 @@ file.  Note that extracting text this way might be slow for big PDF files.
 (Implies
 .BR \-struct .)
 .TP
+.B \-url
+Print all URLs in the PDF. Only the URL types supported by Poppler are listed.
+Currently, this is limited to Annotations. Note: only URLs referenced by PDF objects,
+such as Link Annotations, are listed. pdfinfo does not attempt to extract strings
+matching http://... from the text content.
+.TP
 .B \-isodates
 Prints dates in ISO-8601 format (including the time zone).
 .TP
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index e34f21fc..d47b2564 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -87,6 +87,7 @@ static bool printEnc = false;
 static bool printStructure = false;
 static bool printStructureText = false;
 static bool printDests = false;
+static bool printUrls = false;
 
 static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to convert" },
                                    { "-l", argInt, &lastPage, 0, "last page to convert" },
@@ -99,6 +100,7 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to
                                    { "-isodates", argFlag, &isoDates, 0, "print the dates in ISO-8601 format" },
                                    { "-rawdates", argFlag, &rawDates, 0, "print the undecoded date strings directly from the PDF file" },
                                    { "-dests", argFlag, &printDests, 0, "print all named destinations in the PDF" },
+                                   { "-url", argFlag, &printUrls, 0, "print all URLs inside PDF objects (does not scan text content)" },
                                    { "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" },
                                    { "-listenc", argFlag, &printEnc, 0, "list available encodings" },
                                    { "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
@@ -412,6 +414,26 @@ static void printDestinations(PDFDoc *doc, const UnicodeMap *uMap)
     }
 }
 
+// Print a "Page  Type  URL" table with one row per URI link annotation on pages firstPage..lastPage.
+static void printUrlList(PDFDoc *doc)
+{
+    printf("Page  Type          URL\n");
+    for (int pg = firstPage; pg <= lastPage; pg++) {
+        Page *page = doc->getPage(pg);
+        if (page) {
+            Links *links = page->getLinks(); // NOTE(review): confirm who owns this; if the caller does, it leaks
+            for (int i = 0; i < links->getNumLinks(); i++) {
+                // dynamic_cast yields nullptr for a missing action or a non-URI one, so one test covers both.
+                LinkURI *linkUri = dynamic_cast<LinkURI *>(links->getLink(i)->getAction());
+                if (linkUri && linkUri->getKind() == actionURI) {
+                    const std::string &uri = linkUri->getURI();
+                    printf("%4d  Annotation    %s\n", pg, uri.c_str());
+                }
+            }
+        }
+    }
+}
+
 static void printPdfSubtype(PDFDoc *doc, const UnicodeMap *uMap)
 {
     const Object info = doc->getDocInfo();
@@ -1015,6 +1037,8 @@ int main(int argc, char *argv[])
         }
     } else if (printDests) {
         printDestinations(doc.get(), uMap);
+    } else if (printUrls) {
+        printUrlList(doc.get());
     } else {
         // print info
         long long filesize = 0;