[poppler] utils/pdfinfo.1 utils/pdfinfo.cc
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Fri Oct 1 09:24:05 UTC 2021
utils/pdfinfo.1 | 6 ++++++
utils/pdfinfo.cc | 24 ++++++++++++++++++++++++
2 files changed, 30 insertions(+)
New commits:
commit c498cfe5a6292f2c696178b69e6fb275f1a4a4da
Author: Adrian Johnson <ajohnson at redneon.com>
Date: Fri Oct 1 09:24:03 2021 +0000
pdfinfo: add -url option to print all URLs in a PDF
diff --git a/utils/pdfinfo.1 b/utils/pdfinfo.1
index abd34a8c..2a17bbd5 100644
--- a/utils/pdfinfo.1
+++ b/utils/pdfinfo.1
@@ -126,6 +126,12 @@ file. Note that extracting text this way might be slow for big PDF files.
(Implies
.BR \-struct .)
.TP
+.B \-url
+Print all URLs in the PDF. Only the URL types supported by Poppler are listed.
+Currently, this is limited to Annotations. Note: only URLs referenced by PDF objects
+such as Link Annotations are listed; pdfinfo does not attempt to extract strings
+matching http://... from the text content.
+.TP
.B \-isodates
Prints dates in ISO-8601 format (including the time zone).
.TP
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index e34f21fc..d47b2564 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -87,6 +87,7 @@ static bool printEnc = false;
static bool printStructure = false;
static bool printStructureText = false;
static bool printDests = false;
+static bool printUrls = false;
static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to convert" },
{ "-l", argInt, &lastPage, 0, "last page to convert" },
@@ -99,6 +100,7 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to
{ "-isodates", argFlag, &isoDates, 0, "print the dates in ISO-8601 format" },
{ "-rawdates", argFlag, &rawDates, 0, "print the undecoded date strings directly from the PDF file" },
{ "-dests", argFlag, &printDests, 0, "print all named destinations in the PDF" },
+ { "-url", argFlag, &printUrls, 0, "print all URLs inside PDF objects (does not scan text content)" },
{ "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" },
{ "-listenc", argFlag, &printEnc, 0, "list available encodings" },
{ "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
@@ -412,6 +414,26 @@ static void printDestinations(PDFDoc *doc, const UnicodeMap *uMap)
}
}
+// Print the URI of every URI-action link annotation on pages firstPage..lastPage.
+static void printUrlList(PDFDoc *doc)
+{
+    printf("Page Type URL\n");
+    for (int pg = firstPage; pg <= lastPage; pg++) {
+        Page *page = doc->getPage(pg);
+        if (!page) {
+            continue;
+        }
+        // Page::getLinks() returns a caller-owned list; release it after each page.
+        const std::unique_ptr<Links> links(page->getLinks());
+        for (int i = 0; i < links->getNumLinks(); i++) {
+            LinkAction *action = links->getLink(i)->getAction();
+            if (action && action->getKind() == actionURI) { // a link may carry no action
+                printf("%4d Annotation %s\n", pg, static_cast<LinkURI *>(action)->getURI().c_str());
+            }
+        }
+    }
+}
+
static void printPdfSubtype(PDFDoc *doc, const UnicodeMap *uMap)
{
const Object info = doc->getDocInfo();
@@ -1015,6 +1037,8 @@ int main(int argc, char *argv[])
}
} else if (printDests) {
printDestinations(doc.get(), uMap);
+ } else if (printUrls) {
+ printUrlList(doc.get());
} else {
// print info
long long filesize = 0;
More information about the poppler
mailing list