[poppler] utils/pdfinfo.cc

Carlos Garcia Campos carlosgc at kemper.freedesktop.org
Tue Mar 1 12:58:43 UTC 2016


 utils/pdfinfo.cc |   96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

New commits:
commit a8853b1df0a15570dff6ecc333769257bbf874c3
Author: Adrian Perez de Castro <aperez at igalia.com>
Date:   Thu May 9 19:11:26 2013 +0300

    Tagged-PDF: Modify pdfinfo to show the document structure
    
    https://bugs.freedesktop.org/show_bug.cgi?id=64816

diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 5a9745f..18221c2 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -3,6 +3,7 @@
 // pdfinfo.cc
 //
 // Copyright 1998-2003 Glyph & Cog, LLC
+// Copyright 2013 Igalia S.L.
 //
 //========================================================================
 
@@ -56,11 +57,15 @@
 #include "Error.h"
 #include "DateInfo.h"
 #include "JSInfo.h"
+#include "StructTreeRoot.h"
+#include "StructElement.h"
 
 static void printInfoString(Dict *infoDict, const char *key, const char *text,
 			    UnicodeMap *uMap);
 static void printInfoDate(Dict *infoDict, const char *key, const char *text);
 static void printBox(const char *text, PDFRectangle *box);
+static void printStruct(const StructElement *element, unsigned indent = 0);
+static void printIndent(unsigned level);
 
 static int firstPage = 1;
 static int lastPage = 0;
@@ -74,6 +79,8 @@ static char userPassword[33] = "\001";
 static GBool printVersion = gFalse;
 static GBool printHelp = gFalse;
 static GBool printEnc = gFalse;
+static GBool printStructure = gFalse;
+static GBool printStructureText = gFalse;
 
 static const ArgDesc argDesc[] = {
   {"-f",      argInt,      &firstPage,        0,
@@ -86,6 +93,10 @@ static const ArgDesc argDesc[] = {
    "print the document metadata (XML)"},
   {"-js",     argFlag,     &printJS,          0,
    "print all JavaScript in the PDF"},
+  {"-struct", argFlag,     &printStructure,   0,
+   "print the logical document structure (for tagged files)"},
+  {"-struct-text", argFlag, &printStructureText, 0,
+   "print text contents along with document structure (for tagged files)"},
   {"-rawdates", argFlag,   &rawDates,         0,
    "print the undecoded date strings directly from the PDF file"},
   {"-enc",    argString,   textEncName,    sizeof(textEncName),
@@ -142,6 +153,9 @@ int main(int argc, char *argv[]) {
     goto err0;
   }
 
+  if (printStructureText)
+    printStructure = gTrue;
+
   // read config file
   globalParams = new GlobalParams();
 
@@ -401,6 +415,15 @@ int main(int argc, char *argv[]) {
     jsInfo.scanJS(lastPage - firstPage + 1, stdout, uMap);
   }
 
+  // print the structure
+  const StructTreeRoot *structTree;
+  if (printStructure && (structTree = doc->getCatalog()->getStructTreeRoot())) {
+    fputs("Structure:\n", stdout);
+    for (unsigned i = 0; i < structTree->getNumChildren(); i++) {
+      printStruct(structTree->getChild(i), 1);
+    }
+  }
+
   exitCode = 0;
 
   // clean up
@@ -481,3 +504,76 @@ static void printBox(const char *text, PDFRectangle *box) {
   printf("%s%8.2f %8.2f %8.2f %8.2f\n",
 	 text, box->x1, box->y1, box->x2, box->y2);
 }
+
+static void printIndent(unsigned indent) {  // Write `indent` levels of indentation to stdout.
+  while (indent--) {  // one iteration per level; `indent` is a by-value copy
+    putchar(' ');  // each level is rendered as two spaces
+    putchar(' ');
+  }
+}
+
+static void printAttribute(const Attribute *attribute, unsigned indent)  // Print one attribute, indented, to stdout; no trailing newline.
+{
+  printIndent(indent);
+  printf(" /%s ", attribute->getTypeName());
+  if (attribute->getType() == Attribute::UserProperty) {  // only user properties also get their name printed, in parentheses
+    GooString *name = attribute->getName();
+    printf("(%s) ", name->getCString());
+    delete name;  // NOTE(review): assumes getName() hands ownership to the caller -- confirm
+  }
+  attribute->getValue()->print(stdout);
+  if (attribute->getFormattedValue()) {  // formatted value is optional; shown in double quotes when present
+    printf(" \"%s\"", attribute->getFormattedValue());
+  }
+  if (attribute->isHidden()) {
+    printf(" [hidden]");
+  }
+}  // line breaks around each attribute are emitted by the caller
+
+static void printStruct(const StructElement *element, unsigned indent) {  // Recursively print `element` and its children to stdout, starting at `indent` levels.
+  if (element->isObjectRef()) {  // object references print as "Object <num> <gen>" and are not descended into
+    printIndent(indent);
+    printf("Object %i %i\n", element->getObjectRef().num, element->getObjectRef().gen);
+    return;
+  }
+
+  if (printStructureText && element->isContent()) {  // content items are printed only when text output was requested
+    GooString *text = element->getText(gFalse);
+    printIndent(indent);
+    if (text) {
+      printf("\"%s\"\n", text->getCString());
+    } else {
+      printf("(No content?)\n");
+    }
+    delete text;  // deleting a null pointer is a no-op, so no guard is needed
+  }
+
+  if (!element->isContent()) {  // everything else: one header line, then attributes, then children
+      printIndent(indent);
+      printf("%s", element->getTypeName());
+      if (element->getID()) {  // optional ID, shown in angle brackets
+          printf(" <%s>", element->getID()->getCString());
+      }
+      if (element->getTitle()) {  // optional title, shown in double quotes
+          printf(" \"%s\"", element->getTitle()->getCString());
+      }
+      if (element->getRevision() > 0) {  // revision 0 is not printed
+          printf(" r%u", element->getRevision());
+      }
+      if (element->isInline() || element->isBlock()) {
+          printf(" (%s)", element->isInline() ? "inline" : "block");
+      }
+      if (element->getNumAttributes()) {
+          putchar(':');  // a colon ends the header line when attributes follow
+          for (unsigned i = 0; i < element->getNumAttributes(); i++) {
+              putchar('\n');  // each attribute starts on its own line
+              printAttribute(element->getAttribute(i), indent + 1);
+          }
+      }
+
+      putchar('\n');  // terminates the header line, or the last attribute line
+      for (unsigned i = 0; i < element->getNumChildren(); i++) {
+          printStruct(element->getChild(i), indent + 1);  // children are nested one level deeper
+      }
+  }
+}


More information about the poppler mailing list