[poppler] [PATCH 3/6] Tagged-PDF: Modify pdfinfo to show the document structure

Adrian Perez aperez at igalia.com
Wed May 29 16:47:28 PDT 2013


From: Adrian Perez de Castro <aperez at igalia.com>

---
 utils/pdfinfo.cc | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 96 insertions(+), 1 deletion(-)

diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 14e4f6c..5127412 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -3,6 +3,7 @@
 // pdfinfo.cc
 //
 // Copyright 1998-2003 Glyph & Cog, LLC
+// Copyright 2013 Igalia S.L.
 //
 //========================================================================
 
@@ -19,6 +20,7 @@
 // Copyright (C) 2011 Vittal Aithal <vittal.aithal at cognidox.com>
 // Copyright (C) 2012, 2013 Adrian Johnson <ajohnson at redneon.com>
 // Copyright (C) 2012 Fabio D'Urso <fabiodurso at hotmail.it>
+// Copyright (C) 2013 Igalia S.L.
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -53,11 +55,15 @@
 #include "UTF.h"
 #include "Error.h"
 #include "DateInfo.h"
+#include "StructTreeRoot.h"
+#include "StructElement.h"
 
 static void printInfoString(Dict *infoDict, const char *key, const char *text,
 			    UnicodeMap *uMap);
 static void printInfoDate(Dict *infoDict, const char *key, const char *text);
 static void printBox(const char *text, PDFRectangle *box);
+static void printStruct(const StructElement* element, unsigned indent = 0);
+static void printIndent(unsigned level);
 
 static int firstPage = 1;
 static int lastPage = 0;
@@ -70,6 +76,8 @@ static char userPassword[33] = "\001";
 static GBool printVersion = gFalse;
 static GBool printHelp = gFalse;
 static GBool printEnc = gFalse;
+static GBool printStructure = gFalse;
+static GBool printStructureText = gFalse;
 
 static const ArgDesc argDesc[] = {
   {"-f",      argInt,      &firstPage,        0,
@@ -80,6 +88,10 @@ static const ArgDesc argDesc[] = {
    "print the page bounding boxes"},
   {"-meta",   argFlag,     &printMetadata,    0,
    "print the document metadata (XML)"},
+  {"-struct", argFlag,     &printStructure,   0,
+   "print the logical document structure (for tagged files)"},
+  {"-struct-text", argFlag, &printStructureText, 0,
+   "print text contents along with document structure (for tagged files)"},
   {"-rawdates", argFlag,   &rawDates,         0,
    "print the undecoded date strings directly from the PDF file"},
   {"-enc",    argString,   textEncName,    sizeof(textEncName),
@@ -136,6 +148,9 @@ int main(int argc, char *argv[]) {
     goto err0;
   }
 
+  if (printStructureText)
+    printStructure = gTrue;
+
   // read config file
   globalParams = new GlobalParams();
 
@@ -226,7 +241,13 @@ int main(int argc, char *argv[]) {
 
   // print tagging info
   printf("Tagged:         %s\n",
-	 doc->getStructTreeRoot()->isDict() ? "yes" : "no");
+	 doc->getStructTreeRoot() ? "yes" : "no");
+  printf("Marked:         %s\n",
+         (doc->getCatalog()->getMarkInfo() & Catalog::markInfoMarked) ? "yes" : "no");
+  printf("UserProperties: %s\n",
+         (doc->getCatalog()->getMarkInfo() & Catalog::markInfoUserProperties) ? "yes" : "no");
+  printf("Suspects:       %s\n",
+         (doc->getCatalog()->getMarkInfo() & Catalog::markInfoSuspects) ? "yes" : "no");
 
   // print form info
   switch (doc->getCatalog()->getFormType())
@@ -371,6 +392,15 @@ int main(int argc, char *argv[]) {
     delete metadata;
   }
 
+  // print the structure
+  const StructTreeRoot* structTree;
+  if (printStructure && (structTree = doc->getCatalog()->getStructTreeRoot())) {
+    fputs("Structure:\n", stdout);
+    for (unsigned i = 0; i < structTree->getNumElements(); i++) {
+      printStruct(structTree->getElement(i), 1);
+    }
+  }
+
   exitCode = 0;
 
   // clean up
@@ -451,3 +481,68 @@ static void printBox(const char *text, PDFRectangle *box) {
   printf("%s%8.2f %8.2f %8.2f %8.2f\n",
 	 text, box->x1, box->y1, box->x2, box->y2);
 }
+
+// Writes two spaces to stdout for each of the `indent` nesting levels.
+static void printIndent(unsigned indent) {
+  for (unsigned remaining = indent; remaining > 0; remaining--) {
+    fputs("  ", stdout);
+  }
+}
+
+// Prints one attribute, indented and without a trailing newline: " /<type> ",
+// "(<name>) " for user properties, the value, then the optional extras.
+static void printAttribute(const Attribute* attribute, unsigned indent) {
+  const bool userProperty = attribute->getType() == Attribute::UserProperty;
+  printIndent(indent);
+  printf(" /%s ", attribute->getTypeName());
+  if (userProperty) {
+    printf("(%s) ", attribute->getName());
+  }
+  attribute->getValue()->print(stdout);
+  if (attribute->getFormattedValue()) {
+    printf(" \"%s\"", attribute->getFormattedValue());
+  }
+  fputs(attribute->isHidden() ? " [hidden]" : "", stdout);
+}
+
+// Recursively prints `element` and its children, two spaces per nesting level.
+// Content items print only their quoted text, and only with -struct-text;
+// other elements print a heading line, their attributes, then their children.
+static void printStruct(const StructElement* element, unsigned indent) {
+  if (element->isContent()) {
+    if (printStructureText) {
+      GooString *contents = element->getText(NULL, gFalse);
+      printIndent(indent);
+      if (contents) {
+        printf("\"%s\"\n", contents->getCString());
+      } else {
+        printf("(No content?)\n");
+      }
+      delete contents;
+    }
+    return;
+  }
+  // Heading: type name, then optional <ID>, "title", revision and layout kind.
+  printIndent(indent);
+  printf("%s", element->getTypeName());
+  if (element->getID()) {
+    printf(" <%s>", element->getID()->getCString());
+  }
+  if (element->getTitle()) {
+    printf(" \"%s\"", element->getTitle()->getCString());
+  }
+  if (element->getRevision() > 0) {
+    printf(" r%u", element->getRevision());
+  }
+  fputs(element->isInline() ? " (inline)" :
+        element->isBlock() ? " (block)" : "", stdout);
+  // Attributes follow a ':' on the heading, one per line at the next level.
+  for (unsigned a = 0; a < element->getNumAttributes(); a++) {
+    fputs(a == 0 ? ":\n" : "\n", stdout);
+    printAttribute(element->getAttribute(a), indent + 1);
+  }
+  putchar('\n');
+  for (unsigned child = 0; child < element->getNumElements(); child++) {
+    printStruct(element->getElement(child), indent + 1);
+  }
+}
-- 
1.8.3



More information about the poppler mailing list