[poppler] [PATCH 3/6] Tagged-PDF: Modify pdfinfo to show the document structure
Adrian Perez
aperez at igalia.com
Wed May 29 16:47:28 PDT 2013
From: Adrian Perez de Castro <aperez at igalia.com>
---
utils/pdfinfo.cc | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 96 insertions(+), 1 deletion(-)
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 14e4f6c..5127412 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -3,6 +3,7 @@
// pdfinfo.cc
//
// Copyright 1998-2003 Glyph & Cog, LLC
+// Copyright 2013 Igalia S.L.
//
//========================================================================
@@ -19,6 +20,7 @@
// Copyright (C) 2011 Vittal Aithal <vittal.aithal at cognidox.com>
// Copyright (C) 2012, 2013 Adrian Johnson <ajohnson at redneon.com>
// Copyright (C) 2012 Fabio D'Urso <fabiodurso at hotmail.it>
+// Copyright (C) 2013 Igalia S.L.
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
@@ -53,11 +55,15 @@
#include "UTF.h"
#include "Error.h"
#include "DateInfo.h"
+#include "StructTreeRoot.h"
+#include "StructElement.h"
static void printInfoString(Dict *infoDict, const char *key, const char *text,
UnicodeMap *uMap);
static void printInfoDate(Dict *infoDict, const char *key, const char *text);
static void printBox(const char *text, PDFRectangle *box);
+static void printStruct(const StructElement* element, unsigned indent = 0);
+static void printIndent(unsigned level);
static int firstPage = 1;
static int lastPage = 0;
@@ -70,6 +76,8 @@ static char userPassword[33] = "\001";
static GBool printVersion = gFalse;
static GBool printHelp = gFalse;
static GBool printEnc = gFalse;
+static GBool printStructure = gFalse;
+static GBool printStructureText = gFalse;
static const ArgDesc argDesc[] = {
{"-f", argInt, &firstPage, 0,
@@ -80,6 +88,10 @@ static const ArgDesc argDesc[] = {
"print the page bounding boxes"},
{"-meta", argFlag, &printMetadata, 0,
"print the document metadata (XML)"},
+ {"-struct", argFlag, &printStructure, 0,
+ "print the logical document structure (for tagged files)"},
+ {"-struct-text", argFlag, &printStructureText, 0,
+ "print text contents along with document structure (for tagged files)"},
{"-rawdates", argFlag, &rawDates, 0,
"print the undecoded date strings directly from the PDF file"},
{"-enc", argString, textEncName, sizeof(textEncName),
@@ -136,6 +148,9 @@ int main(int argc, char *argv[]) {
goto err0;
}
+ if (printStructureText)
+ printStructure = gTrue;
+
// read config file
globalParams = new GlobalParams();
@@ -226,7 +241,13 @@ int main(int argc, char *argv[]) {
// print tagging info
printf("Tagged: %s\n",
- doc->getStructTreeRoot()->isDict() ? "yes" : "no");
+ doc->getStructTreeRoot() ? "yes" : "no");
+ printf("Marked: %s\n",
+ (doc->getCatalog()->getMarkInfo() & Catalog::markInfoMarked) ? "yes" : "no");
+ printf("UserProperties: %s\n",
+ (doc->getCatalog()->getMarkInfo() & Catalog::markInfoUserProperties) ? "yes" : "no");
+ printf("Suspects: %s\n",
+ (doc->getCatalog()->getMarkInfo() & Catalog::markInfoSuspects) ? "yes" : "no");
// print form info
switch (doc->getCatalog()->getFormType())
@@ -371,6 +392,15 @@ int main(int argc, char *argv[]) {
delete metadata;
}
+ // print the structure
+ const StructTreeRoot* structTree;
+ if (printStructure && (structTree = doc->getCatalog()->getStructTreeRoot())) {
+ fputs("Structure:\n", stdout);
+ for (unsigned i = 0; i < structTree->getNumElements(); i++) {
+ printStruct(structTree->getElement(i), 1);
+ }
+ }
+
exitCode = 0;
// clean up
@@ -451,3 +481,68 @@ static void printBox(const char *text, PDFRectangle *box) {
printf("%s%8.2f %8.2f %8.2f %8.2f\n",
text, box->x1, box->y1, box->x2, box->y2);
}
+
+// Write two spaces to stdout for each of the `indent` nesting levels.
+static void printIndent(unsigned indent) {
+  for (unsigned level = 0; level < indent; level++) {
+    fputs("  ", stdout);
+  }
+}
+
+// Print one attribute on the current line (the caller supplies newlines).
+static void printAttribute(const Attribute* attribute, unsigned indent) {
+  printIndent(indent);
+  printf(" /%s ", attribute->getTypeName());
+  if (Attribute::UserProperty == attribute->getType()) {
+    printf("(%s) ", attribute->getName());
+  }
+  attribute->getValue()->print(stdout);
+  if (attribute->getFormattedValue()) {
+    printf(" \"%s\"", attribute->getFormattedValue());
+  }
+  if (attribute->isHidden()) {
+    fputs(" [hidden]", stdout);
+  }
+}
+
+// Recursively print a structure element. Content items show only their text
+// (with -struct-text); a ':' follows the header when attributes are listed.
+static void printStruct(const StructElement* element, unsigned indent) {
+  if (element->isContent()) {
+    if (printStructureText) {
+      GooString *text = element->getText(NULL, gFalse);
+      printIndent(indent);
+      if (text) {
+        printf("\"%s\"\n", text->getCString());
+      } else {
+        fputs("(No content?)\n", stdout);
+      }
+      delete text;
+    }
+    return;
+  }
+  printIndent(indent);
+  printf("%s", element->getTypeName());
+  if (element->getID()) {
+    printf(" <%s>", element->getID()->getCString());
+  }
+  if (element->getTitle()) {
+    printf(" \"%s\"", element->getTitle()->getCString());
+  }
+  if (element->getRevision() > 0) {
+    printf(" r%u", element->getRevision());
+  }
+  if (element->isInline()) {
+    fputs(" (inline)", stdout);
+  } else if (element->isBlock()) {
+    fputs(" (block)", stdout);
+  }
+  for (unsigned a = 0; a < element->getNumAttributes(); a++) {
+    fputs(a == 0 ? ":\n" : "\n", stdout);
+    printAttribute(element->getAttribute(a), indent + 1);
+  }
+  putchar('\n');
+  for (unsigned c = 0; c < element->getNumElements(); c++) {
+    printStruct(element->getElement(c), indent + 1);
+  }
+}
--
1.8.3
More information about the poppler
mailing list