[poppler] poppler/Catalog.cc poppler/Catalog.h poppler/Makefile.am poppler/PDFDoc.h poppler/StructElement.cc poppler/StructElement.h poppler/StructTreeRoot.cc poppler/StructTreeRoot.h

Carlos Garcia Campos carlosgc at kemper.freedesktop.org
Wed Oct 2 03:48:27 PDT 2013


 poppler/Catalog.cc        |   36 +++--
 poppler/Catalog.h         |    5 
 poppler/Makefile.am       |    4 
 poppler/PDFDoc.h          |    3 
 poppler/StructElement.cc  |  322 ++++++++++++++++++++++++++++++++++++++++++++++
 poppler/StructElement.h   |  167 +++++++++++++++++++++++
 poppler/StructTreeRoot.cc |  174 ++++++++++++++++++++++++
 poppler/StructTreeRoot.h  |   83 +++++++++++
 8 files changed, 776 insertions(+), 18 deletions(-)

New commits:
commit e04cabd878a0fd84faa5178f423fd828d010b664
Author: Adrian Perez de Castro <aperez at igalia.com>
Date:   Mon Jun 17 17:00:27 2013 +0300

    Tagged-PDF: Implement parsing of StructTreeRoot
    
    Implement parsing of the StructTreeRoot entry of the Catalog. Also, the
    Catalog::getStructTreeRoot() and PDFDoc::getStructTreeRoot() methods are
    modified to return an instance of StructTreeRoot instead of an Object.
    
    All elements from the StructTreeRoot are parsed except for:
    
    - IDTree: it is a lookup tree to locate items by their ID, which would
      be barely useful because the whole structure tree is to be kept in
      memory, which should be fast enough to traverse.
    - ParentTreeNextKey: This is needed only when the ParentTree object is
      to be modified. For the moment the implementation deals only with
      reading, so this has been deliberately left out.
    
    StructElem tree nodes from the document structure tree are parsed as a
    StructElement instance. Attributes and extraction of content out from
    elements are not yet handled.
    
    https://bugs.freedesktop.org/show_bug.cgi?id=64815

diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc
index 2e2511e..a06ae5f 100644
--- a/poppler/Catalog.cc
+++ b/poppler/Catalog.cc
@@ -58,6 +58,7 @@
 #include "OptionalContent.h"
 #include "ViewerPreferences.h"
 #include "FileSpec.h"
+#include "StructTreeRoot.h"
 
 #if MULTITHREADED
 #  define catalogLocker()   MutexLocker locker(&mutex)
@@ -93,6 +94,7 @@ Catalog::Catalog(PDFDoc *docA) {
   embeddedFileNameTree = NULL;
   jsNameTree = NULL;
   viewerPrefs = NULL;
+  structTreeRoot = NULL;
 
   pagesList = NULL;
   pagesRefList = NULL;
@@ -181,8 +183,8 @@ Catalog::~Catalog() {
   delete form;
   delete optContent;
   delete viewerPrefs;
+  delete structTreeRoot;
   metadata.free();
-  structTreeRoot.free();
   outline.free();
   acroForm.free();
   viewerPreferences.free();
@@ -844,24 +846,28 @@ PageLabelInfo *Catalog::getPageLabelInfo()
   return pageLabelInfo;
 }
 
-Object *Catalog::getStructTreeRoot()
+StructTreeRoot *Catalog::getStructTreeRoot()
 {
   catalogLocker();
-  if (structTreeRoot.isNone())
-  {
-     Object catDict;
+  if (!structTreeRoot) {
+    Object catalog;
+    Object root;
+
+    xref->getCatalog(&catalog);
+    if (!catalog.isDict()) {
+      error(errSyntaxError, -1, "Catalog object is wrong type ({0:s})", catalog.getTypeName());
+      catalog.free();
+      return NULL;
+    }
 
-     xref->getCatalog(&catDict);
-     if (catDict.isDict()) {
-       catDict.dictLookup("StructTreeRoot", &structTreeRoot);
-     } else {
-       error(errSyntaxError, -1, "Catalog object is wrong type ({0:s})", catDict.getTypeName());
-       structTreeRoot.initNull();
-     }
-     catDict.free();
-  }
+    if (catalog.dictLookup("StructTreeRoot", &root)->isDict("StructTreeRoot")) {
+      structTreeRoot = new StructTreeRoot(doc, root.getDict());
+    }
 
-  return &structTreeRoot;
+    root.free();
+    catalog.free();
+  }
+  return structTreeRoot;
 }
 
 Guint Catalog::getMarkInfo()
diff --git a/poppler/Catalog.h b/poppler/Catalog.h
index 1a445f5..40c783c 100644
--- a/poppler/Catalog.h
+++ b/poppler/Catalog.h
@@ -56,6 +56,7 @@ class Form;
 class OCGs;
 class ViewerPreferences;
 class FileSpec;
+class StructTreeRoot;
 
 //------------------------------------------------------------------------
 // NameTree
@@ -126,7 +127,7 @@ public:
   GooString *readMetadata();
 
   // Return the structure tree root object.
-  Object *getStructTreeRoot();
+  StructTreeRoot *getStructTreeRoot();
 
   // Return values from the MarkInfo dictionary as flags in a bitfield.
   enum MarkInfoFlags {
@@ -241,7 +242,7 @@ private:
   NameTree *jsNameTree;		// Java Script name-tree
   GooString *baseURI;		// base URI for URI-type links
   Object metadata;		// metadata stream
-  Object structTreeRoot;	// structure tree root dictionary
+  StructTreeRoot *structTreeRoot;	// structure tree root
   Guint markInfo;               // Flags from MarkInfo dictionary
   Object outline;		// outline dictionary
   Object acroForm;		// AcroForm dictionary
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index aa7c924..9f90c9d 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -216,6 +216,8 @@ poppler_include_HEADERS =	\
 	StdinPDFDocBuilder.h	\
 	Stream-CCITT.h		\
 	Stream.h		\
+	StructElement.h		\
+	StructTreeRoot.h	\
 	UnicodeMap.h		\
 	UnicodeMapTables.h	\
 	UnicodeTypeTable.h	\
@@ -294,6 +296,8 @@ libpoppler_la_SOURCES =		\
 	StdinCachedFile.cc	\
 	StdinPDFDocBuilder.cc	\
 	Stream.cc 		\
+	StructTreeRoot.cc	\
+	StructElement.cc	\
 	strtok_r.cpp		\
 	UnicodeMap.cc		\
 	UnicodeTypeTable.cc	\
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index da9bf5b..48189bc 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -60,6 +60,7 @@ class Outline;
 class Linearization;
 class SecurityHandler;
 class Hints;
+class StructTreeRoot;
 
 enum PDFWriteMode {
   writeStandard,
@@ -139,7 +140,7 @@ public:
   GooString *readMetadata() { return catalog->readMetadata(); }
 
   // Return the structure tree root object.
-  Object *getStructTreeRoot() { return catalog->getStructTreeRoot(); }
+  StructTreeRoot *getStructTreeRoot() { return catalog->getStructTreeRoot(); }
 
   // Get page.
   Page *getPage(int page);
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
new file mode 100644
index 0000000..e403457
--- /dev/null
+++ b/poppler/StructElement.cc
@@ -0,0 +1,322 @@
+//========================================================================
+//
+// StructElement.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "StructElement.h"
+#include "StructTreeRoot.h"
+#include "PDFDoc.h"
+#include "Dict.h"
+
+#include <assert.h>
+
+class GfxState;
+
+
+// Map an element type to the name reported by
+// StructElement::getTypeName(). Only the two internal content types
+// have a name here; every other type yields "Unknown".
+static const char *typeToName(StructElement::Type type)
+{
+  switch (type) {
+    case StructElement::MCID:
+      return "MarkedContent";
+    case StructElement::OBJR:
+      return "ObjectReference";
+    default:
+      return "Unknown";
+  }
+}
+
+
+//------------------------------------------------------------------------
+// StructElement
+//------------------------------------------------------------------------
+
+// Start with every optional text attribute absent (NULL) and the
+// revision number at zero; parentRef and elements are default-constructed.
+StructElement::StructData::StructData():
+  altText(NULL),
+  actualText(NULL),
+  id(NULL),
+  title(NULL),
+  expandedAbbr(NULL),
+  language(NULL),
+  revision(0)
+{
+}
+
+// Release everything owned by the structure data: the optional text
+// attributes, the parent reference and the child elements.
+StructElement::StructData::~StructData()
+{
+  delete altText;
+  delete actualText;
+  delete id;
+  delete title;
+  // expandedAbbr is taken over from the dictionary in
+  // StructElement::parse() (obj.takeString()), so it is owned here too.
+  delete expandedAbbr;
+  delete language;
+  parentRef.free();
+  for (ElemPtrArray::iterator i = elements.begin(); i != elements.end(); ++i) delete *i;
+}
+
+
+// Build a structure element from a StructElem dictionary. The element
+// starts as type Unknown with a fresh StructData; parse() reads the
+// dictionary entries and parseChildren() then walks the /K entry.
+// `seen` is handed on to parseChildren(), which passes it to parseChild()
+// where it is used to detect loops in the tree.
+StructElement::StructElement(Dict *element,
+                             StructTreeRoot *treeRootA,
+                             StructElement *parentA,
+                             std::set<int> &seen):
+  type(Unknown),
+  treeRoot(treeRootA),
+  parent(parentA),
+  s(new StructData())
+{
+  // Both the tree root and the dictionary are mandatory; the parent may
+  // be NULL (StructTreeRoot::parse() passes NULL for top-level elements).
+  assert(treeRoot);
+  assert(element);
+
+  parse(element);
+  parseChildren(element, seen);
+}
+
+// Build a content element of type MCID that stores the given
+// marked-content identifier. Tree root and parent are both required.
+StructElement::StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA):
+  type(MCID),
+  treeRoot(treeRootA),
+  parent(parentA),
+  c(new ContentData(mcid))
+{
+  assert(treeRoot);
+  assert(parent);
+}
+
+// Build a content element of type OBJR that stores the given object
+// reference. Tree root and parent are both required.
+StructElement::StructElement(const Ref& ref, StructTreeRoot *treeRootA, StructElement *parentA):
+  type(OBJR),
+  treeRoot(treeRootA),
+  parent(parentA),
+  c(new ContentData(ref))
+{
+  assert(treeRoot);
+  assert(parent);
+}
+
+// Release the payload that matches the element type, then the page
+// reference.
+StructElement::~StructElement()
+{
+  // Decide by type rather than isContent(): isContent() additionally
+  // requires an OBJR element to carry a valid reference, so an OBJR with
+  // an invalid one would have its ContentData deleted as StructData.
+  if (type == MCID || type == OBJR)
+    delete c;
+  else
+    delete s;
+  pageRef.free();
+}
+
+// True when this element, or any element up its parent chain, holds a
+// page reference.
+GBool StructElement::hasPageRef() const
+{
+  if (pageRef.isRef())
+    return gTrue;
+  return parent && parent->hasPageRef();
+}
+
+// Find the page this element belongs to. Stores the element's own page
+// reference in `ref` when it has one, otherwise asks the parent chain.
+// Returns gFalse, leaving `ref` untouched, when no element up the chain
+// holds a page reference. The return type is spelled GBool to match the
+// declaration in StructElement.h.
+GBool StructElement::getPageRef(Ref& ref) const
+{
+  if (pageRef.isRef()) {
+    ref = pageRef.getRef();
+    return gTrue;
+  }
+
+  if (parent)
+    return parent->getPageRef(ref);
+
+  return gFalse;
+}
+
+// Name of this element's type as produced by typeToName().
+const char* StructElement::getTypeName() const
+{
+  return typeToName(type);
+}
+
+// Placeholder for resolving a structure type name through the RoleMap.
+// The current body ignores all of its arguments (including the
+// `resolved` out-parameter) and always reports StructElement::Unknown.
+static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
+{
+  // TODO Replace this dummy implementation
+  return StructElement::Unknown;
+}
+
+// Read the entries of a StructElem dictionary into this element.
+// When the Type, P or S entry is malformed a syntax error is reported
+// and parsing stops early; the element type then keeps the value set by
+// the constructor (Unknown), so isOk() is false. The optional entries
+// ID, Pg, R, T, Lang, Alt, E and ActualText are stored when they have
+// the expected type.
+void StructElement::parse(Dict *element)
+{
+  Object obj;
+
+  // Type is optional, but if present must be StructElem
+  if (!element->lookup("Type", &obj)->isNull() && !obj.isName("StructElem")) {
+    error(errSyntaxError, -1, "Type of StructElem object is wrong");
+    obj.free();
+    return;
+  }
+  obj.free();
+
+  // Parent object reference (required).
+  if (!element->lookupNF("P", &s->parentRef)->isRef()) {
+    // Report the type of the object that was actually looked up; obj has
+    // already been freed at this point.
+    error(errSyntaxError, -1, "P object is wrong type ({0:s})", s->parentRef.getTypeName());
+    return;
+  }
+
+  // Check whether the S-type is valid for the top level
+  // element and create a node of the appropriate type.
+  if (!element->lookup("S", &obj)->isName()) {
+    error(errSyntaxError, -1, "S object is wrong type ({0:s})", obj.getTypeName());
+    obj.free();
+    return;
+  }
+
+  // Type name may not be standard, resolve through RoleMap first.
+  // NOTE(review): when the tree root has no RoleMap the type is left
+  // untouched here, and roleMapResolve() is still a placeholder.
+  if (treeRoot->getRoleMap()) {
+    Object resolvedName;
+    type = roleMapResolve(treeRoot->getRoleMap(), obj.getName(), NULL, &resolvedName);
+    // Release whatever the resolver stored in its out-parameter.
+    resolvedName.free();
+  }
+
+  obj.free();
+
+  // Object ID (optional), to be looked at the IDTree in the tree root.
+  if (element->lookup("ID", &obj)->isString()) {
+    s->id = obj.takeString();
+  }
+  obj.free();
+
+  // Page reference (optional) in which at least one of the child items
+  // is to be rendered in. Note: each element stores only the /Pg value
+  // contained by it, and StructElement::getPageRef() may look in parent
+  // elements to find the page where an element belongs.
+  element->lookupNF("Pg", &pageRef);
+
+  // Revision number (optional).
+  if (element->lookup("R", &obj)->isInt()) {
+    s->revision = obj.getInt();
+  }
+  obj.free();
+
+  // Element title (optional).
+  if (element->lookup("T", &obj)->isString()) {
+    s->title = obj.takeString();
+  }
+  obj.free();
+
+  // Language (optional).
+  if (element->lookup("Lang", &obj)->isString()) {
+    s->language = obj.takeString();
+  }
+  obj.free();
+
+  // Alternative text (optional).
+  if (element->lookup("Alt", &obj)->isString()) {
+    s->altText = obj.takeString();
+  }
+  obj.free();
+
+  // Expanded form of an abbreviation (optional).
+  if (element->lookup("E", &obj)->isString()) {
+    s->expandedAbbr = obj.takeString();
+  }
+  obj.free();
+
+  // Actual text (optional).
+  if (element->lookup("ActualText", &obj)->isString()) {
+    s->actualText = obj.takeString();
+  }
+  obj.free();
+
+  // TODO: Attributes directly attached to the element (optional).
+  // TODO: Attributes referenced indirectly through the ClassMap (optional).
+}
+
+// Turn one item of the /K entry into a child of this element.
+//   ref      - the item as stored (not resolved), used to tell whether
+//              it is an indirect reference and to identify it
+//   childObj - the same item, resolved
+//   seen     - object numbers of structure elements already visited,
+//              used to stop on loops
+// An integer becomes an MCID child, an MCR dictionary an MCID child with
+// an optional page reference, an OBJR dictionary an object-reference
+// child, and any other dictionary a nested structure element. Returns
+// the child that was appended, or NULL when nothing was added.
+StructElement *StructElement::parseChild(Object *ref,
+                                         Object *childObj,
+                                         std::set<int> &seen)
+{
+  assert(childObj);
+  assert(ref);
+
+  StructElement *child = NULL;
+
+  if (childObj->isInt()) {
+    child = new StructElement(childObj->getInt(), treeRoot, this);
+  } else if (childObj->isDict("MCR")) {
+    /*
+     * TODO: The optional Stm/StmOwn attributes are not handled, so all the
+     *      page will be always scanned when calling StructElement::getText().
+     */
+    Object mcidObj;
+    Object pageRefObj;
+
+    if (!childObj->dictLookup("MCID", &mcidObj)->isInt()) {
+      error(errSyntaxError, -1, "MCID object is wrong type ({0:s})", mcidObj.getTypeName());
+      mcidObj.free();
+      return NULL;
+    }
+
+    child = new StructElement(mcidObj.getInt(), treeRoot, this);
+    mcidObj.free();
+
+    // On success the child takes over pageRefObj, so it is only freed
+    // here when it is not a reference.
+    if (childObj->dictLookupNF("Pg", &pageRefObj)->isRef()) {
+      child->pageRef = pageRefObj;
+    } else {
+      pageRefObj.free();
+    }
+  } else if (childObj->isDict("OBJR")) {
+    Object refObj;
+
+    if (childObj->dictLookupNF("Obj", &refObj)->isRef()) {
+      Object pageRefObj;
+
+      child = new StructElement(refObj.getRef(), treeRoot, this);
+
+      if (childObj->dictLookupNF("Pg", &pageRefObj)->isRef()) {
+        child->pageRef = pageRefObj;
+      } else {
+        pageRefObj.free();
+      }
+    } else {
+      error(errSyntaxError, -1, "Obj object is wrong type ({0:s})", refObj.getTypeName());
+    }
+    refObj.free();
+  } else if (childObj->isDict()) {
+    if (!ref->isRef()) {
+      error(errSyntaxError, -1,
+            "Structure element dictionary is not an indirect reference ({0:s})",
+            ref->getTypeName());
+    } else if (seen.find(ref->getRefNum()) == seen.end()) {
+      seen.insert(ref->getRefNum());
+      child = new StructElement(childObj->getDict(), treeRoot, this, seen);
+    } else {
+      // Integers are formatted with 'd'; the generation number is
+      // argument 1, not a repeat of argument 0.
+      error(errSyntaxWarning, -1,
+            "Loop detected in structure tree, skipping subtree at object {0:d}:{1:d}",
+            ref->getRefNum(), ref->getRefGen());
+    }
+  } else {
+    error(errSyntaxWarning, -1, "K has a child of wrong type ({0:s})", childObj->getTypeName());
+  }
+
+  if (child) {
+    if (child->isOk()) {
+      appendElement(child);
+      if (ref->isRef())
+        treeRoot->parentTreeAdd(ref->getRef(), child);
+    } else {
+      delete child;
+      child = NULL;
+    }
+  }
+
+  return child;
+}
+
+// Walk the /K entry of a StructElem dictionary and hand every item to
+// parseChild(). /K may be an array of items, or a single dictionary or
+// integer; any other type is ignored.
+void StructElement::parseChildren(Dict *element, std::set<int> &seen)
+{
+  Object kidsObj;
+  element->lookup("K", &kidsObj);
+
+  if (kidsObj.isArray()) {
+    for (int idx = 0; idx < kidsObj.arrayGetLength(); idx++) {
+      // Each item is fetched twice: as stored, and resolved.
+      Object childObj, childRef;
+      kidsObj.arrayGetNF(idx, &childRef);
+      kidsObj.arrayGet(idx, &childObj);
+      parseChild(&childRef, &childObj, seen);
+      childObj.free();
+      childRef.free();
+    }
+  } else if (kidsObj.isDict() || kidsObj.isInt()) {
+    Object childRef;
+    element->lookupNF("K", &childRef);
+    parseChild(&childRef, &kidsObj, seen);
+    childRef.free();
+  }
+
+  kidsObj.free();
+}
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
new file mode 100644
index 0000000..d1997c9
--- /dev/null
+++ b/poppler/StructElement.h
@@ -0,0 +1,167 @@
+//========================================================================
+//
+// StructElement.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef STRUCTELEMENT_H
+#define STRUCTELEMENT_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "goo/gtypes.h"
+#include "goo/GooString.h"
+#include "Object.h"
+#include <vector>
+#include <set>
+
+class GooString;
+class Dict;
+class StructTreeRoot;
+
+
+// Node of the document structure tree. A node is either a structure
+// element parsed from a StructElem dictionary (its data lives in
+// StructData) or a content item -- a marked-content identifier (MCID)
+// or an object reference (OBJR) -- whose data lives in ContentData.
+// Instances are created only by StructElement itself and by
+// StructTreeRoot (the constructors are private).
+class StructElement {
+public:
+  enum Type {
+    Unknown = 0,
+    MCID,                                   // MCID reference, used internally
+    OBJR,                                   // Object reference, used internally
+
+    Document, Part, Art, Sect, Div,         // Structural elements
+
+    Span, Quote, Note, Reference, BibEntry, // Inline elements
+    Code, Link, Annot,
+    BlockQuote, Caption, NonStruct,
+    TOC, TOCI, Index, Private,
+
+    P, H, H1, H2, H3, H4, H5, H6,           // Paragraph-like
+
+    L, LI, Lbl,                             // List elements
+
+    Table, TR, TH, TD, THead, TFoot, TBody, // Table elements
+
+    Ruby, RB, RT, RP,                       // Ruby text elements
+    Warichu, WT, WP,
+
+    Figure, Formula, Form,                  // Illustration-like elements
+  };
+
+  static const Ref InvalidRef;
+
+  const char *getTypeName() const;
+  Type getType() const { return type; }
+  // An element whose type could not be determined is not usable.
+  GBool isOk() const { return type != Unknown; }
+
+  // Content items are MCID elements and OBJR elements holding a valid
+  // reference; everything else is treated as a structure element.
+  inline GBool isContent() const { return (type == MCID) || isObjectRef(); }
+  inline GBool isObjectRef() const { return (type == OBJR && c->ref.num != -1 && c->ref.gen != -1); }
+
+  // Only meaningful for MCID / OBJR elements respectively.
+  int getMCID() const { return c->mcid; }
+  Ref getObjectRef() const { return c->ref; }
+  // Content items report the parent reference of their parent element.
+  Ref getParentRef() { return isContent() ? parent->getParentRef() : s->parentRef.getRef(); }
+  GBool hasPageRef() const;
+  GBool getPageRef(Ref& ref) const;
+  StructTreeRoot *getStructTreeRoot() { return treeRoot; }
+
+  // Optional element identifier.
+  const GooString *getID() const { return isContent() ? NULL : s->id; }
+  GooString *getID() { return isContent() ? NULL : s->id; }
+
+  // Optional ISO language name, e.g. en_US
+  // Falls back to the parent chain when this element has none.
+  GooString *getLanguage() {
+    if (!isContent() && s->language) return s->language;
+    return parent ? parent->getLanguage() : NULL;
+  }
+  const GooString *getLanguage() const {
+    if (!isContent() && s->language) return s->language;
+    return parent ? parent->getLanguage() : NULL;
+  }
+
+  // Optional revision number, defaults to zero.
+  Guint getRevision() const { return isContent() ? 0 : s->revision; }
+  // Only structure elements carry a revision; content items have no
+  // StructData, so the call is ignored for them.
+  void setRevision(Guint revision) { if (!isContent()) s->revision = revision; }
+
+  // Optional element title, in human-readable form.
+  const GooString *getTitle() const { return isContent() ? NULL : s->title; }
+  GooString *getTitle() { return isContent() ? NULL : s->title; }
+
+  // Optional element expanded abbreviation text.
+  const GooString *getExpandedAbbr() const { return isContent() ? NULL : s->expandedAbbr; }
+  GooString *getExpandedAbbr() { return isContent() ? NULL : s->expandedAbbr; }
+
+  // Child access; content items have no children.
+  unsigned getNumElements() const { return isContent() ? 0 : s->elements.size(); }
+  const StructElement *getElement(int i) const { return isContent() ? NULL : s->elements.at(i); }
+  StructElement *getElement(int i) { return isContent() ? NULL : s->elements.at(i); }
+
+  // Adds a child; ignored for content items and for NULL or not-ok
+  // elements. Appended children are deleted by ~StructData().
+  void appendElement(StructElement *element) {
+    if (!isContent() && element && element->isOk()) {
+      s->elements.push_back(element);
+    }
+  }
+
+  const GooString *getAltText() const { return isContent() ? NULL : s->altText; }
+  GooString *getAltText() { return isContent() ? NULL : s->altText; }
+
+  const GooString *getActualText() const { return isContent() ? NULL : s->actualText; }
+  GooString *getActualText() { return isContent() ? NULL : s->actualText; }
+
+  ~StructElement();
+
+private:
+  typedef std::vector<StructElement*> ElemPtrArray;
+
+  // Data in structure elements (parsed from a StructElem dictionary)
+  struct StructData {
+    Object       parentRef;
+    GooString   *altText;
+    GooString   *actualText;
+    GooString   *id;
+    GooString   *title;
+    GooString   *expandedAbbr;
+    GooString   *language;
+    Guint        revision;
+    ElemPtrArray elements;
+
+    StructData();
+    ~StructData();
+  };
+
+  // Data in content elements (MCID, MCR)
+  struct ContentData {
+    union {
+      int mcid;
+      Ref ref;
+    };
+
+    ContentData(int mcidA): mcid(mcidA) {}
+    ContentData(const Ref& r) { ref.num = r.num; ref.gen = r.gen; }
+  };
+
+  // Common data
+  Type type;
+  StructTreeRoot *treeRoot;
+  StructElement *parent;
+  mutable Object pageRef;
+
+  // Which member is live depends on the element type: c for MCID and
+  // OBJR elements, s otherwise.
+  union {
+    StructData  *s;
+    ContentData *c;
+  };
+
+  StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA, std::set<int> &seen);
+  StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA);
+  StructElement(const Ref &ref, StructTreeRoot *treeRootA, StructElement *parentA);
+
+  void parse(Dict* elementDict);
+  StructElement* parseChild(Object *ref, Object* childObj, std::set<int> &seen);
+  void parseChildren(Dict* element, std::set<int> &seen);
+
+  friend class StructTreeRoot;
+};
+
+#endif
+
diff --git a/poppler/StructTreeRoot.cc b/poppler/StructTreeRoot.cc
new file mode 100644
index 0000000..59f017e
--- /dev/null
+++ b/poppler/StructTreeRoot.cc
@@ -0,0 +1,174 @@
+//========================================================================
+//
+// StructTreeRoot.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "goo/GooString.h"
+#include "StructTreeRoot.h"
+#include "StructElement.h"
+#include "PDFDoc.h"
+#include "Object.h"
+#include "Dict.h"
+#include <set>
+#include <assert.h>
+
+
+// Build the structure tree for a document from its StructTreeRoot
+// dictionary. Both arguments are required; the whole tree is parsed
+// right away by parse().
+StructTreeRoot::StructTreeRoot(PDFDoc *docA, Dict *structTreeRootDict):
+  doc(docA)
+{
+  assert(doc);
+  assert(structTreeRootDict);
+  parse(structTreeRootDict);
+}
+
+// Delete the top-level elements and release the ClassMap and RoleMap
+// objects.
+StructTreeRoot::~StructTreeRoot()
+{
+  for (ElemPtrArray::size_type n = 0; n < elements.size(); ++n) {
+    delete elements[n];
+  }
+  classMap.free();
+  roleMap.free();
+}
+
+// Parse the structure tree root dictionary: keep the RoleMap and
+// ClassMap, load the object references listed in the ParentTree's Nums
+// array, and build a StructElement for every child found under /K.
+// Malformed entries are reported through error() and skipped.
+void StructTreeRoot::parse(Dict *root)
+{
+  // The RoleMap/ClassMap dictionaries are needed by all the parsing
+  // functions, which will resolve the custom names to canonical
+  // standard names.
+  root->lookup("RoleMap", &roleMap);
+  root->lookup("ClassMap", &classMap);
+
+  // ParentTree (optional). If present, it must be a number tree,
+  // otherwise it is not possible to map stream objects to their
+  // corresponding structure element. Here only the references are
+  // loaded into the array, the pointers to the StructElements will
+  // be filled-in later when parsing them.
+  Object obj;
+  if (root->lookup("ParentTree", &obj)->isDict()) {
+    Object nums;
+    if (obj.dictLookup("Nums", &nums)->isArray()) {
+      if (nums.arrayGetLength() % 2 == 0) {
+        parentTree.resize(nums.arrayGetLength() / 2);
+        // Index numbers in even positions, references in odd ones
+        for (int i = 0; i < nums.arrayGetLength(); i += 2) {
+          Object index, value;
+
+          if (!nums.arrayGet(i, &index)->isInt()) {
+            error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i, index.getTypeName());
+            index.free();
+            continue;
+          }
+          if (index.getInt() < 0) {
+            error(errSyntaxError, -1, "Nums item at position {0:d} is invalid value ({1:d})", i, index.getInt());
+            index.free();
+            continue;
+          }
+
+          const unsigned idx = index.getInt();
+          // parentTree holds one slot per Nums pair. The key comes from
+          // the file, so it must be range-checked before it is used as
+          // an index into the vector.
+          if (idx >= parentTree.size()) {
+            error(errSyntaxError, -1, "Nums item at position {0:d} is out of range ({1:d})", i, index.getInt());
+            index.free();
+            continue;
+          }
+
+          if (nums.arrayGetNF(i + 1, &value)->isRef()) {
+            parentTree[idx].resize(1);
+            parentTree[idx][0].ref = value.getRef();
+          } else {
+            // Not a reference: release the copy fetched above before the
+            // same variable is reused for the resolved object.
+            value.free();
+            if (nums.arrayGet(i + 1, &value)->isArray()) {
+              parentTree[idx].resize(value.arrayGetLength());
+              for (int j = 0; j < value.arrayGetLength(); j++) {
+                Object itemvalue;
+                if (value.arrayGetNF(j, &itemvalue)->isRef())
+                  parentTree[idx][j].ref = itemvalue.getRef();
+                else
+                  error(errSyntaxError, -1, "Nums array item at position {0:d}/{1:d} is invalid type ({2:s})", i, j, itemvalue.getTypeName());
+                itemvalue.free();
+              }
+            } else {
+              error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i + 1, value.getTypeName());
+            }
+          }
+
+          value.free();
+          index.free();
+        }
+      } else {
+        // Integers are formatted with 'd'.
+        error(errSyntaxError, -1, "Nums array length is not a even ({0:d})", nums.arrayGetLength());
+      }
+    } else {
+      error(errSyntaxError, -1, "Nums object is wrong type ({0:s})", nums.getTypeName());
+    }
+    nums.free();
+  }
+  obj.free();
+
+  // Object numbers of the structure elements visited so far; shared
+  // with the StructElement constructors to stop on loops.
+  std::set<int> seenElements;
+
+  // Parse the children StructElements
+  const GBool marked = doc->getCatalog()->getMarkInfo() & Catalog::markInfoMarked;
+  Object kids;
+  if (root->lookup("K", &kids)->isArray()) {
+    if (marked && kids.arrayGetLength() > 1) {
+      error(errSyntaxWarning, -1, "K in StructTreeRoot has more than one children in a tagged PDF");
+    }
+    for (int i = 0; i < kids.arrayGetLength(); i++) {
+      Object obj, ref;
+      kids.arrayGetNF(i, &ref);
+      if (ref.isRef()) {
+        seenElements.insert(ref.getRefNum());
+      }
+      if (kids.arrayGet(i, &obj)->isDict()) {
+        StructElement *child = new StructElement(obj.getDict(), this, NULL, seenElements);
+        if (child->isOk()) {
+          if (marked && !(child->getType() == StructElement::Document ||
+                          child->getType() == StructElement::Part ||
+                          child->getType() == StructElement::Art ||
+                          child->getType() == StructElement::Div)) {
+            error(errSyntaxWarning, -1, "StructTreeRoot element of tagged PDF is wrong type ({0:s})", child->getTypeName());
+          }
+          appendElement(child);
+          if (ref.isRef()) {
+            parentTreeAdd(ref.getRef(), child);
+          }
+        } else {
+          error(errSyntaxWarning, -1, "StructTreeRoot element could not be parsed");
+          delete child;
+        }
+      } else {
+        error(errSyntaxWarning, -1, "K has a child of wrong type ({0:s})", obj.getTypeName());
+      }
+      obj.free();
+      ref.free();
+    }
+  } else if (kids.isDict()) {
+    if (marked) {
+      error(errSyntaxWarning, -1, "K has a child of wrong type for a tagged PDF ({0:s})", kids.getTypeName());
+    }
+    // Record the child as seen before descending into it, as the array
+    // branch above does, so a reference back to it is detected as a loop.
+    Object ref;
+    if (root->lookupNF("K", &ref)->isRef()) {
+      seenElements.insert(ref.getRefNum());
+    }
+    StructElement *child = new StructElement(kids.getDict(), this, NULL, seenElements);
+    if (child->isOk()) {
+      appendElement(child);
+      if (ref.isRef())
+        parentTreeAdd(ref.getRef(), child);
+    } else {
+      error(errSyntaxWarning, -1, "StructTreeRoot element could not be parsed");
+      delete child;
+    }
+    ref.free();
+  } else if (!kids.isNull()) {
+    error(errSyntaxWarning, -1, "K in StructTreeRoot is wrong type ({0:s})", kids.getTypeName());
+  }
+
+  kids.free();
+}
+
+// Attach `element` to every ParentTree entry whose stored reference
+// equals `objectRef` (same object and generation number). Entries that
+// do not match are left untouched.
+void StructTreeRoot::parentTreeAdd(const Ref &objectRef, StructElement *element)
+{
+  for (size_t row = 0; row < parentTree.size(); ++row) {
+    std::vector<Parent> &entries = parentTree[row];
+    for (size_t col = 0; col < entries.size(); ++col) {
+      Parent &entry = entries[col];
+      if (entry.ref.num != objectRef.num || entry.ref.gen != objectRef.gen)
+        continue;
+      entry.element = element;
+    }
+  }
+}
diff --git a/poppler/StructTreeRoot.h b/poppler/StructTreeRoot.h
new file mode 100644
index 0000000..9928e2f
--- /dev/null
+++ b/poppler/StructTreeRoot.h
@@ -0,0 +1,83 @@
+//========================================================================
+//
+// StructTreeRoot.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef STRUCTTREEROOT_H
+#define STRUCTTREEROOT_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "goo/gtypes.h"
+#include "Object.h"
+#include "StructElement.h"
+#include <vector>
+
+class Dict;
+class PDFDoc;
+
+
+// In-memory form of a document's structure tree root: the RoleMap and
+// ClassMap objects, the top-level structure elements, and the table
+// built from the ParentTree.
+class StructTreeRoot
+{
+public:
+  StructTreeRoot(PDFDoc *docA, Dict *rootDict);
+  ~StructTreeRoot();
+
+  PDFDoc *getDoc() { return doc; }
+  // Both return NULL when the stored object is not a dictionary.
+  Dict *getRoleMap() { return roleMap.isDict() ? roleMap.getDict() : NULL; }
+  Dict *getClassMap() { return classMap.isDict() ? classMap.getDict() : NULL; }
+  // Top-level elements; getElement() uses vector::at() for the lookup.
+  unsigned getNumElements() const { return elements.size(); }
+  const StructElement *getElement(int i) const { return elements.at(i); }
+  StructElement *getElement(int i) { return elements.at(i); }
+
+  // Adds a top-level element; NULL and not-ok elements are ignored.
+  // Appended elements are deleted by the destructor.
+  void appendElement(StructElement *element) {
+    if (element && element->isOk()) {
+      elements.push_back(element);
+    }
+  }
+
+  // Element recorded for the given ParentTree index. Returns NULL when
+  // the index is out of range or when that slot does not hold exactly
+  // one entry.
+  const StructElement *findParentElement(unsigned index) const {
+    if (index < parentTree.size() && parentTree[index].size() == 1) {
+      return parentTree[index][0].element;
+    }
+    return NULL;
+  }
+
+private:
+  typedef std::vector<StructElement*> ElemPtrArray;
+
+  // Structure for items in /ParentTree, it keeps a mapping of
+  // object references and pointers to StructElement objects.
+  struct Parent {
+    Ref            ref;
+    StructElement *element;
+
+    // A fresh entry has no element and the reference -1/-1.
+    Parent(): element(NULL) { ref.num = ref.gen = -1; }
+    Parent(const Parent &p): element(p.element) {
+      ref.num = p.ref.num;
+      ref.gen = p.ref.gen;
+    }
+    ~Parent() {}
+  };
+
+  PDFDoc *doc;
+  Object roleMap;
+  Object classMap;
+  ElemPtrArray elements;
+  // One slot per ParentTree index; each slot holds one or more entries.
+  std::vector< std::vector<Parent> > parentTree;
+
+  void parse(Dict *rootDict);
+  void parentTreeAdd(const Ref &objectRef, StructElement *element);
+
+  friend class StructElement;
+};
+
+#endif
+


More information about the poppler mailing list