[poppler] [PATCH 2/6] Tagged-PDF: Interpret the document structure

Adrian Perez aperez at igalia.com
Wed May 29 16:47:27 PDT 2013


From: Adrian Perez de Castro <aperez at igalia.com>

Starting from StructTreeRoot, this recursively creates a tree of StructTreeNode
objects representing the structure of the document. The biggest missing
pieces are:

- Presenting more information in StructTreeNode, the most important
	part being reading the structure and attribute dictionaries for
	the elements in the tree.
- Resolving marked-content identifiers which refer to information
	stored in page object streams.
- Creating a synthetic tree when the PDF is not tagged to use as
	fall-back.
---
 poppler/Catalog.cc        |   36 +-
 poppler/Catalog.h         |    5 +-
 poppler/MCOutputDev.cc    |  145 +++++
 poppler/MCOutputDev.h     |  108 ++++
 poppler/Makefile.am       |    6 +
 poppler/PDFDoc.h          |    3 +-
 poppler/StructElement.cc  | 1361 +++++++++++++++++++++++++++++++++++++++++++++
 poppler/StructElement.h   |  273 +++++++++
 poppler/StructTreeRoot.cc |  120 ++++
 poppler/StructTreeRoot.h  |   56 ++
 10 files changed, 2095 insertions(+), 18 deletions(-)
 create mode 100644 poppler/MCOutputDev.cc
 create mode 100644 poppler/MCOutputDev.h
 create mode 100644 poppler/StructElement.cc
 create mode 100644 poppler/StructElement.h
 create mode 100644 poppler/StructTreeRoot.cc
 create mode 100644 poppler/StructTreeRoot.h

diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc
index c365e06..f24f8a3 100644
--- a/poppler/Catalog.cc
+++ b/poppler/Catalog.cc
@@ -55,6 +55,7 @@
 #include "OptionalContent.h"
 #include "ViewerPreferences.h"
 #include "FileSpec.h"
+#include "StructTreeRoot.h"
 
 #if MULTITHREADED
 #  define catalogLocker()   MutexLocker locker(&mutex)
@@ -90,6 +91,7 @@ Catalog::Catalog(PDFDoc *docA) {
   embeddedFileNameTree = NULL;
   jsNameTree = NULL;
   viewerPrefs = NULL;
+  structTreeRoot = NULL;
 
   pagesList = NULL;
   pagesRefList = NULL;
@@ -175,8 +177,8 @@ Catalog::~Catalog() {
   delete form;
   delete optContent;
   delete viewerPrefs;
+  delete structTreeRoot;
   metadata.free();
-  structTreeRoot.free();
   outline.free();
   acroForm.free();
   viewerPreferences.free();
@@ -837,24 +839,28 @@ PageLabelInfo *Catalog::getPageLabelInfo()
   return pageLabelInfo;
 }
 
-Object *Catalog::getStructTreeRoot()
+StructTreeRoot *Catalog::getStructTreeRoot()
 {
   catalogLocker();
-  if (structTreeRoot.isNone())
-  {
-     Object catDict;
+  if (!structTreeRoot) {
+    Object catalog;
+    Object root;
 
-     xref->getCatalog(&catDict);
-     if (catDict.isDict()) {
-       catDict.dictLookup("StructTreeRoot", &structTreeRoot);
-     } else {
-       error(errSyntaxError, -1, "Catalog object is wrong type ({0:s})", catDict.getTypeName());
-       structTreeRoot.initNull();
-     }
-     catDict.free();
+    xref->getCatalog(&catalog);
+    if (!catalog.isDict()) {
+      error(errSyntaxError, -1, "Catalog object is wrong type ({0:s})", catalog.getTypeName());
+      catalog.free();
+      return NULL;
+    }
+
+    if (catalog.dictLookup("StructTreeRoot", &root)->isDict("StructTreeRoot")) {
+      structTreeRoot = new StructTreeRoot(doc, root.getDict(), getMarkInfo() & markInfoMarked);
+    }
+
+    root.free();
+    catalog.free();
   }
-
-  return &structTreeRoot;
+  return structTreeRoot;
 }
 
 Guint Catalog::getMarkInfo()
diff --git a/poppler/Catalog.h b/poppler/Catalog.h
index 35b4f87..bdba3ce 100644
--- a/poppler/Catalog.h
+++ b/poppler/Catalog.h
@@ -53,6 +53,7 @@ class Form;
 class OCGs;
 class ViewerPreferences;
 class FileSpec;
+class StructTreeRoot;
 
 //------------------------------------------------------------------------
 // NameTree
@@ -123,7 +124,7 @@ public:
   GooString *readMetadata();
 
   // Return the structure tree root object.
-  Object *getStructTreeRoot();
+  StructTreeRoot* getStructTreeRoot();
 
   // Return values from the MarkInfo dictionary as flags in a bitfield.
   enum MarkInfoFlags {
@@ -227,8 +228,8 @@ private:
   NameTree *jsNameTree;		// Java Script name-tree
   GooString *baseURI;		// base URI for URI-type links
   Object metadata;		// metadata stream
-  Object structTreeRoot;	// structure tree root dictionary
   int markInfo;                 // Flags from MarkInfo dictionary
+  StructTreeRoot *structTreeRoot;	// structure tree root
   Object outline;		// outline dictionary
   Object acroForm;		// AcroForm dictionary
   Object viewerPreferences;     // ViewerPreference dictionary
diff --git a/poppler/MCOutputDev.cc b/poppler/MCOutputDev.cc
new file mode 100644
index 0000000..e593c78
--- /dev/null
+++ b/poppler/MCOutputDev.cc
@@ -0,0 +1,145 @@
+//========================================================================
+//
+// MCOutputDev.cc
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#include "MCOutputDev.h"
+#include "GfxFont.h"
+#include "GfxState.h"
+#include "Annot.h"
+#include "Link.h"
+#include <vector>
+
+struct MCOutputDev::Priv
+{
+  MCOpArray commands;
+  bool      capturing;
+  int       mcid;
+  GfxFont  *lastFont;
+  Guint     lastFlags;
+  double    pageWidth;
+  double    pageHeight;
+
+  Priv(int mcidA):
+    commands(),
+    capturing(false),
+    mcid(mcidA),
+    lastFont(0),
+    lastFlags(0),
+    pageWidth(0.0),
+    pageHeight(0.0)
+  {}
+};
+
+
+MCOutputDev::MCOutputDev(int mcid):
+  p(new Priv(mcid))
+{
+}
+
+// Frees the private state, first dropping the font reference taken in updateFont().
+MCOutputDev::~MCOutputDev()
+{
+  if (p->lastFont) p->lastFont->decRefCnt();
+  delete p;
+}
+
+void MCOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
+{
+  // Remember the page extent so drawChar() can discard off-page glyphs;
+  // without a graphics state the extent collapses to zero.
+  const bool haveState = (state != NULL);
+
+  p->pageWidth  = haveState ? state->getPageWidth()  : 0.0;
+  p->pageHeight = haveState ? state->getPageHeight() : 0.0;
+}
+
+
+void MCOutputDev::endPage()
+{
+  p->pageWidth = p->pageHeight = 0.0;
+}
+
+
+void MCOutputDev::beginMarkedContent(char *name, Dict *properties)
+{
+  int id = -1;
+  if (properties && properties->lookupInt("MCID", NULL, &id) && id == p->mcid)
+    p->capturing = true;
+}
+
+// NOTE(review): stops capturing on every end-of-marked-content, regardless of which sequence is ending.
+void MCOutputDev::endMarkedContent(GfxState *state)
+{
+  p->capturing = false;
+}
+
+
+void MCOutputDev::drawChar(GfxState *state,
+                           double xx, double yy,
+                           double dx, double dy,
+                           double ox, double oy,
+                           CharCode c, int nBytes,
+                           Unicode *u, int uLen)
+{
+  if (!p->capturing || !uLen)
+    return;
+
+  double sp, dx2, dy2, w1, h1, x1, y1;
+
+  // Subtract char and word spacing from the (dx,dy) values
+  sp = state->getCharSpace();
+  if (c == (CharCode) 0x20)
+    sp += state->getWordSpace();
+  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+  dx -= dx2;
+  dy -= dy2;
+  state->transformDelta(dx, dy, &w1, &h1);
+  state->transform(xx, yy, &x1, &y1);
+
+  // Throw away characters that are not inside the page boundaries.
+  if (x1 + w1 < 0 || x1 > p->pageWidth || y1 + h1 < 0 || y1 > p->pageHeight)
+    return;
+
+  // Make a sanity check on character size. Note: (x != x) <-> isnan(x)
+  if (x1 != x1 || y1 != y1 || w1 != w1 || h1 != h1)
+    return;
+
+  for (int i = 0; i < uLen; i++)
+    p->commands.push_back(MCOp(u[i]));
+}
+
+// Tracks the current font; while capturing, records family-name and bold/italic/fixed flag changes as MCOps.
+void MCOutputDev::updateFont(GfxState *state)
+{
+  GfxFont *font = state->getFont();
+  if (!font || font == p->lastFont) return;
+  // The family changed when only one side has a family, or when both have one and the names differ.
+  if (!p->lastFont || !p->lastFont->getFamily() != !font->getFamily() ||
+      (font->getFamily() && p->lastFont->getFamily()->cmp(font->getFamily()))) {
+    if (p->capturing && font->getFamily())
+      p->commands.push_back(MCOp(mcOpFontName, font->getFamily()->getCString()));
+    if (p->lastFont) p->lastFont->decRefCnt();
+    p->lastFont = font;
+    font->incRefCnt();
+  }
+
+  Guint flags = 0;
+  if (font->isBold()) flags |= mcOpFlagFontBold;
+  if (font->isItalic()) flags |= mcOpFlagFontItalic;
+  if (font->isFixedWidth()) flags |= mcOpFlagFontFixed;
+  if (p->lastFlags != flags) {
+    if (p->capturing)
+      p->commands.push_back(MCOp(mcOpFlags, flags));
+    p->lastFlags = flags;
+  }
+}
+
+
+const MCOpArray& MCOutputDev::getMCOps() const
+{
+  return p->commands;
+}
diff --git a/poppler/MCOutputDev.h b/poppler/MCOutputDev.h
new file mode 100644
index 0000000..cd7c4f5
--- /dev/null
+++ b/poppler/MCOutputDev.h
@@ -0,0 +1,108 @@
+//========================================================================
+//
+// MCOutputDev.h
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef MCDOUTPUTDEV_H
+#define MCDOUTPUTDEV_H
+
+#include "goo/gtypes.h"
+#include "goo/gmem.h"
+#include "OutputDev.h"
+#include <vector>
+
+class GfxState;
+class GooString;
+class Dict;
+
+
+enum MCOpType {
+  mcOpUnichar,
+  mcOpFontName,
+  mcOpFlags,
+  mcOpColor,
+};
+
+enum MCOpFlags {
+  mcOpFlagFontBold   = (1 << 0),
+  mcOpFlagFontItalic = (1 << 1),
+  mcOpFlagFontFixed  = (1 << 2),
+};
+
+struct MCColor {
+  double r, g, b;
+
+  Guint rgbPixel() const {
+    return ((Guint) (r * 255) & 0xFF) << 16
+         | ((Guint) (g * 255) & 0xFF) << 8
+         | ((Guint) (b * 255) & 0xFF);
+  }
+};
+// One captured operation; when type is mcOpFontName, `value` is an owned strdup() copy (or NULL).
+struct MCOp {
+  MCOpType type;
+  union {
+    Unicode unichar;
+    char   *value;
+    Guint   flags;
+    MCColor color;
+  };
+  // NOTE(review): no copy-assignment operator is declared here; assigning MCOps would share `value`.
+  MCOp(const MCOp& op): type(op.type) {
+    switch (type) {
+      case mcOpFlags: flags = op.flags; break;
+      case mcOpUnichar: unichar = op.unichar; break;
+      case mcOpFontName: value = op.value ? strdup(op.value) : NULL; break; // strdup(NULL) is undefined
+      case mcOpColor: memcpy(&color, &op.color, sizeof(MCColor)); break;
+    }
+  }
+  MCOp(): type(mcOpFontName), value(NULL) {}
+  MCOp(Unicode u): type(mcOpUnichar), unichar(u) {}
+  MCOp(MCOpType t, Guint f): type(t), flags(f) {}
+  MCOp(MCOpType t, const char *s = NULL): type(t), value(s ? strdup(s) : NULL) {}
+  ~MCOp() { if (type == mcOpFontName) gfree(value); }
+};
+
+
+typedef std::vector<MCOp> MCOpArray;
+
+
+class MCOutputDev: public OutputDev {
+public:
+  MCOutputDev(int mcid);
+  virtual ~MCOutputDev();
+
+  virtual GBool isOk() { return gTrue; }
+  virtual GBool upsideDown() { return gTrue; }
+  virtual GBool useDrawChar() { return gTrue; }
+  virtual GBool interpretType3Chars() { return gFalse; }
+  virtual GBool needNonText() { return gFalse; }
+  virtual GBool needCharCount() { return gFalse; }
+
+  virtual void startPage(int pageNum, GfxState *state, XRef *xref);
+  virtual void endPage();
+
+  virtual void restoreState(GfxState *state) { updateFont(state); }
+  virtual void updateFont(GfxState *state);
+
+  virtual void drawChar(GfxState *state,
+                        double xx, double yy,
+                        double dx, double dy,
+                        double ox, double oy,
+                        CharCode c, int nBytes,
+                        Unicode *u, int uLen);
+
+  virtual void beginMarkedContent(char *name, Dict *properties);
+  virtual void endMarkedContent(GfxState *state);
+
+  const MCOpArray& getMCOps() const;
+
+private:
+  struct Priv;
+  Priv *p;
+};
+
+#endif /* !MCDOUTPUTDEV_H */
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index ac51d05..eaff39d 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -236,6 +236,8 @@ poppler_include_HEADERS =	\
 	StdinPDFDocBuilder.h	\
 	Stream-CCITT.h		\
 	Stream.h		\
+	StructElement.h		\
+	StructTreeRoot.h	\
 	UnicodeMap.h		\
 	UnicodeMapTables.h	\
 	UnicodeTypeTable.h	\
@@ -250,6 +252,7 @@ poppler_include_HEADERS =	\
 	NameToUnicodeTable.h	\
 	PSOutputDev.h		\
 	TextOutputDev.h		\
+	MCOutputDev.h		\
 	SecurityHandler.h	\
 	UTF.h			\
 	UTF8.h			\
@@ -315,6 +318,8 @@ libpoppler_la_SOURCES =		\
 	StdinCachedFile.cc	\
 	StdinPDFDocBuilder.cc	\
 	Stream.cc 		\
+	StructElement.cc	\
+	StructTreeRoot.cc	\
 	strtok_r.cpp		\
 	UnicodeMap.cc		\
 	UnicodeTypeTable.cc	\
@@ -323,6 +328,7 @@ libpoppler_la_SOURCES =		\
 	XRef.cc			\
 	PSOutputDev.cc		\
 	TextOutputDev.cc	\
+	MCOutputDev.cc		\
 	PageLabelInfo.h		\
 	PageLabelInfo.cc	\
 	SecurityHandler.cc	\
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index da9bf5b..48189bc 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -60,6 +60,7 @@ class Outline;
 class Linearization;
 class SecurityHandler;
 class Hints;
+class StructTreeRoot;
 
 enum PDFWriteMode {
   writeStandard,
@@ -139,7 +140,7 @@ public:
   GooString *readMetadata() { return catalog->readMetadata(); }
 
   // Return the structure tree root object.
-  Object *getStructTreeRoot() { return catalog->getStructTreeRoot(); }
+  StructTreeRoot *getStructTreeRoot() { return catalog->getStructTreeRoot(); }
 
   // Get page.
   Page *getPage(int page);
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
new file mode 100644
index 0000000..c99c9fa
--- /dev/null
+++ b/poppler/StructElement.cc
@@ -0,0 +1,1361 @@
+//========================================================================
+//
+// StructElement.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifdef USE_GCC_PRAGMAS
+// A .cc file supplies the implementation; "#pragma interface" belongs in the header.
+#pragma implementation
+#endif
+#include "StructElement.h"
+#include "StructTreeRoot.h"
+#include "TextOutputDev.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
+#include "PDFDoc.h"
+#include "Dict.h"
+
+#include <assert.h>
+
+class GfxState;
+
+
+static GBool isPlacementName(Object* value)
+{
+  return value->isName("Block")
+      || value->isName("Inline")
+      || value->isName("Before")
+      || value->isName("Start")
+      || value->isName("End");
+}
+
+static GBool isWritingModeName(Object* value)
+{
+  return value->isName("LrTb")
+      || value->isName("RlTb")
+      || value->isName("TbRl");
+}
+
+static GBool isBorderStyleName(Object* value)
+{
+  return value->isName("None")
+      || value->isName("Hidden")
+      || value->isName("Dotted")
+      || value->isName("Dashed")
+      || value->isName("Solid")
+      || value->isName("Double")
+      || value->isName("Groove")
+      || value->isName("Ridge")
+      || value->isName("Inset")
+      || value->isName("Outset");
+}
+
+static GBool isTextAlignName(Object* value)
+{
+  return value->isName("Start")
+      || value->isName("End")
+      || value->isName("Center")
+      || value->isName("Justify");
+}
+
+static GBool isBlockAlignName(Object* value)
+{
+  return value->isName("Before")
+      || value->isName("Middle")
+      || value->isName("After")
+      || value->isName("Justify");
+}
+
+static GBool isInlineAlignName(Object* value)
+{
+  return value->isName("Start")
+      || value->isName("End")
+      || value->isName("Center");
+}
+
+static GBool isNumber(Object* value);
+
+static GBool isLineHeight(Object* value)
+{
+  return value->isName("Normal")
+      || value->isName("Auto")
+      || isNumber(value);
+}
+
+static GBool isTextDecorationName(Object* value)
+{
+  return value->isName("None")
+      || value->isName("Underline")
+      || value->isName("Overline")
+      || value->isName("LineThrough");
+}
+
+static GBool isRubyAlignName(Object* value)
+{
+  return value->isName("Start")
+      || value->isName("End")
+      || value->isName("Center")
+      || value->isName("Justify")
+      || value->isName("Distribute");
+}
+
+static GBool isRubyPositionName(Object* value)
+{
+  return value->isName("Before")
+      || value->isName("After")
+      || value->isName("Warichu")
+      || value->isName("Inline");
+}
+
+static GBool isGlyphOrientationName(Object* value)
+{
+  return value->isName("Auto")
+      || value->isName("90")
+      || value->isName("180")
+      || value->isName("270")
+      || value->isName("360")
+      || value->isName("-90")
+      || value->isName("-180");
+}
+
+static GBool isListNumberingName(Object* value)
+{
+  return value->isName("None")
+      || value->isName("Disc")
+      || value->isName("Circle")
+      || value->isName("Square")
+      || value->isName("Decimal")
+      || value->isName("UpperRoman")
+      || value->isName("LowerRoman")
+      || value->isName("UpperAlpha")
+      || value->isName("LowerAlpha");
+}
+
+static GBool isFieldRoleName(Object* value)
+{
+  return value->isName("rb")
+      || value->isName("cb")
+      || value->isName("pb")
+      || value->isName("tv");
+}
+
+static GBool isFieldCheckedName(Object* value)
+{
+  return value->isName("on")
+      || value->isName("off")
+      || value->isName("neutral");
+}
+
+static GBool isTableScopeName(Object* value)
+{
+  return value->isName("Row")
+      || value->isName("Column")
+      || value->isName("Both");
+}
+
+static GBool isRGBColor(Object* value)
+{
+  // An RGB colour is an array of exactly three components.
+  if (!value->isArray() || value->arrayGetLength() != 3)
+    return gFalse;
+
+  for (int idx = 0; idx < 3; idx++) {
+    Object component;
+    value->arrayGet(idx, &component);
+
+    // A component is rejected when it is not a number, or lies below 0 or above 1.
+    GBool valid = component.isNum();
+    if (valid) {
+      const double level = component.getNum();
+      valid = !(level < 0.0 || level > 1.0);
+    }
+
+    component.free();
+    if (!valid)
+      return gFalse;
+  }
+  return gTrue;
+}
+
+static GBool isNatural(Object* value)
+{
+  return (value->isInt()   && value->getInt()   > 0)
+      || (value->isInt64() && value->getInt64() > 0);
+}
+
+static GBool isPositive(Object* value)
+{
+  return value->isNum() && value->getNum() >= 0.0;
+}
+
+static GBool isNumber(Object* value)
+{
+  return value->isNum();
+}
+
+static GBool isNumber_or_AutoName(Object* value)
+{
+  return isNumber(value) || value->isName("Auto");
+}
+
+static GBool isTextString(Object* value)
+{
+  // XXX: Shall isName() also be checked?
+  return value->isString();
+}
+
+// Defines checker `name`: validates an array (or, if allowSingle, a lone value) item by item with checkItem.
+#define ARRAY_CHECKER(name, checkItem, length, allowSingle, allowNulls) \
+    static GBool name(Object* value) {                                  \
+      if (!value->isArray())                                            \
+        return allowSingle ? checkItem(value) : gFalse;                 \
+      if (length && value->arrayGetLength() != length)                  \
+        return gFalse;                                                  \
+                                                                        \
+      GBool okay = gTrue;                                               \
+      for (int i = 0; i < value->arrayGetLength(); i++) {               \
+        Object obj;                                                     \
+        value->arrayGet(i, &obj);                                       \
+        /* A null item is judged by allowNulls alone, not checkItem. */ \
+        if (obj.isNull() ? !allowNulls : !checkItem(&obj)) {            \
+          okay = gFalse;                                                \
+          obj.free();                                                   \
+          break;                                                        \
+        }                                                               \
+        obj.free();                                                     \
+      }                                                                 \
+      return okay;                                                      \
+    }
+
+ARRAY_CHECKER(isRGBColor_or_OptX4, isRGBColor,        4, gTrue,  gTrue );
+ARRAY_CHECKER(isPositive_or_OptX4, isPositive,        4, gTrue,  gTrue );
+ARRAY_CHECKER(isPositive_or_X4,    isPositive,        4, gTrue,  gFalse);
+ARRAY_CHECKER(isBorderStyle,       isBorderStyleName, 4, gTrue,  gTrue );
+ARRAY_CHECKER(isNumber_X4,         isNumber,          4, gFalse, gFalse);
+ARRAY_CHECKER(isNumber_or_Xn,      isNumber,          0, gTrue,  gFalse);
+ARRAY_CHECKER(isTableHeaders,      isTextString,      0, gFalse, gFalse);
+
+
+// Type of functions used to do type-checking on attribute values
+typedef GBool (*AttributeCheckFunc)(Object*);
+
+// Maps attributes to their names and whether the attribute can be inherited.
+struct AttributeMapEntry {
+  Attribute::Type    type;
+  const char*        name;
+  const Object*      defval;
+  GBool              inherit;
+  AttributeCheckFunc check;
+};
+
+struct AttributeDefaults {
+  Object Inline;
+  Object LrTb;
+  Object Normal;
+  Object Distribute;
+  Object off;
+  Object Zero;
+  Object Auto;
+  Object Start;
+  Object None;
+  Object Before;
+  Object Nat1;
+
+  AttributeDefaults() {
+    Inline.initName("Inline");
+    LrTb.initName("LrTb");
+    Normal.initName("Normal");
+    Distribute.initName("Distribute");
+    off.initName("off");
+
+    Zero.initReal(0.0);
+    Auto.initName("Auto");
+    Start.initName("Start");
+    None.initName("None");
+    Before.initName("Before");
+    Nat1.initInt(1);
+  }
+};
+
+static const AttributeDefaults attributeDefaults;
+
+
+#define ATTR_LIST_END      { Attribute::Unknown, NULL, NULL, gFalse, NULL }
+#define ATTR_D(x, i, c, v) { Attribute::x, #x, &attributeDefaults.v, i, c }
+#define ATTR_N(x, i, c)    { Attribute::x, #x, NULL, i, c }
+
+static const AttributeMapEntry attributeMapCommonShared[] =
+{
+  ATTR_D(Placement,       gFalse, isPlacementName, Inline),
+  ATTR_D(WritingMode,     gFalse, isWritingModeName, LrTb),
+  ATTR_N(BackgroundColor, gFalse, isRGBColor),
+  ATTR_N(BorderColor,     gTrue,  isRGBColor_or_OptX4),
+  ATTR_D(BorderStyle,     gFalse, isBorderStyle, None),
+  ATTR_N(BorderThickness, gTrue,  isPositive_or_OptX4),
+  ATTR_D(Padding,         gFalse, isPositive_or_X4, Zero),
+  ATTR_N(Color,           gTrue,  isRGBColor),
+  ATTR_LIST_END
+};
+
+static const AttributeMapEntry attributeMapCommonBlock[] =
+{
+  ATTR_D(SpaceBefore, gFalse, isPositive, Zero),
+  ATTR_D(SpaceAfter,  gFalse, isPositive, Zero),
+  ATTR_D(StartIndent, gTrue,  isNumber,   Zero),
+  ATTR_D(EndIndent,   gTrue,  isNumber,   Zero),
+  ATTR_D(TextIndent,  gTrue,  isNumber,   Zero),
+  ATTR_D(TextAlign,   gTrue,  isTextAlignName, Start),
+  ATTR_N(BBox,        gFalse, isNumber_X4),
+  ATTR_D(Width,       gFalse, isNumber_or_AutoName, Auto),
+  ATTR_D(Height,      gFalse, isNumber_or_AutoName, Auto),
+  ATTR_D(BlockAlign,  gTrue,  isBlockAlignName, Before),
+  ATTR_D(InlineAlign, gTrue,  isInlineAlignName, Start),
+  ATTR_LIST_END
+};
+
+static const AttributeMapEntry attributeMapCommonInline[] =
+{
+  ATTR_D(BaselineShift,            gFalse, isNumber, Zero),
+  ATTR_D(LineHeight,               gTrue,  isLineHeight, Normal),
+  ATTR_N(TextDecorationColor,      gTrue,  isRGBColor),
+  ATTR_N(TextDecorationThickness,  gTrue,  isPositive),
+  ATTR_D(TextDecorationType,       gFalse, isTextDecorationName, None),
+  ATTR_D(GlyphOrientationVertical, gTrue,  isGlyphOrientationName, Auto),
+  ATTR_LIST_END
+};
+
+static const AttributeMapEntry attributeMapCommonRubyText[] =
+{
+  ATTR_D(RubyPosition, gTrue, isRubyPositionName, Before),
+  ATTR_D(RubyAlign,    gTrue, isRubyAlignName, Distribute),
+  ATTR_LIST_END
+};
+
+static const AttributeMapEntry attributeMapCommonColumns[] =
+{
+  ATTR_D(ColumnCount,  gFalse, isNatural, Nat1),
+  ATTR_N(ColumnGap,    gFalse, isNumber_or_Xn),
+  ATTR_N(ColumnWidths, gFalse, isNumber_or_Xn),
+  ATTR_LIST_END
+};
+
+static const AttributeMapEntry attributeMapCommonList[] = {
+  ATTR_D(ListNumbering, gFalse, isListNumberingName, None),
+  ATTR_LIST_END
+};
+
+static const AttributeMapEntry attributeMapCommonPrintField[] =
+{
+  ATTR_N(Role,    gFalse, isFieldRoleName),
+  ATTR_D(checked, gFalse, isFieldCheckedName, off),
+  ATTR_N(Desc,    gFalse, isTextString),
+  ATTR_LIST_END
+};
+
+static const AttributeMapEntry attributeMapCommonTable[] =
+{
+  ATTR_N(Headers, gFalse, isTableHeaders),
+  ATTR_N(Scope,   gFalse, isTableScopeName),
+  ATTR_N(Summary, gFalse, isTextString),
+  ATTR_LIST_END
+};
+
+static const AttributeMapEntry attributeMapCommonTableCell[] =
+{
+  ATTR_D(RowSpan,      gFalse, isNatural, Nat1),
+  ATTR_D(ColSpan,      gFalse, isNatural, Nat1),
+  ATTR_D(TBorderStyle, gTrue,  isBorderStyle, None),
+  ATTR_D(TPadding,     gTrue,  isPositive_or_X4, Zero),
+  ATTR_LIST_END
+};
+
+
+static const AttributeMapEntry* attributeMapAll[] = {
+  attributeMapCommonShared,
+  attributeMapCommonBlock,
+  attributeMapCommonInline,
+  attributeMapCommonRubyText,
+  attributeMapCommonColumns,
+  attributeMapCommonList,
+  attributeMapCommonPrintField,
+  attributeMapCommonTable,
+  attributeMapCommonTableCell,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapShared[] = {
+  attributeMapCommonShared,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapBlock[] = {
+  attributeMapCommonShared,
+  attributeMapCommonBlock,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapInline[] = {
+  attributeMapCommonShared,
+  attributeMapCommonInline,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapTableCell[] = {
+  attributeMapCommonShared,
+  attributeMapCommonBlock,
+  attributeMapCommonTable,
+  attributeMapCommonTableCell,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapRubyText[] = {
+  attributeMapCommonShared,
+  attributeMapCommonInline,
+  attributeMapCommonRubyText,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapColumns[] = {
+  attributeMapCommonShared,
+  attributeMapCommonInline,
+  attributeMapCommonColumns,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapList[] = {
+  attributeMapCommonShared,
+  attributeMapCommonList,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapPrintField[] = {
+  attributeMapCommonShared,
+  attributeMapCommonPrintField,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapTable[] = {
+  attributeMapCommonShared,
+  attributeMapCommonBlock,
+  attributeMapCommonTable,
+  NULL,
+};
+
+static const AttributeMapEntry* attributeMapIllustration[] = {
+  // XXX: Illustrations may have some attributes from the "shared", "inline",
+  //      the "block" sets. This is a loose specification; making it better
+  //      means duplicating entries from the sets. This seems good enough...
+  attributeMapCommonShared,
+  attributeMapCommonBlock,
+  attributeMapCommonInline,
+  NULL,
+};
+
+// Table mapping owners of attributes to their names.
+static const struct OwnerMapEntry {
+  Attribute::Owner owner;
+  const char*      name;
+} ownerMap[] = {
+  // XXX: Those are sorted in the owner priority resolution order. If the
+  //      same attribute is defined with two owners, the order in the table
+  //      can be used to know which one has more priority.
+  { Attribute::XML_1_00,       "XML-1.00"       },
+  { Attribute::HTML_3_20,      "HTML-3.20"      },
+  { Attribute::HTML_4_01,      "HTML-4.01"      },
+  { Attribute::OEB_1_00,       "OEB-1.00"       },
+  { Attribute::RTF_1_05,       "RTF-1.05"       },
+  { Attribute::CSS_1_00,       "CSS-1.00"       },
+  { Attribute::CSS_2_00,       "CSS-2.00"       },
+  { Attribute::Layout,         "Layout"         },
+  { Attribute::PrintField,     "PrintField"     },
+  { Attribute::Table,          "Table"          },
+  { Attribute::List,           "List"           },
+  { Attribute::UserProperties, "UserProperties" },
+};
+
+// Returns true when owner `a` is listed before owner `b` in ownerMap; an owner absent from the table keeps index 0.
+static GBool ownerHasMorePriority(Attribute::Owner a, Attribute::Owner b)
+{
+  unsigned a_index, b_index;
+
+  for (unsigned i = a_index = b_index = 0; i < sizeof(ownerMap) / sizeof(ownerMap[0]); i++) {
+    if (ownerMap[i].owner == a)
+      a_index = i;
+    if (ownerMap[i].owner == b)
+      b_index = i;
+  }
+
+  return a_index < b_index;
+}
+
+
+// Maps element types to their names and also serves as lookup table
+// for additional element type attributes.
+
+enum ElementType {
+  elementTypeUndefined,
+  elementTypeInline,
+  elementTypeBlock,
+};
+
+static const struct TypeMapEntry {
+  StructElement::Type       type;
+  const char*               name;
+  ElementType               elementType;
+  const AttributeMapEntry** attributes;
+} typeMap[] = {
+  { StructElement::Document,   "Document",   elementTypeInline,    attributeMapShared       },
+  { StructElement::Part,       "Part",       elementTypeInline,    attributeMapShared       },
+  { StructElement::Art,        "Art",        elementTypeInline,    attributeMapColumns      },
+  { StructElement::Sect,       "Sect",       elementTypeInline,    attributeMapColumns      },
+  { StructElement::Div,        "Div",        elementTypeInline,    attributeMapColumns      },
+  { StructElement::BlockQuote, "BlockQuote", elementTypeInline,    attributeMapInline       },
+  { StructElement::Caption,    "Caption",    elementTypeInline,    attributeMapInline       },
+  { StructElement::NonStruct,  "NonStruct",  elementTypeInline,    attributeMapInline       },
+  { StructElement::Index,      "Index",      elementTypeInline,    attributeMapInline       },
+  { StructElement::Private,    "Private",    elementTypeInline,    attributeMapInline       },
+  { StructElement::Span,       "Span",       elementTypeInline,    attributeMapInline       },
+  { StructElement::Quote,      "Quote",      elementTypeInline,    attributeMapInline       },
+  { StructElement::Note,       "Note",       elementTypeInline,    attributeMapInline       },
+  { StructElement::Reference,  "Reference",  elementTypeInline,    attributeMapInline       },
+  { StructElement::BibEntry,   "BibEntry",   elementTypeInline,    attributeMapInline       },
+  { StructElement::Code,       "Code",       elementTypeInline,    attributeMapInline       },
+  { StructElement::Link,       "Link",       elementTypeInline,    attributeMapInline       },
+  { StructElement::Annot,      "Annot",      elementTypeInline,    attributeMapInline       },
+  { StructElement::Ruby,       "Ruby",       elementTypeInline,    attributeMapRubyText     },
+  { StructElement::RB,         "RB",         elementTypeUndefined, attributeMapRubyText     },
+  { StructElement::RT,         "RT",         elementTypeUndefined, attributeMapRubyText     },
+  { StructElement::RP,         "RP",         elementTypeUndefined, attributeMapShared       },
+  { StructElement::Warichu,    "Warichu",    elementTypeInline,    attributeMapRubyText     },
+  { StructElement::WT,         "WT",         elementTypeUndefined, attributeMapShared       },
+  { StructElement::WP,         "WP",         elementTypeUndefined, attributeMapShared       },
+  { StructElement::P,          "P",          elementTypeBlock,     attributeMapBlock        },
+  { StructElement::H,          "H",          elementTypeBlock,     attributeMapBlock        },
+  { StructElement::H1,         "H1",         elementTypeBlock,     attributeMapBlock        },
+  { StructElement::H2,         "H2",         elementTypeBlock,     attributeMapBlock        },
+  { StructElement::H3,         "H3",         elementTypeBlock,     attributeMapBlock        },
+  { StructElement::H4,         "H4",         elementTypeBlock,     attributeMapBlock        },
+  { StructElement::H5,         "H5",         elementTypeBlock,     attributeMapBlock        },
+  { StructElement::H6,         "H6",         elementTypeBlock,     attributeMapBlock        },
+  { StructElement::L,          "L",          elementTypeBlock,     attributeMapList         },
+  { StructElement::LI,         "LI",         elementTypeBlock,     attributeMapBlock        },
+  { StructElement::Lbl,        "Lbl",        elementTypeBlock,     attributeMapBlock        },
+  { StructElement::Table,      "Table",      elementTypeBlock,     attributeMapTable        },
+  { StructElement::TR,         "TR",         elementTypeUndefined, attributeMapShared       },
+  { StructElement::TH,         "TH",         elementTypeUndefined, attributeMapTableCell    },
+  { StructElement::TD,         "TD",         elementTypeUndefined, attributeMapTableCell    },
+  { StructElement::THead,      "THead",      elementTypeUndefined, attributeMapShared       },
+  { StructElement::TFoot,      "TFoot",      elementTypeUndefined, attributeMapShared       },
+  { StructElement::TBody,      "TBody",      elementTypeUndefined, attributeMapShared       },
+  { StructElement::Figure,     "Figure",     elementTypeUndefined, attributeMapIllustration },
+  { StructElement::Formula,    "Formula",    elementTypeUndefined, attributeMapIllustration },
+  { StructElement::Form,       "Form",       elementTypeUndefined, attributeMapIllustration },
+  { StructElement::TOC,        "TOC",        elementTypeUndefined, attributeMapShared       },
+  { StructElement::TOCI,       "TOCI",       elementTypeUndefined, attributeMapShared       },
+};
+
+
+//------------------------------------------------------------------------
+// Helpers for the attribute and structure type tables
+//------------------------------------------------------------------------
+
+// Finds the entry describing an attribute type.
+//
+// "entryList" is a NULL-terminated array of attribute tables; every table
+// ends at an entry whose type is Attribute::Unknown. The tables are scanned
+// in order and the first entry with a matching type is returned, or NULL
+// when none of them contains the type.
+static inline const AttributeMapEntry*
+getAttributeMapEntry(const AttributeMapEntry** entryList, Attribute::Type type)
+{
+  assert(entryList);
+  for (const AttributeMapEntry** table = entryList; *table; ++table) {
+    for (const AttributeMapEntry* candidate = *table;
+         candidate->type != Attribute::Unknown;
+         ++candidate) {
+      assert(candidate->name);
+      if (candidate->type == type)
+        return candidate;
+    }
+  }
+  return NULL;
+}
+
+// Finds the entry describing the attribute with the given name.
+//
+// "entryList" is a NULL-terminated array of attribute tables; every table
+// ends at an entry whose type is Attribute::Unknown. Names are compared
+// with strcmp(), so the match is case-sensitive. Returns NULL when no
+// table contains the name.
+static inline const AttributeMapEntry*
+getAttributeMapEntry(const AttributeMapEntry** entryList, const char* name)
+{
+  assert(entryList);
+  for (const AttributeMapEntry** table = entryList; *table; ++table) {
+    for (const AttributeMapEntry* candidate = *table;
+         candidate->type != Attribute::Unknown;
+         ++candidate) {
+      assert(candidate->name);
+      if (strcmp(name, candidate->name) == 0)
+        return candidate;
+    }
+  }
+  return NULL;
+}
+
+// Returns the ownerMap entry for the given owner, or NULL if the owner
+// is not listed in the table.
+static inline const OwnerMapEntry* getOwnerMapEntry(Attribute::Owner owner)
+{
+  const unsigned count = sizeof(ownerMap) / sizeof(ownerMap[0]);
+  for (unsigned idx = 0; idx != count; ++idx) {
+    const OwnerMapEntry* candidate = &ownerMap[idx];
+    if (candidate->owner == owner)
+      return candidate;
+  }
+  return NULL;
+}
+
+// Returns the ownerMap entry whose name equals "name" (case-sensitive
+// strcmp() comparison), or NULL if no entry has that name.
+static inline const OwnerMapEntry* getOwnerMapEntry(const char* name)
+{
+  const unsigned count = sizeof(ownerMap) / sizeof(ownerMap[0]);
+  for (unsigned idx = 0; idx != count; ++idx) {
+    const OwnerMapEntry* candidate = &ownerMap[idx];
+    if (strcmp(name, candidate->name) == 0)
+      return candidate;
+  }
+  return NULL;
+}
+
+// Maps an owner value to its name; owners missing from ownerMap are
+// reported as "UnknownOwner".
+static const char* ownerToName(Attribute::Owner owner)
+{
+  const OwnerMapEntry* found = getOwnerMapEntry(owner);
+  if (!found)
+    return "UnknownOwner";
+  return found->name;
+}
+
+// Maps an owner name to its Attribute::Owner value; names missing from
+// ownerMap yield Attribute::UnknownOwner.
+Attribute::Owner nameToOwner(const char* name)
+{
+  const OwnerMapEntry* found = getOwnerMapEntry(name);
+  if (!found)
+    return Attribute::UnknownOwner;
+  return found->owner;
+}
+
+// Returns the typeMap entry for the given structure element type, or
+// NULL if the type is not listed in the table.
+static inline const TypeMapEntry* getTypeMapEntry(StructElement::Type type)
+{
+  const unsigned count = sizeof(typeMap) / sizeof(typeMap[0]);
+  for (unsigned idx = 0; idx != count; ++idx) {
+    const TypeMapEntry* candidate = &typeMap[idx];
+    if (candidate->type == type)
+      return candidate;
+  }
+  return NULL;
+}
+
+// Returns the typeMap entry whose name equals "name" (case-sensitive
+// strcmp() comparison), or NULL if no entry has that name.
+static inline const TypeMapEntry* getTypeMapEntry(const char* name)
+{
+  const unsigned count = sizeof(typeMap) / sizeof(typeMap[0]);
+  for (unsigned idx = 0; idx != count; ++idx) {
+    const TypeMapEntry* candidate = &typeMap[idx];
+    if (strcmp(name, candidate->name) == 0)
+      return candidate;
+  }
+  return NULL;
+}
+
+// Maps a structure element type to a printable name. The internal MCID
+// type is reported as "MarkedContent"; types missing from typeMap are
+// reported as "Unknown".
+static const char* typeToName(StructElement::Type type)
+{
+  if (type == StructElement::MCID)
+    return "MarkedContent";
+
+  const TypeMapEntry* found = getTypeMapEntry(type);
+  if (!found)
+    return "Unknown";
+  return found->name;
+}
+
+// Maps a structure type name to its StructElement::Type value; names
+// missing from typeMap yield StructElement::Unknown.
+static StructElement::Type nameToType(const char* name)
+{
+  const TypeMapEntry* found = getTypeMapEntry(name);
+  if (!found)
+    return StructElement::Unknown;
+  return found->type;
+}
+
+
+//------------------------------------------------------------------------
+// Attribute
+//------------------------------------------------------------------------
+
+// Creates a user property attribute with the given name.
+//
+// When "copyValue" is true the value is duplicated with Object::copy();
+// otherwise it is taken over with Object::shallowCopy(). In both cases
+// the attribute frees its own "value" on destruction.
+Attribute::Attribute(const char* nameA, Object* valueA, GBool copyValue):
+  type(UserProperty),
+  owner(UserProperties),
+  revision(0),
+  name(nameA),
+  value(),
+  hidden(gFalse),
+  formatted(NULL)
+{
+  assert(valueA);
+
+  if (!copyValue) {
+    valueA->shallowCopy(&value);
+  } else {
+    valueA->copy(&value);
+  }
+}
+
+// Creates a standard attribute of the given type.
+//
+// When "copyValue" is true the value is duplicated with Object::copy();
+// otherwise it is taken over with Object::shallowCopy(). If the value
+// fails typeCheck() the attribute is marked as Unknown, which makes
+// isOk() return false.
+//
+// The parameter is named "typeA" (not "type") so that the assignment
+// below reaches the member instead of a shadowing parameter.
+Attribute::Attribute(Type typeA, Object* valueA, GBool copyValue):
+  type(typeA),
+  owner(UserProperties), // TODO: Determine corresponding owner from Type
+  revision(0),
+  name(),
+  value(),
+  hidden(gFalse),
+  formatted(NULL)
+{
+  assert(valueA);
+
+  if (copyValue)
+    valueA->copy(&value);
+  else
+    valueA->shallowCopy(&value);
+
+  if (!typeCheck()) {
+    type = Unknown;
+  }
+}
+
+// Releases the stored value and the optional formatted representation.
+Attribute::~Attribute()
+{
+  value.free();
+  delete formatted;
+}
+
+// Returns the name of the attribute: the stored name for user
+// properties, the name found in attributeMapAll for standard attributes,
+// or "Unknown" when the type has no table entry.
+const char* Attribute::getTypeName() const
+{
+  if (type == UserProperty)
+    return name.getCString();
+
+  const AttributeMapEntry* found = getAttributeMapEntry(attributeMapAll, type);
+  return found ? found->name : "Unknown";
+}
+
+// Returns the printable name of the attribute owner, as given by
+// ownerToName() ("UnknownOwner" when the owner is not in ownerMap).
+const char* Attribute::getOwnerName() const
+{
+  return ownerToName(owner);
+}
+
+// Returns the default value recorded in attributeMapAll for the given
+// attribute type, or NULL when the type has no table entry. The returned
+// pointer refers to the table's own object (constness is cast away).
+Object* Attribute::getDefaultValue(Attribute::Type type)
+{
+  const AttributeMapEntry* found = getAttributeMapEntry(attributeMapAll, type);
+  if (!found)
+    return NULL;
+  return const_cast<Object*>(found->defval);
+}
+
+// Sets or clears the formatted representation of the value.
+//
+// A non-NULL "formattedA" is copied into the attribute (reusing the
+// existing GooString when there is one). Passing NULL discards the
+// current formatted value.
+void Attribute::setFormattedValue(const char* formattedA)
+{
+  if (formattedA) {
+    if (formatted)
+      formatted->Set(formattedA);
+    else
+      formatted = new GooString(formattedA);
+  } else {
+    delete formatted;
+    // Reset the pointer: the destructor deletes "formatted" again and
+    // getFormattedValue() dereferences it when it is non-NULL.
+    formatted = NULL;
+  }
+}
+
+// Validates the attribute, optionally against a containing element.
+//
+// Without an element, or when the element type has no attribute tables,
+// the check always succeeds. With an element, tighter type-checking is
+// done: the attribute must be listed in the tables of the element type
+// and, if the table entry provides a check function, the value must
+// pass it.
+GBool Attribute::typeCheck(StructElement* element)
+{
+  if (!element)
+    return gTrue;
+
+  const TypeMapEntry* elementTypeEntry = getTypeMapEntry(element->getType());
+  if (!elementTypeEntry || !elementTypeEntry->attributes)
+    return gTrue;
+
+  const AttributeMapEntry* found = getAttributeMapEntry(elementTypeEntry->attributes, type);
+
+  // No entry: the attribute is not valid for the containing element.
+  if (!found)
+    return gFalse;
+
+  if (found->check && !((*found->check)(&value)))
+    return gFalse;
+
+  return gTrue;
+}
+
+// Resolves an attribute name to its type.
+//
+// The name is searched in the attribute tables of the element type when
+// an element is given and its type has tables; otherwise attributeMapAll
+// is searched. Returns Unknown when the name is not found.
+Attribute::Type Attribute::typeForName(const char* name, StructElement* element)
+{
+  const AttributeMapEntry** tables = attributeMapAll;
+
+  if (element) {
+    const TypeMapEntry* elementTypeEntry = getTypeMapEntry(element->getType());
+    if (elementTypeEntry && elementTypeEntry->attributes)
+      tables = elementTypeEntry->attributes;
+  }
+
+  const AttributeMapEntry* found = getAttributeMapEntry(tables, name);
+  if (!found)
+    return Unknown;
+  return found->type;
+}
+
+// Builds a user property attribute from a user property dictionary.
+//
+// Entries read from "property":
+//   N  name of the property (string or name), required
+//   V  value of the property, must not be null
+//   F  formatted representation of the value (string), optional
+//   H  hidden flag (boolean), optional
+//
+// Returns a newly allocated Attribute, or NULL when N or V is invalid.
+// Wrong types for F and H are only reported as warnings.
+Attribute* Attribute::parseUserProperty(Dict* property)
+{
+  Object obj, value;
+  const char* name = NULL;
+
+  // "name" points into "obj", so "obj" must stay alive until the
+  // Attribute has been constructed (the constructor copies the name).
+  if (property->lookup("N", &obj)->isString())
+    name = obj.getString()->getCString();
+  else if (obj.isName())
+    name = obj.getName();
+  else {
+    error(errSyntaxError, -1, "N object is wrong type ({0:s})", obj.getTypeName());
+    obj.free();
+    return NULL;
+  }
+
+  if (property->lookup("V", &value)->isNull()) {
+    error(errSyntaxError, -1, "V object is wrong type ({0:s})", value.getTypeName());
+    value.free();
+    obj.free();
+    return NULL;
+  }
+
+  // copyValue is gFalse: the attribute takes a shallow copy of "value"
+  // and frees it in its destructor, so "value" is not freed here.
+  Attribute *attribute = new Attribute(name, &value, gFalse);
+  obj.free();
+
+  if (property->lookup("F", &obj)->isString()) {
+    attribute->setFormattedValue(obj.getString()->getCString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "F object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  if (property->lookup("H", &obj)->isBool()) {
+    attribute->setHidden(obj.getBool());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "H object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  return attribute;
+}
+
+
+//------------------------------------------------------------------------
+// StructElement
+//------------------------------------------------------------------------
+
+// Starts with no parent reference, no optional strings, revision zero
+// and empty child/attribute lists.
+StructElement::StructData::StructData():
+  parentRef(),
+  altText(NULL),
+  actualText(NULL),
+  id(NULL),
+  title(NULL),
+  expandedAbbr(NULL),
+  language(NULL),
+  revision(0),
+  elements(),
+  attributes()
+{
+}
+
+// Releases everything owned by the structure data: the optional strings,
+// the parent reference, and all child elements and attributes.
+//
+// "language" is released with gfree(), so it has to point to storage
+// that is valid to pass to gfree().
+StructElement::StructData::~StructData()
+{
+  delete altText;
+  delete actualText;
+  delete id;
+  delete title;
+  // expandedAbbr is owned like the other GooString members (it is taken
+  // over from the /E entry while parsing), so it must be deleted too.
+  delete expandedAbbr;
+  gfree(language);
+  parentRef.free();
+  for (std::vector<StructElement*>::iterator i = elements.begin(); i != elements.end(); ++i) delete *i;
+  for (std::vector<Attribute*>::iterator i = attributes.begin(); i != attributes.end(); ++i) delete *i;
+}
+
+
+// Creates a structure element from its dictionary. The dictionary is
+// parsed right away; the type stays Unknown (and isOk() false) when
+// parsing stops on a syntax error.
+StructElement::StructElement(Dict* element, StructTreeRoot* treeRootA, StructElement* parentA):
+  type(Unknown),
+  treeRoot(treeRootA),
+  parent(parentA),
+  pageRef(),
+  s(new StructData())
+{
+  assert(treeRoot != NULL);
+  assert(element != NULL);
+
+  parse(element);
+}
+
+// Creates a marked-content reference element for the given MCID.
+// A content element always needs a parent and a valid identifier.
+StructElement::StructElement(int mcid, StructTreeRoot* treeRootA, StructElement* parentA):
+  type(MCID),
+  treeRoot(treeRootA),
+  parent(parentA),
+  pageRef(),
+  c(new ContentData(mcid))
+{
+  assert(treeRoot != NULL);
+  assert(parent != NULL);
+  assert(InvalidMCID != c->mcid);
+}
+
+// Releases the element data and the page reference.
+//
+// Which union member is live is decided from "type" alone: the MCID
+// constructor is the one that allocates ContentData. isContent() is not
+// used here because it is also false for an MCID element holding
+// InvalidMCID, which would make a ContentData be deleted as StructData.
+StructElement::~StructElement()
+{
+  if (type == MCID)
+    delete c;
+  else
+    delete s;
+  pageRef.free();
+}
+
+// True when typeMap classifies the element type as a block element.
+GBool StructElement::isBlock() const
+{
+  const TypeMapEntry* found = getTypeMapEntry(type);
+  if (!found)
+    return gFalse;
+  return found->elementType == elementTypeBlock;
+}
+
+// True when typeMap classifies the element type as an inline element.
+GBool StructElement::isInline() const
+{
+  const TypeMapEntry* found = getTypeMapEntry(type);
+  if (!found)
+    return gFalse;
+  return found->elementType == elementTypeInline;
+}
+
+// True when this element, or any of its ancestors, stores a page
+// reference.
+GBool StructElement::hasPageRef() const
+{
+  if (pageRef.isRef())
+    return gTrue;
+  if (!parent)
+    return gFalse;
+  return parent->hasPageRef();
+}
+
+// Returns the page reference of the nearest element (this one first,
+// then its ancestors) that stores one. When none does, the returned
+// reference has both fields set to -1.
+Ref StructElement::getPageRef() const
+{
+  for (const StructElement* node = this; node; node = node->parent) {
+    if (node->pageRef.isRef())
+      return node->pageRef.getRef();
+  }
+
+  const Ref invalidRef = { -1, -1 };
+  return invalidRef;
+}
+
+// Returns the printable name of the element type, as given by
+// typeToName() ("MarkedContent" for MCID elements, "Unknown" for types
+// missing from typeMap).
+const char* StructElement::getTypeName() const
+{
+  return typeToName(type);
+}
+
+// Looks up a standard attribute of this element.
+//
+// - Content (MCID) elements forward the query to their parent.
+// - Unknown and UserProperty types are never matched (NULL is returned).
+// - With attributeOwner == UnknownOwner the attribute whose owner has
+//   the highest priority (per ownerHasMorePriority) wins; otherwise the
+//   first attribute with exactly that owner is used.
+// - When nothing is found and "inherit" is set, the parent is queried,
+//   but only for attribute types flagged as inheritable in
+//   attributeMapAll.
+const Attribute* StructElement::findAttribute(Attribute::Type attributeType, GBool inherit,
+                                              Attribute::Owner attributeOwner) const
+{
+  if (isContent())
+    return parent->findAttribute(attributeType, inherit, attributeOwner);
+
+  if (attributeType == Attribute::Unknown || attributeType == Attribute::UserProperty)
+    return NULL;
+
+  const Attribute *match = NULL;
+  const GBool anyOwner = (attributeOwner == Attribute::UnknownOwner);
+
+  for (unsigned idx = 0; idx < getNumAttributes(); idx++) {
+    const Attribute *candidate = getAttribute(idx);
+    if (candidate->getType() != attributeType)
+      continue;
+
+    if (anyOwner) {
+      // No owner requested: keep the candidate with the best owner.
+      if (!match || ownerHasMorePriority(candidate->getOwner(), match->getOwner()))
+        match = candidate;
+    } else if (candidate->getOwner() == attributeOwner) {
+      // Specific owner requested: the first hit is the answer.
+      match = candidate;
+      break;
+    }
+  }
+
+  if (match)
+    return match;
+
+  if (!inherit || !parent)
+    return NULL;
+
+  const AttributeMapEntry *mapEntry = getAttributeMapEntry(attributeMapAll, attributeType);
+  assert(mapEntry);
+  // TODO: Take into account special inheritance cases, for example:
+  //       inline elements which have been changed to be block using
+  //       "/Placement/Block" have slightly different rules.
+  if (!mapEntry->inherit)
+    return NULL;
+
+  return parent->findAttribute(attributeType, inherit, attributeOwner);
+}
+
+// Collects the text referenced by this element.
+//
+// - For content (MCID) elements the Unicode characters reported by
+//   getMCOps() are converted with the text encoding from globalParams
+//   (falling back to UTF-8) and appended to "string". NULL is returned
+//   when the marked content yields no operations.
+// - For other elements the children are visited depth-first when
+//   "recursive" is set; otherwise NULL is returned.
+//
+// When "string" is NULL a new GooString is allocated and returned; the
+// caller owns it.
+GooString* StructElement::getText(GooString *string, GBool recursive) const
+{
+  if (isContent()) {
+    UnicodeMap *map = globalParams->getTextEncoding();
+    if (!map) {
+      GooString mapName("UTF-8");
+      map = UnicodeMap::parse(&mapName);
+    }
+    assert(map);
+
+    const MCOpArray& ops(getMCOps());
+    if (!ops.size()) {
+      // Drop the reference taken above, as done on the normal exit path;
+      // returning without it would leak the map.
+      map->decRefCnt();
+      return NULL;
+    }
+
+    if (!string)
+      string = new GooString();
+
+    char buf[9];
+    int n;
+
+    for (MCOpArray::const_iterator i = ops.begin(); i != ops.end(); ++i) {
+      if (i->type == mcOpUnichar) {
+        n = map->mapUnicode(i->unichar, buf, sizeof(buf));
+        string->append(buf, n);
+      }
+    }
+    map->decRefCnt();
+    return string;
+  }
+
+  if (!recursive)
+    return NULL;
+
+  // Do a depth-first traversal, to get elements in logical order
+  if (!string)
+    string = new GooString();
+
+  for (unsigned i = 0; i < getNumElements(); i++)
+    getElement(i)->getText(string, recursive);
+
+  return string;
+}
+
+
+// Returns the marked-content operations recorded for this element's
+// MCID, or an empty array for non-content elements.
+//
+// The operations are gathered by rendering pages into an MCOutputDev.
+// Only the page found through the page reference is rendered when
+// findPage() resolves it to a non-zero page number; otherwise every page
+// of the document is rendered.
+const MCOpArray StructElement::getMCOps() const
+{
+  if (!isContent())
+    return MCOpArray(); // Empty array
+
+  MCOutputDev mcdev(getMCID());
+
+  int firstPage = 0;
+  if (hasPageRef()) {
+    const Ref ref = getPageRef();
+    firstPage = treeRoot->getDoc()->findPage(ref.num, ref.gen);
+  }
+
+  int lastPage = firstPage;
+  if (firstPage == 0) {
+    // Page unknown: scan the whole document.
+    firstPage = 1;
+    lastPage = treeRoot->getDoc()->getNumPages();
+  }
+
+  treeRoot->getDoc()->displayPages(&mcdev, firstPage, lastPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
+  return mcdev.getMCOps();
+}
+
+
+// Fills in this element from its structure element dictionary.
+//
+// Entries handled: Type, P, S (resolved through the RoleMap), ID, Pg, R,
+// T, Lang, Alt, E, ActualText, A, C (resolved through the ClassMap) and
+// finally K (children, via parseChildren()).
+//
+// Errors in Type, P, S or the RoleMap abort parsing with "type" still
+// set to Unknown, so isOk() reports the failure. Problems in the
+// remaining, optional entries are only reported as warnings.
+void StructElement::parse(Dict* element)
+{
+  Object obj;
+
+  // Type is optional, but if present must be StructElem
+  if (!element->lookup("Type", &obj)->isNull() && !obj.isName("StructElem")) {
+    error(errSyntaxError, -1, "Type of StructElem object is wrong");
+    obj.free();
+    return;
+  }
+  obj.free();
+
+  // Parent object reference (required).
+  if (!element->lookupNF("P", &s->parentRef)->isRef()) {
+    // Report the object that was actually looked up; "obj" has already
+    // been freed at this point.
+    error(errSyntaxError, -1, "P object is wrong type ({0:s})", s->parentRef.getTypeName());
+    return;
+  }
+
+  // Check whether the S-type is valid for the top level
+  // element and create a node of the appropriate type.
+  if (!element->lookup("S", &obj)->isName()) {
+    error(errSyntaxError, -1, "S object is wrong type ({0:s})", obj.getTypeName());
+    obj.free();
+    return;
+  }
+
+  // Type name may not be standard, resolve through RoleMap first.
+  // TODO: roleMap entries may need to be resolved recursively until
+  //       a known standard name is found, cycles may be present.
+  if (treeRoot->getRoleMap()) {
+    Object resolved;
+    if (treeRoot->getRoleMap()->lookup(obj.getName(), &resolved)->isName()) {
+      type = nameToType(resolved.getName());
+    } else if (resolved.isNull()) {
+      type = nameToType(obj.getName());
+    } else {
+      error(errSyntaxError, -1, "Value in RoleMap is wrong type ({0:s})", resolved.getTypeName());
+      resolved.free();
+      obj.free();
+      return;
+    }
+    resolved.free();
+  } else {
+    type = nameToType(obj.getName());
+  }
+  if (type == Unknown) {
+    error(errSyntaxError, -1, "StructElem object is wrong type ({0:s})", obj.getName());
+    obj.free();
+    return;
+  }
+  obj.free();
+
+  // Object ID (optional), to be looked at the IDTree in the tree root.
+  if (element->lookup("ID", &obj)->isString()) {
+    s->id = new GooString(obj.getString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "ID object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Page reference (optional) in which at least one of the child items
+  // is to be rendered in. Note: each element stores only the /Pg value
+  // contained by it, and StructElement::getPageRef() may look in parent
+  // elements to find the page where an element belongs.
+  if (!element->lookupNF("Pg", &pageRef)->isRef() && !pageRef.isNull()) {
+    error(errSyntaxWarning, -1, "Pg object is wrong type ({0:s})", pageRef.getTypeName());
+  }
+
+  // Revision number (optional).
+  if (element->lookup("R", &obj)->isInt()) {
+    s->revision = obj.getInt();
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "R object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Element title (optional).
+  if (element->lookup("T", &obj)->isString()) {
+    s->title = new GooString(obj.getString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "T object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Language (optional).
+  if (element->lookup("Lang", &obj)->isString()) {
+    // Store a private copy of the characters: "language" is released
+    // with gfree(), so it must not point into the buffer owned by the
+    // GooString. The GooString itself is released by obj.free() below.
+    s->language = copyString(obj.getString()->getCString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "Lang object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Alternative text (optional).
+  if (element->lookup("Alt", &obj)->isString()) {
+    s->altText = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "Alt object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Expanded form of an abbreviation (optional).
+  if (element->lookup("E", &obj)->isString()) {
+    s->expandedAbbr = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "E object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Actual text (optional).
+  if (element->lookup("ActualText", &obj)->isString()) {
+    s->actualText = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "ActualText object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Attributes directly attached to the element (optional).
+  if (element->lookup("A", &obj)->isDict()) {
+    parseAttributes(obj.getDict());
+  } else if (obj.isArray()) {
+    Object iobj;
+    // Index of the first attribute created from the most recent
+    // dictionary; an integer item following it is its revision number.
+    unsigned attrIndex = getNumAttributes();
+    for (int i = 0; i < obj.arrayGetLength(); i++) {
+      if (obj.arrayGet(i, &iobj)->isDict()) {
+        attrIndex = getNumAttributes();
+        // Parse the array item, not the enclosing array.
+        parseAttributes(iobj.getDict());
+      } else if (iobj.isInt()) {
+        const int revision = iobj.getInt();
+        // Set revision numbers for the elements previously created.
+        for (unsigned j = attrIndex; j < getNumAttributes(); j++)
+          getAttribute(j)->setRevision(revision);
+      } else {
+        error(errSyntaxWarning, -1, "A item is wrong type ({0:s})", iobj.getTypeName());
+      }
+      iobj.free();
+    }
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "A is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Attributes referenced indirectly through the ClassMap (optional).
+  if (treeRoot->getClassMap()) {
+    Object classes;
+    if (element->lookup("C", &classes)->isName()) {
+      Object attr;
+      if (treeRoot->getClassMap()->lookup(classes.getName(), &attr)->isDict()) {
+        parseAttributes(attr.getDict(), gTrue);
+      } else if (attr.isArray()) {
+        // Must live across iterations: a revision number refers to the
+        // attributes created by the dictionary item preceding it.
+        unsigned attrIndex = getNumAttributes();
+        for (int i = 0; i < attr.arrayGetLength(); i++) {
+          Object iobj;
+          if (attr.arrayGet(i, &iobj)->isDict()) {
+            attrIndex = getNumAttributes();
+            parseAttributes(iobj.getDict(), gTrue);
+          } else if (iobj.isInt()) {
+            // Set revision numbers for the elements previously created.
+            const int revision = iobj.getInt();
+            for (unsigned j = attrIndex; j < getNumAttributes(); j++)
+              getAttribute(j)->setRevision(revision);
+          } else {
+            error(errSyntaxWarning, -1, "C item is wrong type ({0:s})", iobj.getTypeName());
+          }
+          iobj.free();
+        }
+      } else if (!attr.isNull()) {
+        // The offending object is the ClassMap value, not the class name.
+        error(errSyntaxWarning, -1, "C object is wrong type ({0:s})", attr.getTypeName());
+      }
+      attr.free();
+    }
+    // Free the looked-up object whatever its type turned out to be.
+    classes.free();
+  }
+
+  parseChildren(element);
+}
+
+// Creates a child element from one item of the /K entry and appends it
+// to this element.
+//
+// - An integer is a marked-content identifier.
+// - An MCR dictionary is a marked-content reference, optionally carrying
+//   its own /Pg page reference.
+// - An OBJR dictionary is not handled yet.
+// - Any other dictionary is parsed as a nested structure element.
+//
+// Returns the appended child, or NULL when nothing was created or the
+// created child is not valid.
+StructElement* StructElement::parseChild(Object* childObj)
+{
+  assert(childObj);
+  StructElement* node = NULL;
+
+  if (childObj->isInt()) {
+    node = new StructElement(childObj->getInt(), treeRoot, this);
+  } else if (childObj->isDict("MCR")) {
+    /*
+     * TODO: The optional Stm/StmOwn attributes are not handled, so all the
+     *      page will be always scanned when calling StructElement::getText().
+     */
+    Object mcidObj;
+    Object pageObj;
+
+    if (!childObj->dictLookup("MCID", &mcidObj)->isInt()) {
+      error(errSyntaxError, -1, "MCID object is wrong type ({0:s})", mcidObj.getTypeName());
+      mcidObj.free();
+      return NULL;
+    }
+
+    node = new StructElement(mcidObj.getInt(), treeRoot, this);
+
+    if (!childObj->dictLookupNF("Pg", &pageObj)->isRef()) {
+      pageObj.free();
+    } else {
+      // XXX Unclassy manipulation of the page reference (ugh!)
+      node->pageRef = pageObj;
+    }
+  } else if (childObj->isDict("OBJR")) {
+    // TODO: PDF Object Reference
+  } else if (childObj->isDict()) {
+    node = new StructElement(childObj->getDict(), treeRoot, this);
+  } else {
+    error(errSyntaxWarning, -1, "K has a child of wrong type ({0:s})", childObj->getTypeName());
+  }
+
+  if (!node)
+    return NULL;
+
+  if (!node->isOk()) {
+    delete node;
+    return NULL;
+  }
+
+  appendElement(node);
+  return node;
+}
+
+// Parses the /K entry of a structure element dictionary. The entry may
+// be an array of children, or a single child given directly as a
+// dictionary or an integer. Any other non-null value is reported as a
+// warning.
+void StructElement::parseChildren(Dict* element)
+{
+  Object kids;
+  element->lookup("K", &kids);
+
+  if (kids.isArray()) {
+    for (int idx = 0; idx < kids.arrayGetLength(); idx++) {
+      Object item;
+      kids.arrayGet(idx, &item);
+      parseChild(&item);
+      item.free();
+    }
+  } else if (kids.isDict() || kids.isInt()) {
+    parseChild(&kids);
+  } else if (!kids.isNull()) {
+    error(errSyntaxWarning, -1, "K in StructElement is wrong type ({0:s})", kids.getTypeName());
+  }
+
+  kids.free();
+}
+
+// Parses one attribute dictionary and appends the attributes found in
+// it to this element.
+//
+// The /O entry (owner) selects how the dictionary is interpreted:
+// - "UserProperties": /P is an array of user property dictionaries,
+//   each handled by Attribute::parseUserProperty().
+// - Any other owner known to nameToOwner(): every key except /O is
+//   taken as a standard attribute name.
+//
+// When "keepExisting" is set, a standard attribute whose type is
+// already present on the element is skipped instead of being added
+// again. Invalid items are reported as warnings and skipped.
+void StructElement::parseAttributes(Dict* attributes, GBool keepExisting)
+{
+  Object owner;
+  if (attributes->lookup("O", &owner)->isName("UserProperties")) {
+    // In this case /P is an array of UserProperty dictionaries
+    Object userProperties;
+    if (attributes->lookup("P", &userProperties)->isArray()) {
+      for (int i = 0; i < userProperties.arrayGetLength(); i++) {
+        Object property;
+        if (userProperties.arrayGet(i, &property)->isDict()) {
+          Attribute* attribute = Attribute::parseUserProperty(property.getDict());
+          if (attribute && attribute->isOk()) {
+            appendAttribute(attribute);
+          } else {
+            error(errSyntaxWarning, -1, "Item in P is invalid");
+            delete attribute;
+          }
+        } else {
+          error(errSyntaxWarning, -1, "Item in P is wrong type ({0:s})", property.getTypeName());
+        }
+        property.free();
+      }
+    } else {
+      error(errSyntaxWarning, -1, "P is wrong type ({0:s})", userProperties.getTypeName());
+    }
+    userProperties.free();
+  } else if (owner.isName()) {
+    // In this case /P contains standard attributes.
+    // Check first if the owner is a valid standard one.
+    Attribute::Owner ownerValue = nameToOwner(owner.getName());
+    if (ownerValue != Attribute::UnknownOwner) {
+      // Iterate over the entries of the "attributes" dictionary.
+      // The /O entry (owner) is skipped.
+      for (int i = 0; i < attributes->getLength(); i++) {
+        const char* key = attributes->getKey(i);
+        if (strcmp(key, "O") != 0) {
+          // The lookup is restricted to the attributes valid for this
+          // element type when the type has attribute tables.
+          Attribute::Type type = Attribute::typeForName(key, this);
+
+          // Check if the attribute is already defined.
+          if (keepExisting) {
+            GBool exists = gFalse;
+            for (unsigned j = 0; j < getNumAttributes(); j++) {
+              if (getAttribute(j)->getType() == type) {
+                exists = gTrue;
+                break;
+              }
+            }
+            // Skip to the next key of the dictionary.
+            if (exists)
+              continue;
+          }
+
+          if (type != Attribute::Unknown) {
+            Object value;
+            GBool typeCheckOk = gTrue;
+            // copyValue is gFalse: the attribute takes over "value".
+            Attribute* attribute = new Attribute(type, attributes->getVal(i, &value), gFalse);
+            // typeCheckOk is only updated when isOk() succeeded, so it
+            // is false only for a genuine type-check failure.
+            if (attribute->isOk() && (typeCheckOk = attribute->typeCheck(this))) {
+              appendAttribute(attribute);
+            } else {
+              // It is not needed to free "value", the Attribute instance
+              // owns the contents, so deleting "attribute" is enough.
+              if (!typeCheckOk) {
+                error(errSyntaxWarning, -1, "Attribute {0:s} value is of wrong type ({1:s})",
+                      attribute->getTypeName(), attribute->getValue()->getTypeName());
+              }
+              delete attribute;
+            }
+          } else {
+            error(errSyntaxWarning, -1, "Wrong Attribute '{0:s}' in element {1:s}", key, getTypeName());
+          }
+        }
+      }
+    } else {
+      error(errSyntaxWarning, -1, "O object is invalid value ({0:s})", owner.getName());
+    }
+  } else if (!owner.isNull()) {
+    error(errSyntaxWarning, -1, "O is wrong type ({0:s})", owner.getTypeName());
+  }
+  owner.free();
+}
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
new file mode 100644
index 0000000..63fb051
--- /dev/null
+++ b/poppler/StructElement.h
@@ -0,0 +1,273 @@
+//========================================================================
+//
+// StructElement.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef STRUCTELEMENT_H
+#define STRUCTELEMENT_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "goo/gtypes.h"
+#include "goo/GooString.h"
+#include "MCOutputDev.h"
+#include "Object.h"
+#include <vector>
+
+class GooString;
+class Dict;
+class StructElement;
+class StructTreeRoot;
+class TextWordList;
+
+
+// An attribute attached to a structure element: either one of the
+// standard attributes enumerated in Type, or a user property with an
+// arbitrary name. The attribute holds its own value object and frees it
+// on destruction.
+class Attribute {
+public:
+  enum Type {
+    Unknown = 0,        // Uninitialized, parsing error, etc.
+    UserProperty,       // User defined attribute (i.e. non-standard)
+
+    // Common standard attributes
+    Placement, WritingMode, BackgroundColor, BorderColor, BorderStyle,
+    BorderThickness, Color, Padding,
+
+    // Block element standard attributes
+    SpaceBefore, SpaceAfter, StartIndent, EndIndent, TextIndent, TextAlign,
+    BBox, Width, Height, BlockAlign, InlineAlign, TBorderStyle, TPadding,
+
+    // Inline element standard attributes
+    BaselineShift, LineHeight, TextDecorationColor, TextDecorationThickness,
+    TextDecorationType, RubyAlign, RubyPosition, GlyphOrientationVertical,
+
+    // Column-only standard attributes
+    ColumnCount, ColumnGap, ColumnWidths,
+
+    // List-only standard attributes
+    ListNumbering,
+
+    // PrintField-only standard attributes
+    Role, checked, Desc,
+
+    // Table-only standard attributes
+    RowSpan, ColSpan, Headers, Scope, Summary,
+  };
+
+  enum Owner {
+    UnknownOwner = 0,
+    // User-defined attributes
+    UserProperties,
+    // Standard attributes
+    Layout, List, PrintField, Table,
+    // Translation to other formats
+    XML_1_00, HTML_3_20, HTML_4_01, OEB_1_00, RTF_1_05, CSS_1_00, CSS_2_00,
+  };
+
+  // Creates a standard attribute. The name is predefined, and the
+  // value is type-checked to conform to the PDF specification.
+  // With copyValue set the value is copied; otherwise the attribute
+  // takes a shallow copy and becomes responsible for freeing it.
+  Attribute(Type type, Object* value, GBool copyValue = gTrue);
+
+  // Creates an UserProperty attribute, with an arbitrary name and value.
+  // copyValue has the same meaning as in the constructor above.
+  Attribute(const char* name, Object* value, GBool copyValue = gTrue);
+
+  // False when the attribute type is Unknown.
+  GBool isOk() const { return type != Unknown; }
+
+  // Name, type and value can be set only on construction.
+  Type getType() const { return type; }
+  Owner getOwner() const { return owner; }
+  const char* getTypeName() const;
+  const char* getOwnerName() const;
+  // Returns a pointer to the value stored inside the attribute.
+  Object* getValue() const { return &value; }
+  // Default value for a standard attribute type, or NULL if the type
+  // has no entry in the attribute tables.
+  static Object* getDefaultValue(Type type);
+
+  // Stored name for user properties, standard name otherwise.
+  const char* getName() const { return type == UserProperty ? name.getCString() : getTypeName(); }
+
+  // The revision is optional, and defaults to zero.
+  Guint getRevision() const { return revision; }
+  void setRevision(Guint revisionA) { revision = revisionA; }
+
+  // Hidden elements should not be displayed by the user agent
+  GBool isHidden() const { return hidden; }
+  void setHidden(GBool hiddenA) { hidden = hiddenA; }
+
+  // The formatted value may be in the PDF, or be left undefined (NULL).
+  // In the latter case the user agent should provide a default representation.
+  const char* getFormattedValue() const { return formatted ? formatted->getCString() : NULL; }
+  void setFormattedValue(const char *formattedA);
+
+  ~Attribute();
+
+private:
+  Type type;
+  Owner owner;
+  Guint revision;
+  // Mutable so that the const accessors can hand out non-const pointers.
+  mutable GooString name;
+  mutable Object value;
+  GBool hidden;
+  GooString *formatted;   // Owned; NULL when no formatted value is set
+
+  // Validates the value, more strictly when an element is given.
+  GBool typeCheck(StructElement* element = NULL);
+  // Resolves an attribute name to a type, optionally restricted to the
+  // attributes valid for the given element.
+  static Type typeForName(const char* name, StructElement* element = NULL);
+  // Builds a user property attribute from its dictionary; NULL on error.
+  static Attribute* parseUserProperty(Dict* property);
+
+  friend class StructElement;
+};
+
+
+// A node of the document structure tree.
+//
+// A node is either a structure element (data in "s": identifier, title,
+// language, attributes and child elements) or a content element (data in
+// "c": a marked-content identifier). Elements can only be created by
+// StructTreeRoot or by another StructElement while parsing.
+class StructElement {
+public:
+  enum Type {
+    Unknown = 0,
+    MCID,                                   // MCID reference, used internally
+
+    Document, Part, Art, Sect, Div,         // Structural elements
+
+    Span, Quote, Note, Reference, BibEntry, // Inline elements
+    Code, Link, Annot,
+    BlockQuote, Caption, NonStruct,
+    TOC, TOCI, Index, Private,
+
+    P, H, H1, H2, H3, H4, H5, H6,           // Paragraph-like
+
+    L, LI, Lbl,                             // List elements
+
+    Table, TR, TH, TD, THead, TFoot, TBody, // Table elements
+
+    Ruby, RB, RT, RP,                       // Ruby text elements
+    Warichu, WT, WP,
+
+    Figure, Formula, Form,                  // Illustration-like elements
+  };
+
+  static const int InvalidMCID = -1;
+
+  const char* getTypeName() const;
+  Type getType() const { return type; }
+  // False when the element could not be parsed (type is Unknown).
+  GBool isOk() const { return type != Unknown; }
+  GBool isBlock() const;
+  GBool isInline() const;
+
+  // True for marked-content reference elements with a valid identifier.
+  // TODO Handle object references (OBJR)
+  inline GBool isContent() const { return (type == MCID) && (c->mcid != InvalidMCID); }
+
+  int getMCID() const { return isContent() ? c->mcid : InvalidMCID; }
+  // Content elements report the parent reference of their parent.
+  Ref getParentRef() { return isContent() ? parent->getParentRef() : s->parentRef.getRef(); }
+  GBool hasPageRef() const;
+  Ref getPageRef() const;
+  StructTreeRoot* getStructTreeRoot() const { return treeRoot; }
+
+  // Optional element identifier.
+  const GooString* getID() const { return isContent() ? NULL : s->id; }
+
+  // Optional ISO language name, e.g. en_US
+  const char* getLang(GBool recursive = gTrue) const
+  { return isContent() ? parent->getLang(recursive)
+                       : (s->language ? s->language : (recursive && parent ? parent->getLang() : NULL)); }
+
+  // Optional revision number, defaults to zero.
+  // Content elements have no revision: reading yields zero and setting
+  // is ignored. (Only structure elements own "s"; writing through it on
+  // a content element would corrupt the ContentData.)
+  Guint getRevision() const { return isContent() ? 0 : s->revision; }
+  void setRevision(Guint revision) { if (!isContent()) s->revision = revision; }
+
+  // Optional element title, in human-readable form.
+  const GooString* getTitle() const { return isContent() ? NULL : s->title; }
+
+  // Optional element expanded abbreviation text.
+  const GooString* getExpandedAbbr() const { return isContent() ? NULL : s->expandedAbbr; }
+
+  // Child elements. Content elements have none.
+  unsigned getNumElements() const { return isContent() ? 0 : s->elements.size(); }
+  const StructElement* getElement(int i) const { return isContent() ? NULL : s->elements.at(i); }
+  StructElement* getElement(int i) { return isContent() ? NULL : s->elements.at(i); }
+
+  // Appends a child; ignored for content elements and for children that
+  // are NULL or not valid.
+  void appendElement(StructElement* element)
+  { if (!isContent() && element && element->isOk()) s->elements.push_back(element); }
+
+  // Attributes. Content elements have none.
+  unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); }
+  const Attribute* getAttribute(int i) const { return isContent() ? NULL : s->attributes.at(i); }
+  Attribute* getAttribute(int i) { return isContent() ? NULL : s->attributes.at(i); }
+
+  // Appends an attribute; ignored for content elements and NULL.
+  void appendAttribute(Attribute* attribute)
+  { if (!isContent() && attribute) s->attributes.push_back(attribute); }
+
+  // Finds a standard attribute by type, optionally restricted to one
+  // owner and optionally looking into parent elements for inheritable
+  // attributes. Returns NULL when no attribute matches.
+  const Attribute* findAttribute(Attribute::Type attributeType, GBool inherit = gFalse,
+                                 Attribute::Owner owner = Attribute::UnknownOwner) const;
+
+  GooString* getAltText() const { return isContent() ? NULL : s->altText; }
+  GooString* getActualText() const { return isContent() ? NULL : s->actualText; }
+
+  // Content text referenced by the element:
+  //
+  // - For MCID reference elements, this is just the text of the
+  //   corresponding marked content object in the page stream, regardless
+  //   of the setting of the "recursive" flag.
+  // - For other elements, if the "recursive" flag is set, the text
+  //   enclosed by *all* the child MCID reference elements of the subtree
+  //   is returned. The text is assembled by traversing the leaf MCID
+  //   reference elements in logical order.
+  // - In any other case, the function returns NULL.
+  //
+  // The text will be appended to the passed GooString. If NULL is passed,
+  // a new string is returned, and the ownership passed to the caller.
+  //
+  GooString* getText(GooString *string = NULL, GBool recursive = gTrue) const;
+
+  // Marked-content operations for this element's MCID; empty for
+  // non-content elements.
+  const MCOpArray getMCOps() const;
+
+  ~StructElement();
+
+private:
+  typedef std::vector<Attribute*>     AttrPtrArray;
+  typedef std::vector<StructElement*> ElemPtrArray;
+
+  // Data in structure elements. Owns the strings, child elements and
+  // attributes it points to.
+  struct StructData {
+    Object       parentRef;
+    GooString   *altText;
+    GooString   *actualText;
+    GooString   *id;
+    GooString   *title;
+    GooString   *expandedAbbr;
+    char        *language;
+    Guint        revision;
+    ElemPtrArray elements;
+    AttrPtrArray attributes;
+
+    StructData();
+    ~StructData();
+  };
+
+  // Data in content elements (MCID, MCR)
+  struct ContentData {
+    int mcid;
+
+    ContentData(int mcidA = InvalidMCID): mcid(mcidA) {}
+  };
+
+  // Common data
+  Type type;
+  StructTreeRoot* treeRoot;
+  StructElement* parent;
+  mutable Object pageRef;
+
+  // Exactly one member is live: "c" for elements created with the MCID
+  // constructor, "s" for elements created from a dictionary.
+  union {
+    StructData  *s;
+    ContentData *c;
+  };
+
+  StructElement(Dict* elementDict, StructTreeRoot* treeRootA, StructElement* parentA = 0);
+  StructElement(int mcid, StructTreeRoot* treeRootA, StructElement* parentA);
+
+  void parse(Dict* elementDict);
+  StructElement* parseChild(Object* childObj);
+  void parseChildren(Dict* element);
+  void parseAttributes(Dict* element, GBool keepExisting = gFalse);
+
+  friend class StructTreeRoot;
+};
+
+#endif
+
diff --git a/poppler/StructTreeRoot.cc b/poppler/StructTreeRoot.cc
new file mode 100644
index 0000000..727bfe9
--- /dev/null
+++ b/poppler/StructTreeRoot.cc
@@ -0,0 +1,120 @@
+//========================================================================
+//
+// StructTreeRoot.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "goo/GooString.h"
+#include "StructTreeRoot.h"
+#include "StructElement.h"
+#include "Object.h"
+#include "Dict.h"
+
+#include <assert.h>
+
+
+// Builds the structure tree of docA out of its StructTreeRoot dictionary.
+// Both pointers are asserted to be non-NULL; "marked" is forwarded to parse().
+StructTreeRoot::StructTreeRoot(PDFDoc *docA, Dict* structTreeRootDict, GBool marked)
+  // roleMap, classMap and elements are default-constructed; parse() fills them.
+  : doc(docA)
+{
+  assert(doc);
+  assert(structTreeRootDict);
+  parse(structTreeRootDict, marked);
+}
+
+StructTreeRoot::~StructTreeRoot()
+{
+  // Destroy every stored top-level element, then release both map objects.
+  for (ElemPtrArray::size_type idx = 0; idx < elements.size(); ++idx) delete elements[idx];
+  classMap.free();
+  roleMap.free();
+}
+
+// Loads the RoleMap, ClassMap and K entries of the StructTreeRoot dictionary;
+// each dictionary under K that parses correctly becomes a top-level element.
+void StructTreeRoot::parse(Dict* root, GBool marked)
+{
+  // Fetched first: the parsing functions resolve custom names through these.
+  root->lookup("RoleMap", &roleMap);
+  root->lookup("ClassMap", &classMap);
+  Object kids;
+  root->lookup("K", &kids);
+  if (kids.isArray()) {
+    if (marked && kids.arrayGetLength() > 1) {
+      error(errSyntaxWarning, -1, "K in StructTreeRoot has more than one children in a tagged PDF");
+    }
+    for (int idx = 0; idx < kids.arrayGetLength(); ++idx) {
+      Object kid;
+      kids.arrayGet(idx, &kid);
+      if (!kid.isDict()) {
+        error(errSyntaxWarning, -1, "K has a child of wrong type ({0:s})", kid.getTypeName());
+      } else {
+        StructElement* elem = new StructElement(kid.getDict(), this);
+        if (!elem->isOk()) {
+          error(errSyntaxWarning, -1, "StructTreeRoot element could not be parsed");
+          delete elem;
+        } else {
+          if (marked && elem->getType() != StructElement::Document &&
+                        elem->getType() != StructElement::Part &&
+                        elem->getType() != StructElement::Art &&
+                        elem->getType() != StructElement::Div) {
+            error(errSyntaxWarning, -1, "StructTreeRoot element of tagged PDF is wrong type ({0:s})", elem->getTypeName());
+          }
+          appendElement(elem);
+        }
+      }
+      kid.free();
+    }
+  } else if (kids.isDict()) {
+    if (marked) {
+      error(errSyntaxWarning, -1, "K has a child of wrong type for a tagged PDF ({0:s})", kids.getTypeName());
+    }
+    StructElement* elem = new StructElement(kids.getDict(), this);
+    if (!elem->isOk()) {
+      error(errSyntaxWarning, -1, "StructTreeRoot element could not be parsed");
+      delete elem;
+    } else {
+      appendElement(elem);
+    }
+  } else if (!kids.isNull()) {
+    error(errSyntaxWarning, -1, "K in StructTreeRoot is wrong type ({0:s})", kids.getTypeName());
+  }
+
+  kids.free();
+}
+
+// Pre-order search of the subtree at element for the ID equal to elementId.
+static const StructElement* findElementAux(GooString* elementId, const StructElement* element)
+{
+  // NOTE(review): getID() presumably yields NULL for elements without an ID;
+  // guard against it before comparing.
+  if (element->getID() && element->getID()->cmp(elementId) == 0)
+    return element;
+  for (unsigned i = 0; i < element->getNumElements(); i++) {
+    const StructElement* child = findElementAux(elementId, element->getElement(i));
+    if (child) return child;
+  }
+  return NULL;
+}
+
+// Returns the first element, searching each top-level subtree in order,
+// whose ID equals elementId, or NULL when none matches.
+const StructElement* StructTreeRoot::findElement(GooString* elementId) const
+{
+  assert(elementId);
+  const StructElement* match = NULL;
+  // Stop scanning subtrees as soon as one of them yields a match.
+  for (unsigned idx = 0; !match && idx < getNumElements(); ++idx)
+    match = findElementAux(elementId, getElement(idx));
+  return match;
+}
diff --git a/poppler/StructTreeRoot.h b/poppler/StructTreeRoot.h
new file mode 100644
index 0000000..2952d93
--- /dev/null
+++ b/poppler/StructTreeRoot.h
@@ -0,0 +1,56 @@
+//========================================================================
+//
+// StructTreeRoot.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef STRUCTTREEROOT_H
+#define STRUCTTREEROOT_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "goo/gtypes.h"
+#include "Object.h"
+#include "StructElement.h"
+#include <vector>
+
+class Dict;
+class PDFDoc;
+
+
+class StructTreeRoot
+{
+public:
+  StructTreeRoot(PDFDoc *docA, Dict* rootDict, GBool marked);
+  ~StructTreeRoot();
+
+  PDFDoc* getDoc() { return doc; }
+  Dict* getRoleMap() { return roleMap.isDict() ? roleMap.getDict() : NULL; }
+  Dict* getClassMap() { return classMap.isDict() ? classMap.getDict() : NULL; }
+  unsigned getNumElements() const { return elements.size(); }
+  const StructElement* getElement(int i) const { return elements.at(i); }
+  StructElement* getElement(int i) { return elements.at(i); }
+  // Stores element only if it is non-NULL and ok; the destructor deletes stored elements.
+  void appendElement(StructElement* element) { if (element && element->isOk()) elements.push_back(element); }
+  const StructElement* findElement(GooString* elementId) const;
+
+private:
+  // Not copyable (declared, never defined): a copy would delete the elements twice.
+  StructTreeRoot(const StructTreeRoot&);
+  StructTreeRoot& operator=(const StructTreeRoot&);
+  typedef std::vector<StructElement*> ElemPtrArray;
+  PDFDoc *doc;
+  Object roleMap;
+  Object classMap;
+  ElemPtrArray elements;
+  void parse(Dict* rootDict, GBool marked);
+};
+
+#endif
+
-- 
1.8.3



More information about the poppler mailing list