[poppler] 2 commits - poppler/Object.h poppler/StructElement.cc poppler/StructTreeRoot.cc poppler/StructTreeRoot.h utils/pdfinfo.cc

Adrian Johnson ajohnson at kemper.freedesktop.org
Thu Jan 4 05:14:24 UTC 2018


 poppler/Object.h          |    8 ++
 poppler/StructElement.cc  |    2 
 poppler/StructTreeRoot.cc |  124 ++++++++++++++++++++++++++--------------------
 poppler/StructTreeRoot.h  |   14 +++--
 utils/pdfinfo.cc          |    6 --
 5 files changed, 91 insertions(+), 63 deletions(-)

New commits:
commit 6233710068e8406cb44741bdc74d1a0e2582e5cf
Author: Adrian Johnson <ajohnson at redneon.com>
Date:   Thu Jan 4 15:41:11 2018 +1030

    Remove error for wrong child type for tagged PDF
    
    The wrong child type is harmless, and since a few PDFs have it, the error just adds noise to the output.
    
    Bug #103587

diff --git a/poppler/StructTreeRoot.cc b/poppler/StructTreeRoot.cc
index efcb6fb4..fed05b1b 100644
--- a/poppler/StructTreeRoot.cc
+++ b/poppler/StructTreeRoot.cc
@@ -95,9 +95,6 @@ void StructTreeRoot::parse(Dict *root)
       }
     }
   } else if (kids.isDict()) {
-    if (marked) {
-      error(errSyntaxWarning, -1, "K has a child of wrong type for a tagged PDF ({0:s})", kids.getTypeName());
-    }
     StructElement *child = new StructElement(kids.getDict(), this, NULL, seenElements);
     if (child->isOk()) {
       appendChild(child);
commit 321538259a9c79a99ce846a6ea2d94dd7fa56f61
Author: Adrian Johnson <ajohnson at redneon.com>
Date:   Sun Nov 26 20:43:15 2017 +1030

    Fix some bugs in StructTreeRoot parsing of parent tree
    
    - Add support for parsing child nodes in the number tree
    - Number tree keys do not have to be consecutive numbers. Use a
      map instead of a vector for parentTree.
    - Due to the performance impact of iterating a map instead of a
      vector in parentTreeAdd, add a reverse mapping from Ref
      to the corresponding parentTree entries.
    - Add mcid parameter to findParentElement() to enable finding
      the parent when there are multiple MCIDs on the same page.
    - Move RefCompare from pdfinfo.cc to Object.h so it can be
      used by other files.
    
    Bug #103912

diff --git a/poppler/Object.h b/poppler/Object.h
index f2ca20d1..a333e7c8 100644
--- a/poppler/Object.h
+++ b/poppler/Object.h
@@ -86,6 +86,14 @@ struct Ref {
   int gen;			// generation number
 };
 
+struct RefCompare {
+  bool operator() (const Ref& lhs, const Ref& rhs) const {
+    if (lhs.num != rhs.num)
+      return lhs.num < rhs.num;
+    return lhs.gen < rhs.gen;
+  }
+};
+
 //------------------------------------------------------------------------
 // object types
 //------------------------------------------------------------------------
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 62925474..e46fafea 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -1198,7 +1198,7 @@ StructElement *StructElement::parseChild(Object *ref,
      *      page will be always scanned when calling StructElement::getText().
      */
     Object mcidObj = childObj->dictLookup("MCID");
-    if (mcidObj.isInt()) {
+    if (!mcidObj.isInt()) {
       error(errSyntaxError, -1, "MCID object is wrong type ({0:s})", mcidObj.getTypeName());
       return NULL;
     }
diff --git a/poppler/StructTreeRoot.cc b/poppler/StructTreeRoot.cc
index 5f561115..efcb6fb4 100644
--- a/poppler/StructTreeRoot.cc
+++ b/poppler/StructTreeRoot.cc
@@ -55,50 +55,7 @@ void StructTreeRoot::parse(Dict *root)
   // be filled-in later when parsing them.
   Object obj = root->lookup("ParentTree");
   if (obj.isDict()) {
-    Object nums = obj.dictLookup("Nums");
-    if (nums.isArray()) {
-      if (nums.arrayGetLength() % 2 == 0) {
-        parentTree.resize(nums.arrayGetLength() / 2);
-        // Index numbers in even positions, references in odd ones
-        for (int i = 0; i < nums.arrayGetLength(); i += 2) {
-          Object index = nums.arrayGet(i);
-
-          if (!index.isInt()) {
-            error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i, index.getTypeName());
-            continue;
-          }
-          const int idx = index.getInt();
-          if (idx < 0 || idx >= (int)parentTree.size()) {
-            error(errSyntaxError, -1, "Nums item at position {0:d} is invalid value ({1:d}): [0..{2:d}]", i, idx, parentTree.size() - 1);
-            continue;
-          }
-
-          Object value = nums.arrayGetNF(i + 1);
-          if (value.isRef()) {
-            parentTree[idx].resize(1);
-            parentTree[idx][0].ref = value.getRef();
-          } else {
-	    value = nums.arrayGet(i + 1);
-	    if (value.isArray()) {
-	      parentTree[idx].resize(value.arrayGetLength());
-	      for (int j = 0; j < value.arrayGetLength(); j++) {
-		Object itemvalue = value.arrayGetNF(j);
-		if (itemvalue.isRef())
-		  parentTree[idx][j].ref = itemvalue.getRef();
-		else
-		  error(errSyntaxError, -1, "Nums array item at position {0:d}/{1:d} is invalid type ({2:s})", i, j, itemvalue.getTypeName());
-	      }
-	    } else {
-	      error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i + 1, value.getTypeName());
-	    }
-	  }
-        }
-      } else {
-        error(errSyntaxError, -1, "Nums array length is not a even ({0:d})", nums.arrayGetLength());
-      }
-    } else {
-      error(errSyntaxError, -1, "Nums object is wrong type ({0:s})", nums.getTypeName());
-    }
+    parseNumberTreeNode(obj.getDict());
   }
 
   std::set<int> seenElements;
@@ -154,14 +111,80 @@ void StructTreeRoot::parse(Dict *root)
   } else if (!kids.isNull()) {
     error(errSyntaxWarning, -1, "K in StructTreeRoot is wrong type ({0:s})", kids.getTypeName());
   }
+
+  // refToParentMap is only used during parsing. Ensure all memory used by it is freed.
+  std::multimap<Ref, Parent*, RefCompare>().swap(refToParentMap);
 }
 
-void StructTreeRoot::parentTreeAdd(const Ref &objectRef, StructElement *element)
+void StructTreeRoot::parseNumberTreeNode(Dict *node)
 {
-  for (std::vector< std::vector<Parent> >::iterator i = parentTree.begin(); i != parentTree.end(); ++i) {
-    for (std::vector<Parent>::iterator j = i->begin(); j != i->end(); ++j) {
-      if (j->ref.num == objectRef.num && j->ref.gen == objectRef.gen)
-        j->element = element;
+  Object kids = node->lookup("Kids");
+  if (kids.isArray()) {
+    for (int i = 0; i < kids.arrayGetLength(); i++) {
+      Object obj = kids.arrayGet(i);
+      if (obj.isDict()) {
+	parseNumberTreeNode(obj.getDict());
+      } else {
+	error(errSyntaxError, -1, "Kids item at position {0:d} is wrong type ({1:s})", i, obj.getTypeName());
+      }
     }
+    return;
+  } else if (!kids.isNull()) {
+    error(errSyntaxError, -1, "Kids object is wrong type ({0:s})", kids.getTypeName());
   }
+
+  Object nums = node->lookup("Nums");
+  if (nums.isArray()) {
+    if (nums.arrayGetLength() % 2 == 0) {
+      // keys in even positions, references in odd ones
+      for (int i = 0; i < nums.arrayGetLength(); i += 2) {
+	Object key = nums.arrayGet(i);
+
+	if (!key.isInt()) {
+	  error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i, key.getTypeName());
+	  continue;
+	}
+	int keyVal = key.getInt();
+	std::vector<Parent>& vec = parentTree[keyVal];
+
+	Object value = nums.arrayGet(i + 1);
+	if (value.isArray()) {
+	  vec.resize(value.arrayGetLength());
+	  memset(vec.data(), 0, vec.size()*sizeof(Parent*));
+	  for (int j = 0; j < value.arrayGetLength(); j++) {
+	    Object itemvalue = value.arrayGetNF(j);
+	    if (itemvalue.isRef()) {
+	      Ref ref = itemvalue.getRef();
+	      vec[j].ref = ref;
+	      refToParentMap.insert(std::pair<Ref, Parent*>(ref, &vec[j]));
+	    } else if (!itemvalue.isNull()) {
+	      error(errSyntaxError, -1, "Nums array item at position {0:d}/{1:d} is invalid type ({2:s})", i, j, itemvalue.getTypeName());
+	    }
+	  }
+	} else {
+	  value = nums.arrayGetNF(i + 1);
+	  if (value.isRef()) {
+	    Ref ref = value.getRef();
+	    vec.resize(1);
+	    vec[0].ref = ref;
+	    refToParentMap.insert(std::pair<Ref, Parent*>(ref, &vec[0]));
+	  } else {
+	    error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i + 1, value.getTypeName());
+	  }
+	}
+      }
+    } else {
+      error(errSyntaxError, -1, "Nums array length is not a even ({0:d})", nums.arrayGetLength());
+    }
+  } else {
+    error(errSyntaxError, -1, "Nums object is wrong type ({0:s})", nums.getTypeName());
+  }
+}
+
+
+void StructTreeRoot::parentTreeAdd(const Ref &objectRef, StructElement *element)
+{
+  auto range = refToParentMap.equal_range(objectRef);
+  for (auto it = range.first; it !=range.second; ++it)
+    it->second->element = element;
 }
diff --git a/poppler/StructTreeRoot.h b/poppler/StructTreeRoot.h
index 3b1f3c84..ca688499 100644
--- a/poppler/StructTreeRoot.h
+++ b/poppler/StructTreeRoot.h
@@ -18,6 +18,7 @@
 #include "goo/gtypes.h"
 #include "Object.h"
 #include "StructElement.h"
+#include <map>
 #include <vector>
 
 class Dict;
@@ -43,9 +44,12 @@ public:
     }
   }
 
-  const StructElement *findParentElement(unsigned index) const {
-    if (index < parentTree.size() && parentTree[index].size() == 1) {
-      return parentTree[index][0].element;
+  const StructElement *findParentElement(int key, unsigned mcid = 0) const {
+    auto it = parentTree.find(key);
+    if (it != parentTree.end()) {
+      if (mcid < it->second.size()) {
+	return it->second[mcid].element;
+      }
     }
     return NULL;
   }
@@ -71,9 +75,11 @@ private:
   Object roleMap;
   Object classMap;
   ElemPtrArray elements;
-  std::vector< std::vector<Parent> > parentTree;
+  std::map<int, std::vector<Parent> > parentTree;
+  std::multimap<Ref, Parent*, RefCompare> refToParentMap;
 
   void parse(Dict *rootDict);
+  void parseNumberTreeNode(Dict *node);
   void parentTreeAdd(const Ref &objectRef, StructElement *element);
 
   friend class StructElement;
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index a3099cf6..4cb569c5 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -291,12 +291,6 @@ static void printStruct(const StructElement *element, unsigned indent) {
   }
 }
 
-struct RefCompare {
-  bool operator() (const Ref& lhs, const Ref& rhs) const {
-    return lhs.num < rhs.num;
-  }
-};
-
 struct GooStringCompare {
   bool operator() (GooString* lhs, GooString* rhs) const {
     return lhs->cmp(const_cast<GooString*>(rhs)) < 0;


More information about the poppler mailing list