[poppler] Linearization support

Hib Eris hib at hiberis.nl
Thu Apr 29 04:52:32 PDT 2010


Hi all,

I have two series of patches that allow poppler to handle linearized
documents more efficiently.

The first series, linearization-patches-v1.txt, changes the way the
XRef table is initialized and used. A linearized document contains two
XRef tables, one at the beginning of the document and one at the end.
The one at the beginning contains the xref entries necessary to render
the first page. To allow rendering the first page without having to
load the xref table at the end, I changed the code to read the entries
in the end xref table (and other previous xref tables) only when needed.

The second series, page-on-demand-v1.txt, changes how the page tree is
initialized. In a linearized document the 'Page' objects are scattered
all over the document. If you only want to render a specific page, you
do not want to have to read all these Page objects to initialize a
page tree. Linearized documents contain extra information in hint
tables that allow you to find the Page objects without using the page
tree. These patches make poppler initialize the page tree only when
needed, instead of on document initialization. Because page objects
are parsed only when you call getPage(), it can no longer be
guaranteed that getPage() will always return a valid Page object, so
you will have to check its return value.

These patches, along with the HTTP streaming support, make it possible
to use pdftoppm to render a specific page of a remote document by only
loading the necessary document data for that page.


Hib
-------------- next part --------------
From a9f2d19d18dd509d3f495c4c2fbb830516fa0527 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 6 Apr 2010 19:24:42 +0200
Subject: [PATCH 01/12] Cleanup XRef constructors

---
 poppler/XRef.cc |   14 ++++++--------
 poppler/XRef.h  |    1 +
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 3ab23d9..8ae30b2 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -226,7 +226,7 @@ Object *ObjectStream::getObject(int objIdx, int objNum, Object *obj) {
 // XRef
 //------------------------------------------------------------------------
 
-XRef::XRef() {
+void XRef::init() {
   ok = gTrue;
   errCode = errNone;
   entries = NULL;
@@ -236,17 +236,15 @@ XRef::XRef() {
   objStr = NULL;
 }
 
+XRef::XRef() {
+  init();
+}
+
 XRef::XRef(BaseStream *strA) {
   Guint pos;
   Object obj;
 
-  ok = gTrue;
-  errCode = errNone;
-  size = 0;
-  entries = NULL;
-  streamEnds = NULL;
-  streamEndsLen = 0;
-  objStr = NULL;
+  init();
 
   encrypted = gFalse;
   permFlags = defPermFlags;
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 2dbd469..98db234 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -155,6 +155,7 @@ private:
   Guchar fileKey[16];		// file decryption key
   GBool ownerPasswordOk;	// true if owner password is correct
 
+  void init();
   Guint getStartXref();
   GBool readXRef(Guint *pos);
   GBool readXRefTable(Parser *parser, Guint *pos);
-- 
1.6.4.2


From f1bf4283fce1793d5d0a07810c7de4bfd0389562 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 6 Apr 2010 19:16:45 +0200
Subject: [PATCH 02/12] Create no more XRef entries than specified

---
 poppler/XRef.cc |  126 +++++++++++++++++++++++++++---------------------------
 poppler/XRef.h  |    5 ++-
 2 files changed, 67 insertions(+), 64 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 8ae30b2..49ff809 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -230,6 +230,7 @@ void XRef::init() {
   ok = gTrue;
   errCode = errNone;
   entries = NULL;
+  capacity = 0;
   size = 0;
   streamEnds = NULL;
   streamEndsLen = 0;
@@ -311,6 +312,50 @@ XRef::~XRef() {
   }
 }
 
+int XRef::reserve(int newSize)
+{
+  if (newSize > capacity) {
+
+    int realNewSize;
+    for (realNewSize = capacity ? 2 * capacity : 1024;
+          newSize > realNewSize && realNewSize > 0;
+          realNewSize <<= 1) ;
+    if ((realNewSize < 0) ||
+        (realNewSize >= INT_MAX / (int)sizeof(XRefEntry))) {
+      return 0;
+    }
+
+    entries = (XRefEntry *)greallocn(entries, realNewSize, sizeof(XRefEntry));
+    capacity = realNewSize;
+  }
+
+  return capacity;
+}
+
+int XRef::resize(int newSize)
+{
+  if (newSize > size) {
+
+    if (reserve(newSize) < newSize) return size;
+
+    for (int i = size; i < newSize; ++i) {
+      entries[i].offset = 0xffffffff;
+      entries[i].type = xrefEntryFree;
+      entries[i].obj.initNull ();
+      entries[i].updated = false;
+      entries[i].gen = 0;
+    }
+  } else {
+    for (int i = newSize; i < size; i++) {
+      entries[i].obj.free ();
+    }
+  }
+
+  size = newSize;
+
+  return size;
+}
+
 // Read the 'startxref' position.
 Guint XRef::getStartXref() {
   char buf[xrefSearchSize+1];
@@ -398,7 +443,7 @@ GBool XRef::readXRefTable(Parser *parser, Guint *pos) {
   GBool more;
   Object obj, obj2;
   Guint pos2;
-  int first, n, newSize, i;
+  int first, n, i;
 
   while (1) {
     parser->getObj(&obj);
@@ -417,29 +462,13 @@ GBool XRef::readXRefTable(Parser *parser, Guint *pos) {
     n = obj.getInt();
     obj.free();
     if (first < 0 || n < 0 || first + n < 0) {
-      goto err1;
+      goto err0;
     }
     if (first + n > size) {
-      for (newSize = size ? 2 * size : 1024;
-	   first + n > newSize && newSize > 0;
-	   newSize <<= 1) ;
-      if (newSize < 0) {
-	goto err1;
-      }
-      if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
+      if (resize(first + n) != first + n) {
         error(-1, "Invalid 'obj' parameters'");
-        goto err1;
+        goto err0;
       }
- 
-      entries = (XRefEntry *)greallocn(entries, newSize, sizeof(XRefEntry));
-      for (i = size; i < newSize; ++i) {
-	entries[i].offset = 0xffffffff;
-	entries[i].type = xrefEntryFree;
-	entries[i].obj.initNull ();
-	entries[i].updated = false;
-	entries[i].gen = 0;
-      }
-      size = newSize;
     }
     for (i = first; i < first + n; ++i) {
       if (!parser->getObj(&obj)->isInt()) {
@@ -520,6 +549,7 @@ GBool XRef::readXRefTable(Parser *parser, Guint *pos) {
 
  err1:
   obj.free();
+ err0:
   ok = gFalse;
   return gFalse;
 }
@@ -542,19 +572,10 @@ GBool XRef::readXRefStream(Stream *xrefStr, Guint *pos) {
     goto err1;
   }
   if (newSize > size) {
-    if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
-      error(-1, "Invalid 'size' parameter.");
-      return gFalse;
-    }
-    entries = (XRefEntry *)greallocn(entries, newSize, sizeof(XRefEntry));
-    for (i = size; i < newSize; ++i) {
-      entries[i].offset = 0xffffffff;
-      entries[i].type = xrefEntryFree;
-      entries[i].obj.initNull ();
-      entries[i].updated = false;
-      entries[i].gen = 0;
+    if (resize(newSize) != newSize) {
+      error(-1, "Invalid 'size' parameter");
+      goto err0;
     }
-    size = newSize;
   }
 
   if (!dict->lookupNF("W", &obj)->isArray() ||
@@ -627,31 +648,16 @@ GBool XRef::readXRefStream(Stream *xrefStr, Guint *pos) {
 
 GBool XRef::readXRefStreamSection(Stream *xrefStr, int *w, int first, int n) {
   Guint offset;
-  int type, gen, c, newSize, i, j;
+  int type, gen, c, i, j;
 
   if (first + n < 0) {
     return gFalse;
   }
   if (first + n > size) {
-    for (newSize = size ? 2 * size : 1024;
-	 first + n > newSize && newSize > 0;
-	 newSize <<= 1) ;
-    if (newSize < 0) {
-      return gFalse;
-    }
-    if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
-      error(-1, "Invalid 'size' inside xref table.");
+    if (resize(first + n) != size) {
+      error(-1, "Invalid 'size' inside xref table");
       return gFalse;
     }
-    entries = (XRefEntry *)greallocn(entries, newSize, sizeof(XRefEntry));
-    for (i = size; i < newSize; ++i) {
-      entries[i].offset = 0xffffffff;
-      entries[i].type = xrefEntryFree;
-      entries[i].obj.initNull ();
-      entries[i].updated = false;
-      entries[i].gen = 0;
-    }
-    size = newSize;
   }
   for (i = first; i < first + n; ++i) {
     if (w[0] == 0) {
@@ -712,13 +718,13 @@ GBool XRef::constructXRef() {
   int newSize;
   int streamEndsSize;
   char *p;
-  int i;
   GBool gotRoot;
   char* token = NULL;
   bool oneCycle = true;
   int offset = 0;
 
   gfree(entries);
+  capacity = 0;
   size = 0;
   entries = NULL;
 
@@ -800,19 +806,10 @@ GBool XRef::constructXRef() {
 		      error(-1, "Bad object number");
 		      return gFalse;
 		    }
-		    if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
-		      error(-1, "Invalid 'obj' parameters.");
+		    if (resize(newSize) != newSize) {
+		      error(-1, "Invalid 'obj' parameters");
 		      return gFalse;
 		    }
-		    entries = (XRefEntry *)
-		        greallocn(entries, newSize, sizeof(XRefEntry));
-		    for (i = size; i < newSize; ++i) {
-		      entries[i].offset = 0xffffffff;
-		      entries[i].type = xrefEntryFree;
-		      entries[i].obj.initNull ();
-		      entries[i].updated = false;
-		    }
-		    size = newSize;
 		  }
 		  if (entries[num].type == xrefEntryFree ||
 		      gen >= entries[num].gen) {
@@ -1085,7 +1082,10 @@ Guint XRef::strToUnsigned(char *s) {
 
 void XRef::add(int num, int gen, Guint offs, GBool used) {
   if (num >= size) {
-    entries = (XRefEntry *)greallocn(entries, num + 1, sizeof(XRefEntry));
+    if (num >= capacity) {
+      entries = (XRefEntry *)greallocn(entries, num + 1, sizeof(XRefEntry));
+      capacity = num + 1;
+    }
     for (int i = size; i < num + 1; ++i) {
       entries[i].offset = 0xffffffff;
       entries[i].type = xrefEntryFree;
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 98db234..f86e5ee 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -136,7 +136,8 @@ private:
   Guint start;			// offset in file (to allow for garbage
 				//   at beginning of file)
   XRefEntry *entries;		// xref entries
-  int size;			// size of <entries> array
+  int capacity;			// size of <entries> array
+  int size;			// number of entries
   int rootNum, rootGen;		// catalog dict
   GBool ok;			// true if xref table is valid
   int errCode;			// error code (if <ok> is false)
@@ -156,6 +157,8 @@ private:
   GBool ownerPasswordOk;	// true if owner password is correct
 
   void init();
+  int reserve(int newSize);
+  int resize(int newSize);
   Guint getStartXref();
   GBool readXRef(Guint *pos);
   GBool readXRefTable(Parser *parser, Guint *pos);
-- 
1.6.4.2


From fa5a0beb6f86a6cea7ba98ef5ef3c04a53b7319d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 28 Apr 2010 12:45:42 +0200
Subject: [PATCH 03/12] Use XRef::add() in XRef::addIndirectObject()

---
 poppler/XRef.cc |    4 +---
 1 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 49ff809..012f91c 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -1127,10 +1127,8 @@ Ref XRef::addIndirectObject (Object* o) {
   XRefEntry *e;
   if (entryIndexToUse == -1) {
     entryIndexToUse = size;
-    size++;
-    entries = (XRefEntry *)greallocn(entries, size, sizeof(XRefEntry));
+    add(entryIndexToUse, 0, 0, gFalse);
     e = &entries[entryIndexToUse];
-    e->gen = 0;
   } else {
     //reuse a free entry
     e = &entries[entryIndexToUse];
-- 
1.6.4.2


From 7ce2bb1aac50315145ebc21ae34b5c37f00e0c35 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 14 Apr 2010 12:20:49 +0200
Subject: [PATCH 04/12] Use XRef::getEntry() to access entries

---
 poppler/XRef.cc |   49 +++++++++++++++++++++++++------------------------
 poppler/XRef.h  |    2 +-
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 012f91c..d615ec0 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -927,7 +927,7 @@ Object *XRef::fetch(int num, int gen, Object *obj) {
     goto err;
   }
 
-  e = &entries[num];
+  e = getEntry(num);
   if(!e->obj.isNull ()) { //check for updated object
     obj = e->obj.copy(obj);
     return obj;
@@ -1047,20 +1047,20 @@ GBool XRef::getStreamEnd(Guint streamStart, Guint *streamEnd) {
   return gTrue;
 }
 
-int XRef::getNumEntry(Guint offset) const
+int XRef::getNumEntry(Guint offset)
 {
   if (size > 0)
   {
     int res = 0;
-    Guint resOffset = entries[0].offset;
-    XRefEntry e;
+    Guint resOffset = getEntry(0)->offset;
+    XRefEntry *e;
     for (int i = 1; i < size; ++i)
     {
-      e = entries[i];
-      if (e.offset < offset && e.offset >= resOffset)
+      e = getEntry(i);
+      if (e->offset < offset && e->offset >= resOffset)
       {
         res = i;
-        resOffset = e.offset;
+        resOffset = e->offset;
       }
     }
     return res;
@@ -1095,7 +1095,7 @@ void XRef::add(int num, int gen, Guint offs, GBool used) {
     }
     size = num + 1;
   }
-  XRefEntry *e = &entries[num];
+  XRefEntry *e = getEntry(num);
   e->gen = gen;
   e->obj.initNull ();
   e->updated = false;
@@ -1113,25 +1113,26 @@ void XRef::setModifiedObject (Object* o, Ref r) {
     error(-1,"XRef::setModifiedObject on unknown ref: %i, %i\n", r.num, r.gen);
     return;
   }
-  entries[r.num].obj.free();
-  o->copy(&entries[r.num].obj);
-  entries[r.num].updated = true;
+  XRefEntry *e = getEntry(r.num);
+  e->obj.free();
+  o->copy(&(e->obj));
+  e->updated = true;
 }
 
 Ref XRef::addIndirectObject (Object* o) {
   int entryIndexToUse = -1;
   for (int i = 1; entryIndexToUse == -1 && i < size; ++i) {
-    if (entries[i].type == xrefEntryFree) entryIndexToUse = i;
+    if (getEntry(i)->type == xrefEntryFree) entryIndexToUse = i;
   }
 
   XRefEntry *e;
   if (entryIndexToUse == -1) {
     entryIndexToUse = size;
     add(entryIndexToUse, 0, 0, gFalse);
-    e = &entries[entryIndexToUse];
+    e = getEntry(entryIndexToUse);
   } else {
     //reuse a free entry
-    e = &entries[entryIndexToUse];
+    e = getEntry(entryIndexToUse);
     //we don't touch gen number, because it should have been 
     //incremented when the object was deleted
   }
@@ -1147,13 +1148,13 @@ Ref XRef::addIndirectObject (Object* o) {
 
 void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
   //create free entries linked-list
-  if (entries[0].gen != 65535) {
+  if (getEntry(0)->gen != 65535) {
     error(-1, "XRef::writeToFile, entry 0 of the XRef is invalid (gen != 65535)\n");
   }
   int lastFreeEntry = 0;
   for (int i=0; i<size; i++) {
-    if (entries[i].type == xrefEntryFree) {
-      entries[lastFreeEntry].offset = i;
+    if (getEntry(i)->type == xrefEntryFree) {
+      getEntry(lastFreeEntry)->offset = i;
       lastFreeEntry = i;
     }
   }
@@ -1163,10 +1164,10 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
     outStr->printf("xref\r\n");
     outStr->printf("%i %i\r\n", 0, size);
     for (int i=0; i<size; i++) {
-      XRefEntry &e = entries[i];
+      XRefEntry *e = getEntry(i);
 
-      if(e.gen > 65535) e.gen = 65535; //cap generation number to 65535 (required by PDFReference)
-      outStr->printf("%010i %05i %c\r\n", e.offset, e.gen, (e.type==xrefEntryFree)?'f':'n');
+      if(e->gen > 65535) e->gen = 65535; //cap generation number to 65535 (required by PDFReference)
+      outStr->printf("%010i %05i %c\r\n", e->offset, e->gen, (e->type==xrefEntryFree)?'f':'n');
     }
   } else {
     //write the new xref
@@ -1175,16 +1176,16 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
     while (i < size) {
       int j;
       for(j=i; j<size; j++) { //look for consecutive entries
-        if ((entries[j].type == xrefEntryFree) && (entries[j].gen == 0))
+        if ((getEntry(j)->type == xrefEntryFree) && (getEntry(j)->gen == 0))
           break;
       }
       if (j-i != 0)
       {
         outStr->printf("%i %i\r\n", i, j-i);
         for (int k=i; k<j; k++) {
-          XRefEntry &e = entries[k];
-          if(e.gen > 65535) e.gen = 65535; //cap generation number to 65535 (required by PDFReference)
-          outStr->printf("%010i %05i %c\r\n", e.offset, e.gen, (e.type==xrefEntryFree)?'f':'n');
+          XRefEntry *e = getEntry(k);
+          if(e->gen > 65535) e->gen = 65535; //cap generation number to 65535 (required by PDFReference)
+          outStr->printf("%010i %05i %c\r\n", e->offset, e->gen, (e->type==xrefEntryFree)?'f':'n');
         }
         i = j;
       }
diff --git a/poppler/XRef.h b/poppler/XRef.h
index f86e5ee..344b764 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -117,7 +117,7 @@ public:
   GBool getStreamEnd(Guint streamStart, Guint *streamEnd);
 
   // Retuns the entry that belongs to the offset
-  int getNumEntry(Guint offset) const;
+  int getNumEntry(Guint offset);
 
   // Direct access.
   int getSize() { return size; }
-- 
1.6.4.2


From d1849ee55fb37f20db86e7a5cf2f44e63478cd66 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 15 Apr 2010 17:34:13 +0200
Subject: [PATCH 05/12] Read XRef table sections on demand

---
 poppler/XRef.cc |   37 ++++++++++++++++++++++++++++++++-----
 poppler/XRef.h  |    6 ++++--
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index d615ec0..a0c77fc 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -242,7 +242,6 @@ XRef::XRef() {
 }
 
 XRef::XRef(BaseStream *strA) {
-  Guint pos;
   Object obj;
 
   init();
@@ -254,11 +253,11 @@ XRef::XRef(BaseStream *strA) {
   // read the trailer
   str = strA;
   start = str->getStart();
-  pos = getStartXref();
+  prevXRefOffset = pos;
 
   // if there was a problem with the 'startxref' position, try to
   // reconstruct the xref table
-  if (pos == 0) {
+  if (prevXRefOffset == 0) {
     if (!(ok = constructXRef())) {
       errCode = errDamaged;
       return;
@@ -266,7 +265,7 @@ XRef::XRef(BaseStream *strA) {
 
   // read the xref table
   } else {
-    while (readXRef(&pos)) ;
+     readXRef(&prevXRefOffset);
 
     // if there was a problem with the xref table,
     // try to reconstruct it
@@ -278,6 +277,18 @@ XRef::XRef(BaseStream *strA) {
     }
   }
 
+  // set size according to trailer dict
+  trailerDict.dictLookupNF("Size", &obj);
+  if (obj.isInt() && (resize(obj.getInt()) == obj.getInt())) {
+    obj.free();
+  } else {
+    obj.free();
+    if (!(ok = constructXRef())) {
+      errCode = errDamaged;
+      return;
+    }
+  }
+
   // get the root dictionary (catalog) object
   trailerDict.dictLookupNF("Root", &obj);
   if (obj.isRef()) {
@@ -340,7 +351,7 @@ int XRef::resize(int newSize)
 
     for (int i = size; i < newSize; ++i) {
       entries[i].offset = 0xffffffff;
-      entries[i].type = xrefEntryFree;
+      entries[i].type = xrefEntryNone;
       entries[i].obj.initNull ();
       entries[i].updated = false;
       entries[i].gen = 0;
@@ -1194,3 +1205,19 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
   }
 }
 
+XRefEntry *XRef::getEntry(int i)
+{
+  if (entries[i].type == xrefEntryNone) {
+
+    while (readXRef(&prevXRefOffset) && (entries[i].type == xrefEntryNone)) ;
+
+    if (entries[i].type == xrefEntryNone) {
+       error(-1, "Invalid XRef entry");
+       entries[i].type = xrefEntryFree;
+    }
+  }
+
+  return &entries[i];
+}
+
+
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 344b764..a013e5a 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -45,7 +45,8 @@ class ObjectStream;
 enum XRefEntryType {
   xrefEntryFree,
   xrefEntryUncompressed,
-  xrefEntryCompressed
+  xrefEntryCompressed,
+  xrefEntryNone
 };
 
 struct XRefEntry {
@@ -121,7 +122,7 @@ public:
 
   // Direct access.
   int getSize() { return size; }
-  XRefEntry *getEntry(int i) { return &entries[i]; }
+  XRefEntry *getEntry(int i);
   Object *getTrailerDict() { return &trailerDict; }
 
   // Write access
@@ -155,6 +156,7 @@ private:
   int permFlags;		// permission bits
   Guchar fileKey[16];		// file decryption key
   GBool ownerPasswordOk;	// true if owner password is correct
+  Guint prevXRefOffset;		// position of prev XRef section (= next to read)
 
   void init();
   int reserve(int newSize);
-- 
1.6.4.2


From 298adf8080b96fa7d4c2534c087ab50453ddb679 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 18:26:17 +0100
Subject: [PATCH 06/12] Add Linearization dictionary support

---
 CMakeLists.txt           |    2 +
 poppler/Linearization.cc |  212 ++++++++++++++++++++++++++++++++++++++++++++++
 poppler/Linearization.h  |   45 ++++++++++
 poppler/Makefile.am      |    2 +
 poppler/PDFDoc.cc        |   13 +++
 poppler/PDFDoc.h         |    5 +
 6 files changed, 279 insertions(+), 0 deletions(-)
 create mode 100644 poppler/Linearization.cc
 create mode 100644 poppler/Linearization.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1eba1fe..a119a6d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -249,6 +249,7 @@ set(poppler_SRCS
   poppler/JBIG2Stream.cc
   poppler/Lexer.cc
   poppler/Link.cc
+  poppler/Linearization.cc
   poppler/LocalPDFDocBuilder.cc
   poppler/NameToCharCode.cc
   poppler/Object.cc
@@ -394,6 +395,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/JBIG2Stream.h
     poppler/Lexer.h
     poppler/Link.h
+    poppler/Linearization.h
     poppler/LocalPDFDocBuilder.h
     poppler/Movie.h
     poppler/NameToCharCode.h
diff --git a/poppler/Linearization.cc b/poppler/Linearization.cc
new file mode 100644
index 0000000..b2791c8
--- /dev/null
+++ b/poppler/Linearization.cc
@@ -0,0 +1,212 @@
+//========================================================================
+//
+// Linearization.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#include "Linearization.h"
+#include "Parser.h"
+#include "Lexer.h"
+
+//------------------------------------------------------------------------
+// Linearization
+//------------------------------------------------------------------------
+
+Linearization::Linearization (BaseStream *str)
+{
+  Parser *parser;
+  Object obj1, obj2, obj3, obj4, obj5;
+
+  linDict.initNull();
+
+  str->reset();
+  obj1.initNull();
+  parser = new Parser(NULL,
+      new Lexer(NULL, str->makeSubStream(str->getStart(), gFalse, 0, &obj1)),
+      gTrue);
+  parser->getObj(&obj1);
+  parser->getObj(&obj2);
+  parser->getObj(&obj3);
+  parser->getObj(&linDict);
+  parser->getObj(&obj4);
+  if (obj1.isInt() && obj2.isInt() && obj3.isCmd("obj") && linDict.isDict()) {
+    linDict.dictLookup("Linearized", &obj5);
+    if (!(obj5.isNum() && obj5.getNum() > 0)) {
+       linDict.free();
+       linDict.initNull();
+    }
+    obj5.free();
+  }
+  obj4.free();
+  obj4.free();
+  obj3.free();
+  obj2.free();
+  obj1.free();
+  delete parser;
+}
+
+Linearization:: ~Linearization()
+{
+  linDict.free();
+}
+
+Guint Linearization::getLength()
+{
+  int length = -1;
+
+  if (linDict.isDict()) {
+    linDict.getDict()->lookupInt("L", NULL, &length);
+  }
+
+  return length;
+}
+
+Guint Linearization::getHintsOffset()
+{
+  int hintsOffset = -1;
+
+  Object obj1, obj2;
+
+  if (linDict.isDict()) {
+    linDict.dictLookup("H", &obj1);
+    if (obj1.isArray() && (obj1.arrayGetLength()>=1)) {
+      obj1.arrayGet(0, &obj2);
+      if (obj2.isInt()) {
+        hintsOffset = obj2.getInt();
+      }
+      obj2.free();
+    }
+    obj1.free();
+  }
+
+  return hintsOffset;
+}
+
+Guint Linearization::getHintsLength()
+{
+  int hintsLength = -1;
+
+  Object obj1, obj2;
+
+  if (linDict.isDict()) {
+    linDict.dictLookup("H", &obj1);
+    if (obj1.isArray() && (obj1.arrayGetLength()>=2)) {
+      obj1.arrayGet(1, &obj2);
+      if (obj2.isInt()) {
+        hintsLength = obj2.getInt();
+      }
+      obj2.free();
+    }
+    obj1.free();
+  }
+
+  return hintsLength;
+}
+
+Guint Linearization::getHintsOffset2()
+{
+  int hintsOffset2 = -1;
+
+  Object obj1, obj2;
+
+  if (linDict.isDict()) {
+    linDict.dictLookup("H", &obj1);
+    if (obj1.isArray() && (obj1.arrayGetLength()>=3)) {
+      obj1.arrayGet(2, &obj2);
+      if (obj2.isInt()) {
+        hintsOffset2 = obj2.getInt();
+      }
+      obj2.free();
+    }
+    obj1.free();
+  }
+
+  return hintsOffset2;
+}
+
+Guint Linearization::getHintsLength2()
+{
+  int hintsLength2 = -1;
+
+  Object obj1, obj2;
+
+  if (linDict.isDict()) {
+    linDict.dictLookup("H", &obj1);
+    if (obj1.isArray() && (obj1.arrayGetLength()>=4)) {
+      obj1.arrayGet(3, &obj2);
+      if (obj2.isInt()) {
+        hintsLength2 = obj2.getInt();
+      }
+      obj2.free();
+    }
+    obj1.free();
+  }
+
+  return hintsLength2;
+}
+
+int Linearization::getObjectNumberFirst()
+{
+  int objectNumberFirst = -1;
+
+  if (linDict.isDict()) {
+    linDict.getDict()->lookupInt("O", NULL, &objectNumberFirst);
+  }
+
+  return objectNumberFirst;
+}
+
+
+Guint Linearization::getEndFirst()
+{
+  int pageEndFirst = -1;
+
+  if (linDict.isDict()) {
+    linDict.getDict()->lookupInt("E", NULL, &pageEndFirst);
+  }
+
+  return pageEndFirst;
+}
+
+int Linearization::getNumPages()
+{
+  int numPages;
+
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("N", NULL, &numPages) &&
+      numPages > 0) {
+    return numPages;
+  } else {
+    error(-1, "Page count in linearization table is invalid");
+    return 0;
+  }
+}
+
+Guint Linearization::getMainXRefEntriesOffset()
+{
+  int mainXRefEntriesOffset = -1;
+
+  if (linDict.isDict()) {
+    linDict.getDict()->lookupInt("T", NULL, &mainXRefEntriesOffset);
+  }
+
+  return mainXRefEntriesOffset;
+}
+
+int Linearization::getPageFirst()
+{
+  int pageFirst = 1; // Optional, defaults to 1.
+
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("P", NULL, &pageFirst)) {
+     pageFirst++;
+  }
+
+  return pageFirst;
+}
+
+
diff --git a/poppler/Linearization.h b/poppler/Linearization.h
new file mode 100644
index 0000000..6728a75
--- /dev/null
+++ b/poppler/Linearization.h
@@ -0,0 +1,45 @@
+//========================================================================
+//
+// Linearization.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#ifndef LINEARIZATION_H
+#define LINEARIZATION_H
+
+#include "goo/gtypes.h"
+#include "Object.h"
+class BaseStream;
+
+//------------------------------------------------------------------------
+// Linearization
+//------------------------------------------------------------------------
+
+class Linearization {
+public:
+
+  Linearization(BaseStream *str);
+  ~Linearization();
+
+  Guint getLength();
+  Guint getHintsOffset();
+  Guint getHintsLength();
+  Guint getHintsOffset2();
+  Guint getHintsLength2();
+  int getObjectNumberFirst();
+  Guint getEndFirst();
+  int getNumPages();
+  Guint getMainXRefEntriesOffset();
+  int getPageFirst();
+
+private:
+
+  Object linDict;
+
+};
+
+#endif
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 5dd8082..8c1e019 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -209,6 +209,7 @@ poppler_include_HEADERS =	\
 	JArithmeticDecoder.h	\
 	JBIG2Stream.h		\
 	Lexer.h			\
+	Linearization.h 	\
 	Link.h			\
 	LocalPDFDocBuilder.h	\
 	Movie.h                 \
@@ -287,6 +288,7 @@ libpoppler_la_SOURCES =		\
 	JArithmeticDecoder.cc	\
 	JBIG2Stream.cc		\
 	Lexer.cc 		\
+	Linearization.cc 	\
 	Link.cc 		\
 	LocalPDFDocBuilder.cc	\
 	Movie.cc                \
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 2d1477d..fe568a0 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -52,6 +52,7 @@
 #include "Catalog.h"
 #include "Stream.h"
 #include "XRef.h"
+#include "Linearization.h"
 #include "Link.h"
 #include "OutputDev.h"
 #include "Error.h"
@@ -82,6 +83,7 @@ void PDFDoc::init()
   file = NULL;
   str = NULL;
   xref = NULL;
+  linearization = NULL;
   catalog = NULL;
 #ifndef DISABLE_OUTLINE
   outline = NULL;
@@ -242,6 +244,9 @@ PDFDoc::~PDFDoc() {
   if (xref) {
     delete xref;
   }
+  if (linearization) {
+    delete linearization;
+  }
   if (str) {
     delete str;
   }
@@ -412,6 +417,14 @@ void PDFDoc::processLinks(OutputDev *out, int page) {
   catalog->getPage(page)->processLinks(out, catalog);
 }
 
+Linearization *PDFDoc::getLinearization()
+{
+  if (!linearization) {
+    linearization = new Linearization(str);
+  }
+  return linearization;
+}
+
 GBool PDFDoc::isLinearized() {
   Parser *parser;
   Object obj1, obj2, obj3, obj4, obj5;
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 6d7dea2..011f4c0 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -48,6 +48,7 @@ class Links;
 class LinkAction;
 class LinkDest;
 class Outline;
+class Linearization;
 
 enum PDFWriteMode {
   writeStandard,
@@ -89,6 +90,9 @@ public:
   // Get file name.
   GooString *getFileName() { return fileName; }
 
+  // Get the linearization table.
+  Linearization *getLinearization();
+
   // Get the xref table.
   XRef *getXRef() { return xref; }
 
@@ -242,6 +246,7 @@ private:
   void *guiData;
   int pdfMajorVersion;
   int pdfMinorVersion;
+  Linearization *linearization;
   XRef *xref;
   Catalog *catalog;
 #ifndef DISABLE_OUTLINE
-- 
1.6.4.2


From 677e771b646c62788bdad0ad1dbeeb953c29f737 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 13 Apr 2010 18:51:40 +0200
Subject: [PATCH 07/12] Add getLength() to BaseStream

---
 poppler/Stream.cc |   11 ++++++-----
 poppler/Stream.h  |   11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/poppler/Stream.cc b/poppler/Stream.cc
index 0771e25..f4f9351 100644
--- a/poppler/Stream.cc
+++ b/poppler/Stream.cc
@@ -363,8 +363,9 @@ void FileOutStream::printf(const char *format, ...)
 // BaseStream
 //------------------------------------------------------------------------
 
-BaseStream::BaseStream(Object *dictA) {
+BaseStream::BaseStream(Object *dictA, Guint lengthA) {
   dict = *dictA;
+  length = lengthA;
 }
 
 BaseStream::~BaseStream() {
@@ -677,7 +678,7 @@ GBool StreamPredictor::getNextLine() {
 
 FileStream::FileStream(FILE *fA, Guint startA, GBool limitedA,
 		       Guint lengthA, Object *dictA):
-    BaseStream(dictA) {
+    BaseStream(dictA, lengthA) {
   f = fA;
   start = startA;
   limited = limitedA;
@@ -802,7 +803,7 @@ void FileStream::moveStart(int delta) {
 
 CachedFileStream::CachedFileStream(CachedFile *ccA, Guint startA,
         GBool limitedA, Guint lengthA, Object *dictA)
-  : BaseStream(dictA)
+  : BaseStream(dictA, lengthA)
 {
   cc = ccA;
   start = startA;
@@ -900,7 +901,7 @@ void CachedFileStream::moveStart(int delta)
 //------------------------------------------------------------------------
 
 MemStream::MemStream(char *bufA, Guint startA, Guint lengthA, Object *dictA):
-    BaseStream(dictA) {
+    BaseStream(dictA, lengthA) {
   buf = bufA;
   start = startA;
   length = lengthA;
@@ -964,7 +965,7 @@ void MemStream::moveStart(int delta) {
 
 EmbedStream::EmbedStream(Stream *strA, Object *dictA,
 			 GBool limitedA, Guint lengthA):
-    BaseStream(dictA) {
+    BaseStream(dictA, lengthA) {
   str = strA;
   limited = limitedA;
   length = lengthA;
diff --git a/poppler/Stream.h b/poppler/Stream.h
index 49ae8fb..6896d20 100644
--- a/poppler/Stream.h
+++ b/poppler/Stream.h
@@ -240,7 +240,7 @@ private:
 class BaseStream: public Stream {
 public:
 
-  BaseStream(Object *dictA);
+  BaseStream(Object *dictA, Guint lengthA);
   virtual ~BaseStream();
   virtual Stream *makeSubStream(Guint start, GBool limited,
 				Guint length, Object *dict) = 0;
@@ -250,11 +250,16 @@ public:
   virtual Stream *getUndecodedStream() { return this; }
   virtual Dict *getDict() { return dict.getDict(); }
   virtual GooString *getFileName() { return NULL; }
+  virtual Guint getLength() { return length; }
 
   // Get/set position of first byte of stream within the file.
   virtual Guint getStart() = 0;
   virtual void moveStart(int delta) = 0;
 
+protected:
+
+  Guint length;
+
 private:
 
   Object dict;
@@ -401,7 +406,6 @@ private:
   FILE *f;
   Guint start;
   GBool limited;
-  Guint length;
   char buf[fileStreamBufSize];
   char *bufPtr;
   char *bufEnd;
@@ -446,7 +450,6 @@ private:
   CachedFile *cc;
   Guint start;
   GBool limited;
-  Guint length;
   char buf[cachedStreamBufSize];
   char *bufPtr;
   char *bufEnd;
@@ -490,7 +493,6 @@ private:
 
   char *buf;
   Guint start;
-  Guint length;
   char *bufEnd;
   char *bufPtr;
   GBool needFree;
@@ -530,7 +532,6 @@ private:
 
   Stream *str;
   GBool limited;
-  Guint length;
 };
 
 //------------------------------------------------------------------------
-- 
1.6.4.2


From b7f47f34df6da648b34a810e07baf2ba1a4cd32e Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 19:16:14 +0100
Subject: [PATCH 08/12] Pass size of file when creating FileStream

---
 poppler/PDFDoc.cc |   19 +++++++++++++++++--
 1 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index fe568a0..0a018fd 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -44,6 +44,7 @@
 #ifdef _WIN32
 #  include <windows.h>
 #endif
+#include <sys/stat.h>
 #include "goo/gstrtod.h"
 #include "goo/GooString.h"
 #include "poppler-config.h"
@@ -98,12 +99,18 @@ PDFDoc::PDFDoc()
 PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword,
 	       GooString *userPassword, void *guiDataA) {
   Object obj;
+  int size = 0;
 
   init();
 
   fileName = fileNameA;
   guiData = guiDataA;
 
+  struct stat buf;
+  if (stat(fileName->getCString(), &buf) == 0) {
+     size = buf.st_size;
+  }
+
   // try to open file
 #ifdef VMS
   file = fopen(fileName->getCString(), "rb", "ctx=stm");
@@ -123,7 +130,7 @@ PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword,
 
   // create stream
   obj.initNull();
-  str = new FileStream(file, 0, gFalse, 0, &obj);
+  str = new FileStream(file, 0, gFalse, size, &obj);
 
   ok = setup(ownerPassword, userPassword);
 }
@@ -154,11 +161,19 @@ PDFDoc::PDFDoc(wchar_t *fileNameA, int fileNameLen, GooString *ownerPassword,
 
   // try to open file
   // NB: _wfopen is only available in NT
+  struct _stat buf;  // _stat() and _wstat() both fill a struct _stat
+  int size = 0;  // stays 0 when the file size cannot be determined
   version.dwOSVersionInfoSize = sizeof(version);
   GetVersionEx(&version);
   if (version.dwPlatformId == VER_PLATFORM_WIN32_NT) {
+    if (_wstat(fileName2, &buf) == 0) {
+      size = buf.st_size;
+    }
     file = _wfopen(fileName2, L"rb");
   } else {
+    if (_stat(fileName->getCString(), &buf) == 0) {
+      size = buf.st_size;
+    }
     file = fopen(fileName->getCString(), "rb");
   }
   if (!file) {
@@ -169,7 +184,7 @@ PDFDoc::PDFDoc(wchar_t *fileNameA, int fileNameLen, GooString *ownerPassword,
 
   // create stream
   obj.initNull();
-  str = new FileStream(file, 0, gFalse, 0, &obj);
+  str = new FileStream(file, 0, gFalse, size, &obj);
 
   ok = setup(ownerPassword, userPassword);
 }
-- 
1.6.4.2


From cd0ffbf22dabb7582ac5e64b0dfbce31eb50e99c Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 19:32:59 +0100
Subject: [PATCH 09/12] Improve linearization check

---
 poppler/PDFDoc.cc |   33 +++++----------------------------
 1 files changed, 5 insertions(+), 28 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 0a018fd..35f5cc9 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -441,34 +441,11 @@ Linearization *PDFDoc::getLinearization()
 }
 
 GBool PDFDoc::isLinearized() {
-  Parser *parser;
-  Object obj1, obj2, obj3, obj4, obj5;
-  GBool lin;
-
-  lin = gFalse;
-  obj1.initNull();
-  parser = new Parser(xref,
-	     new Lexer(xref,
-	       str->makeSubStream(str->getStart(), gFalse, 0, &obj1)),
-	     gTrue);
-  parser->getObj(&obj1);
-  parser->getObj(&obj2);
-  parser->getObj(&obj3);
-  parser->getObj(&obj4);
-  if (obj1.isInt() && obj2.isInt() && obj3.isCmd("obj") &&
-      obj4.isDict()) {
-    obj4.dictLookup("Linearized", &obj5);
-    if (obj5.isNum() && obj5.getNum() > 0) {
-      lin = gTrue;
-    }
-    obj5.free();
-  }
-  obj4.free();
-  obj3.free();
-  obj2.free();
-  obj1.free();
-  delete parser;
-  return lin;
+  // Linearized iff the stream length is known (non-zero) and equal to the
+  // length reported by the Linearization object.
+  if (!str->getLength())
+    return gFalse;
+  return getLinearization()->getLength() == str->getLength() ? gTrue : gFalse;
 }
 
 int PDFDoc::saveAs(GooString *name, PDFWriteMode mode) {
-- 
1.6.4.2


From 15c02cc41e8af2e0d73b435c843e19cd8d904c93 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 7 Apr 2010 12:05:56 +0200
Subject: [PATCH 10/12] Move getStartXref from XRef to PDFDoc

---
 poppler/PDFDoc.cc |   61 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 poppler/PDFDoc.h  |    5 ++++
 poppler/XRef.cc   |   50 +------------------------------------------
 poppler/XRef.h    |    6 +----
 4 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 35f5cc9..6c4159a 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -34,6 +34,7 @@
 #pragma implementation
 #endif
 
+#include <ctype.h>
 #include <locale.h>
 #include <stdio.h>
 #include <errno.h>
@@ -72,6 +73,9 @@
 #define headerSearchSize 1024	// read this many bytes at beginning of
 				//   file to look for '%PDF'
 
+#define xrefSearchSize 1024	// read this many bytes at end of file
+				//   to look for 'startxref'
+
 //------------------------------------------------------------------------
 // PDFDoc
 //------------------------------------------------------------------------
@@ -89,6 +93,7 @@ void PDFDoc::init()
 #ifndef DISABLE_OUTLINE
   outline = NULL;
 #endif
+  startXRefPos = ~(Guint)0;
 }
 
 PDFDoc::PDFDoc()
@@ -222,7 +227,7 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
   checkHeader();
 
   // read xref table
-  xref = new XRef(str);
+  xref = new XRef(str, getStartXRef());
   if (!xref->isOk()) {
     error(-1, "Couldn't read xref table");
     errCode = xref->getErrorCode();
@@ -889,7 +894,7 @@ void PDFDoc::writeTrailer (Guint uxrefOffset, int uxrefSize, OutStream* outStr,
   trailerDict->set("Root", &obj1);
 
   if (incrUpdate) { 
-    obj1.initInt(xref->getLastXRefPos());
+    obj1.initInt(getStartXRef());
     trailerDict->set("Prev", &obj1);
   }
   
@@ -927,3 +932,55 @@ PDFDoc *PDFDoc::ErrorPDFDoc(int errorCode, GooString *fileNameA)
 
   return doc;
 }
+
+Guint PDFDoc::strToUnsigned(char *s) {
+  // Convert up to 10 leading decimal digits of <s>; stops at the first
+  // non-digit.  The cast keeps isdigit() well-defined for bytes >= 0x80.
+  Guint x = 0;
+  char *p;
+  int i;
+  for (p = s, i = 0; *p && isdigit((unsigned char)*p) && i < 10; ++p, ++i) {
+    x = 10 * x + (*p - '0');
+  }
+  return x;
+}
+
+// Read the 'startxref' position (cached after the first call; 0 if absent).
+Guint PDFDoc::getStartXRef()
+{
+  if (startXRefPos == ~(Guint)0) {
+
+    {
+      char buf[xrefSearchSize+1];
+      char *p;
+      int c, n, i;
+
+      // read last xrefSearchSize bytes
+      str->setPos(xrefSearchSize, -1);
+      for (n = 0; n < xrefSearchSize; ++n) {
+        if ((c = str->getChar()) == EOF) {
+          break;
+        }
+        buf[n] = c;
+      }
+      buf[n] = '\0';
+
+      // find startxref
+      for (i = n - 9; i >= 0; --i) {
+        if (!strncmp(&buf[i], "startxref", 9)) {
+          break;
+        }
+      }
+      if (i < 0) {
+        startXRefPos = 0;  // keyword not found: do not parse unrelated bytes
+      } else {
+        for (p = &buf[i+9]; isspace((unsigned char)*p); ++p) ;
+        startXRefPos = strToUnsigned(p);
+      }
+    }
+  }
+
+  return startXRefPos;
+}
+
+
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 011f4c0..d093b59 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -239,6 +239,9 @@ private:
   GBool checkFooter();
   void checkHeader();
   GBool checkEncryption(GooString *ownerPassword, GooString *userPassword);
+  // Get the offset of the start xref table.
+  Guint getStartXRef();
+  Guint strToUnsigned(char *s);
 
   GooString *fileName;
   FILE *file;
@@ -258,6 +261,8 @@ private:
   //If there is an error opening the PDF file with fopen() in the constructor, 
   //then the POSIX errno will be here.
   int fopenErrno;
+
+  Guint startXRefPos;		// offset of last xref table
 };
 
 #endif
diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index a0c77fc..b69bf9a 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -47,11 +47,6 @@
 #include "XRef.h"
 
 //------------------------------------------------------------------------
-
-#define xrefSearchSize 1024	// read this many bytes at end of file
-				//   to look for 'startxref'
-
-//------------------------------------------------------------------------
 // Permission bits
 // Note that the PDF spec uses 1 base (eg bit 3 is 1<<2)
 //------------------------------------------------------------------------
@@ -241,7 +236,7 @@ XRef::XRef() {
   init();
 }
 
-XRef::XRef(BaseStream *strA) {
+XRef::XRef(BaseStream *strA, Guint pos) {
   Object obj;
 
   init();
@@ -367,37 +362,6 @@ int XRef::resize(int newSize)
   return size;
 }
 
-// Read the 'startxref' position.
-Guint XRef::getStartXref() {
-  char buf[xrefSearchSize+1];
-  char *p;
-  int c, n, i;
-
-  // read last xrefSearchSize bytes
-  str->setPos(xrefSearchSize, -1);
-  for (n = 0; n < xrefSearchSize; ++n) {
-    if ((c = str->getChar()) == EOF) {
-      break;
-    }
-    buf[n] = c;
-  }
-  buf[n] = '\0';
-
-  // find startxref
-  for (i = n - 9; i >= 0; --i) {
-    if (!strncmp(&buf[i], "startxref", 9)) {
-      break;
-    }
-  }
-  if (i < 0) {
-    return 0;
-  }
-  for (p = &buf[i+9]; isspace(*p); ++p) ;
-  lastXRefPos = strToUnsigned(p);
-
-  return lastXRefPos;
-}
-
 // Read one xref table section.  Also reads the associated trailer
 // dictionary, and returns the prev pointer (if any).
 GBool XRef::readXRef(Guint *pos) {
@@ -1079,18 +1043,6 @@ int XRef::getNumEntry(Guint offset)
   else return -1;
 }
 
-Guint XRef::strToUnsigned(char *s) {
-  Guint x;
-  char *p;
-  int i;
-
-  x = 0;
-  for (p = s, i = 0; *p && isdigit(*p) && i < 10; ++p, ++i) {
-    x = 10 * x + (*p - '0');
-  }
-  return x;
-}
-
 void XRef::add(int num, int gen, Guint offs, GBool used) {
   if (num >= size) {
     if (num >= capacity) {
diff --git a/poppler/XRef.h b/poppler/XRef.h
index a013e5a..7cd8ebe 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -63,7 +63,7 @@ public:
   // Constructor, create an empty XRef, used for PDF writing
   XRef();
   // Constructor.  Read xref table from stream.
-  XRef(BaseStream *strA);
+  XRef(BaseStream *strA, Guint pos);
 
   // Destructor.
   ~XRef();
@@ -106,9 +106,6 @@ public:
   // Return the number of objects in the xref table.
   int getNumObjects() { return size; }
 
-  // Return the offset of the last xref table.
-  Guint getLastXRefPos() { return lastXRefPos; }
-
   // Return the catalog object reference.
   int getRootNum() { return rootNum; }
   int getRootGen() { return rootGen; }
@@ -143,7 +140,6 @@ private:
   GBool ok;			// true if xref table is valid
   int errCode;			// error code (if <ok> is false)
   Object trailerDict;		// trailer dictionary
-  Guint lastXRefPos;		// offset of last xref table
   Guint *streamEnds;		// 'endstream' positions - only used in
 				//   damaged files
   int streamEndsLen;		// number of valid entries in streamEnds
-- 
1.6.4.2


From a4caf1906288f6cf40f1bfa842ea08425c32b967 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 7 Apr 2010 12:35:05 +0200
Subject: [PATCH 11/12] Use XRef table at start of linearized document

---
 poppler/PDFDoc.cc |   27 ++++++++++++++++++++++++++-
 1 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 6c4159a..e590f3c 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -73,6 +73,10 @@
 #define headerSearchSize 1024	// read this many bytes at beginning of
 				//   file to look for '%PDF'
 
+#define linearizationSearchSize 1024	// read this many bytes at beginning of
+					// file to look for linearization
+					// dictionary
+
 #define xrefSearchSize 1024	// read this many bytes at end of file
 				//   to look for 'startxref'
 
@@ -950,7 +954,28 @@ Guint PDFDoc::getStartXRef()
 {
   if (startXRefPos == ~(Guint)0) {
 
-    {
+    if (isLinearized()) {
+      char buf[linearizationSearchSize+1];
+      int c, n, i;
+
+      str->setPos(0);
+      for (n = 0; n < linearizationSearchSize; ++n) {
+        if ((c = str->getChar()) == EOF) {
+          break;
+        }
+        buf[n] = c;
+      }
+      buf[n] = '\0';
+
+      // find end of first obj
+      startXRefPos = 0;
+      for (i = 0; i < n; i++) {
+        if (!strncmp("endobj", &buf[i], 6)) {
+           startXRefPos = i+6;
+           break;
+        }
+      }
+    } else {
       char buf[xrefSearchSize+1];
       char *p;
       int c, n, i;
-- 
1.6.4.2


From 25dcd380a03b911b300691502d1641425693c766 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Sun, 25 Apr 2010 17:34:49 +0200
Subject: [PATCH 12/12] Use linearization data to parse XRef entries

---
 poppler/PDFDoc.cc |   12 +++++++++++-
 poppler/PDFDoc.h  |    3 +++
 poppler/XRef.cc   |   45 +++++++++++++++++++++++++++++++++++++++++++--
 poppler/XRef.h    |    6 +++++-
 4 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index e590f3c..172f766 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -231,7 +231,7 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
   checkHeader();
 
   // read xref table
-  xref = new XRef(str, getStartXRef());
+  xref = new XRef(str, getStartXRef(), getMainXRefEntriesOffset());
   if (!xref->isOk()) {
     error(-1, "Couldn't read xref table");
     errCode = xref->getErrorCode();
@@ -1008,4 +1008,14 @@ Guint PDFDoc::getStartXRef()
   return startXRefPos;
 }
 
+Guint PDFDoc::getMainXRefEntriesOffset()
+{
+  Guint mainXRefEntriesOffset = 0;
+  // stays 0 unless isLinearized() reports a linearized document
+  if (isLinearized()) {
+    mainXRefEntriesOffset = getLinearization()->getMainXRefEntriesOffset();
+  }
+
+  return mainXRefEntriesOffset;
+}
 
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index d093b59..f6f8c8f 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -241,6 +241,9 @@ private:
   GBool checkEncryption(GooString *ownerPassword, GooString *userPassword);
   // Get the offset of the start xref table.
   Guint getStartXRef();
+  // Get the offset of the entries in the main XRef table of a
+  // linearized document (0 for non linearized documents).
+  Guint getMainXRefEntriesOffset();
   Guint strToUnsigned(char *s);
 
   GooString *fileName;
diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index b69bf9a..ab92cd8 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -230,16 +230,19 @@ void XRef::init() {
   streamEnds = NULL;
   streamEndsLen = 0;
   objStr = NULL;
+  mainXRefEntriesOffset = 0;
+  xRefStream = gFalse;
 }
 
 XRef::XRef() {
   init();
 }
 
-XRef::XRef(BaseStream *strA, Guint pos) {
+XRef::XRef(BaseStream *strA, Guint pos, Guint mainXRefEntriesOffsetA) {
   Object obj;
 
   init();
+  mainXRefEntriesOffset = mainXRefEntriesOffsetA;
 
   encrypted = gFalse;
   permFlags = defPermFlags;
@@ -396,6 +399,9 @@ GBool XRef::readXRef(Guint *pos) {
     if (!parser->getObj(&obj)->isStream()) {
       goto err1;
     }
+    if (trailerDict.isNone()) {
+      xRefStream = gTrue;
+    }
     more = readXRefStream(obj.getStream(), pos);
     obj.free();
 
@@ -1157,11 +1163,46 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
   }
 }
 
+// Parse the classic xref table entry ("<offset> <gen> n|f") found at file
+// position <offset> into <entry>; on a mismatch <entry> is left untouched.
+GBool XRef::parseEntry(Guint offset, XRefEntry *entry)
+{
+  GBool r;
+  Object obj;
+  obj.initNull();
+  // direct-initialize: avoids going through a temporary Parser copy
+  Parser parser(NULL, new Lexer(NULL,
+     str->makeSubStream(offset, gFalse, 20, &obj)), gTrue);
+  Object obj1, obj2, obj3;
+  if ((parser.getObj(&obj1)->isInt()) &&
+      (parser.getObj(&obj2)->isInt()) &&
+      (parser.getObj(&obj3)->isCmd("n") || obj3.isCmd("f"))) {
+    entry->offset = (Guint) obj1.getInt();
+    entry->gen = obj2.getInt();
+    entry->type = obj3.isCmd("n") ? xrefEntryUncompressed : xrefEntryFree;
+    entry->obj.initNull ();
+    entry->updated = false;
+    r = gTrue;
+  } else {
+    r = gFalse;
+  }
+  obj1.free();
+  obj2.free();
+  obj3.free();
+  return r;
+}
+
 XRefEntry *XRef::getEntry(int i)
 {
   if (entries[i].type == xrefEntryNone) {
 
-    while (readXRef(&prevXRefOffset) && (entries[i].type == xrefEntryNone)) ;
+    if ((!xRefStream) && mainXRefEntriesOffset) {
+      if (!parseEntry(mainXRefEntriesOffset + 20*i, &entries[i])) {
+        error(-1, "Failed to parse XRef entry [%d].", i);
+      }
+    } else {
+      while (readXRef(&prevXRefOffset) && (entries[i].type == xrefEntryNone)) ;
+    }
 
     if (entries[i].type == xrefEntryNone) {
        error(-1, "Invalid XRef entry");
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 7cd8ebe..a4548e6 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -63,7 +63,7 @@ public:
   // Constructor, create an empty XRef, used for PDF writing
   XRef();
   // Constructor.  Read xref table from stream.
-  XRef(BaseStream *strA, Guint pos);
+  XRef(BaseStream *strA, Guint pos, Guint mainXRefEntriesOffsetA = 0);
 
   // Destructor.
   ~XRef();
@@ -153,6 +153,8 @@ private:
   Guchar fileKey[16];		// file decryption key
   GBool ownerPasswordOk;	// true if owner password is correct
   Guint prevXRefOffset;		// position of prev XRef section (= next to read)
+  Guint mainXRefEntriesOffset;	// offset of entries in main XRef table
+  GBool xRefStream;		// true if last XRef section is a stream
 
   void init();
   int reserve(int newSize);
@@ -164,6 +166,8 @@ private:
   GBool readXRefStream(Stream *xrefStr, Guint *pos);
   GBool constructXRef();
   Guint strToUnsigned(char *s);
+  GBool parseEntry(Guint offset, XRefEntry *entry);
+
 };
 
 #endif
-- 
1.6.4.2
-------------- next part --------------
From 2159c5ac797fcedaeef2191d03acf12bd8351cb1 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 19:03:54 +0200
Subject: [PATCH 01/15] add PDFDoc::getPage()

---
 poppler/PDFDoc.cc |    8 ++++++++
 poppler/PDFDoc.h  |    3 +++
 2 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 172f766..b52b280 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -1019,3 +1019,11 @@ Guint PDFDoc::getMainXRefEntriesOffset()
   return mainXRefEntriesOffset;
 }
 
+Page *PDFDoc::getPage(int page)
+{
+  if ((page < 1) || page > getNumPages()) return NULL;
+  // page number is within 1..getNumPages(): let the catalog supply the page
+  {
+    return catalog->getPage(page);
+  }
+}
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index f6f8c8f..011e6e1 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -127,6 +127,9 @@ public:
   // Return the structure tree root object.
   Object *getStructTreeRoot() { return catalog->getStructTreeRoot(); }
 
+  // Get page.
+  Page *getPage(int page);
+
   // Display a page.
   void displayPage(OutputDev *out, int page,
 		   double hDPI, double vDPI, int rotate,
-- 
1.6.4.2


From 5de023c53f1c9581b2cb7ca322bb8f2045d8fc0d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 19:36:08 +0200
Subject: [PATCH 02/15] Use PDFDoc::getPage() in PDFDoc

---
 poppler/PDFDoc.cc |   28 +++++++++++++++++++++-------
 poppler/PDFDoc.h  |   10 +++++-----
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index b52b280..89dba6f 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -395,8 +395,11 @@ void PDFDoc::displayPage(OutputDev *out, int page,
   if (globalParams->getPrintCommands()) {
     printf("***** page %d *****\n", page);
   }
-  catalog->getPage(page)->display(out, hDPI, vDPI,
-				  rotate, useMediaBox, crop, printing, catalog,
+
+  Page *p = getPage(page);
+  if (!p) return;
+
+  p->display(out, hDPI, vDPI, rotate, useMediaBox, crop, printing, catalog,
 				  abortCheckCbk, abortCheckCbkData,
 				  annotDisplayDecideCbk, annotDisplayDecideCbkData);
 }
@@ -425,8 +428,11 @@ void PDFDoc::displayPageSlice(OutputDev *out, int page,
 			      void *abortCheckCbkData,
                               GBool (*annotDisplayDecideCbk)(Annot *annot, void *user_data),
                               void *annotDisplayDecideCbkData) {
-  catalog->getPage(page)->displaySlice(out, hDPI, vDPI,
-				       rotate, useMediaBox, crop,
+
+  Page *p = getPage(page);
+  if (!p) return;
+
+  p->displaySlice(out, hDPI, vDPI, rotate, useMediaBox, crop,
 				       sliceX, sliceY, sliceW, sliceH,
 				       printing, catalog,
 				       abortCheckCbk, abortCheckCbkData,
@@ -434,11 +440,19 @@ void PDFDoc::displayPageSlice(OutputDev *out, int page,
 }
 
 Links *PDFDoc::getLinks(int page) {
-  return catalog->getPage(page)->getLinks(catalog);
+  // Fall back to a Links object built from a null object when the page
+  // cannot be obtained, so callers still receive a valid pointer.
+  Page *target = getPage(page);
+  if (target) return target->getLinks(catalog);
+  Object nullObj;
+  nullObj.initNull();
+  return new Links (&nullObj, NULL);
 }
-  
+
 void PDFDoc::processLinks(OutputDev *out, int page) {
-  catalog->getPage(page)->processLinks(out, catalog);
+  // Nothing to do when getPage() yields no page.
+  if (Page *target = getPage(page))
+    target->processLinks(out, catalog);
 }
 
 Linearization *PDFDoc::getLinearization()
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 011e6e1..8de139f 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -107,15 +107,15 @@ public:
 
   // Get page parameters.
   double getPageMediaWidth(int page)
-    { return catalog->getPage(page)->getMediaWidth(); }
+    { return getPage(page) ? getPage(page)->getMediaWidth() : 0.0 ; }
   double getPageMediaHeight(int page)
-    { return catalog->getPage(page)->getMediaHeight(); }
+    { return getPage(page) ? getPage(page)->getMediaHeight() : 0.0 ; }
   double getPageCropWidth(int page)
-    { return catalog->getPage(page)->getCropWidth(); }
+    { return getPage(page) ? getPage(page)->getCropWidth() : 0.0 ; }
   double getPageCropHeight(int page)
-    { return catalog->getPage(page)->getCropHeight(); }
+    { return getPage(page) ? getPage(page)->getCropHeight() : 0.0 ; }
   int getPageRotate(int page)
-    { return catalog->getPage(page)->getRotate(); }
+    { return getPage(page) ? getPage(page)->getRotate() : 0 ; }
 
   // Get number of pages.
   int getNumPages() { return catalog->getNumPages(); }
-- 
1.6.4.2


From 739a4430d509999901c6834a358e0fcf88e7a826 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 20:48:30 +0200
Subject: [PATCH 03/15] Use PDFDoc::getPage() in FontInfo

---
 poppler/FontInfo.cc |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/poppler/FontInfo.cc b/poppler/FontInfo.cc
index 0037e07..c348d14 100644
--- a/poppler/FontInfo.cc
+++ b/poppler/FontInfo.cc
@@ -70,7 +70,9 @@ GooList *FontInfoScanner::scan(int nPages) {
   }
 
   for (int pg = currentPage; pg < lastPage; ++pg) {
-    page = doc->getCatalog()->getPage(pg);
+    page = doc->getPage(pg);
+    if (!page) continue;
+
     if ((resDict = page->getResourceDict())) {
       scanFonts(resDict, result);
     }
-- 
1.6.4.2


From e287781c5490ae23dd5506a33f814c1d33f66a8b Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 11:11:11 +0200
Subject: [PATCH 04/15] Use PDFDoc::getPage() in pdfinfo

---
 utils/pdfinfo.cc |   22 +++++++++++++++-------
 1 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 2abe8b4..a94e4e8 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -257,7 +257,11 @@ int main(int argc, char *argv[]) {
   if (printBoxes) {
     if (multiPage) {
       for (pg = firstPage; pg <= lastPage; ++pg) {
-	page = doc->getCatalog()->getPage(pg);
+	page = doc->getPage(pg);
+	if (!page) {
+          error(-1, "Failed to print boxes for page %d", pg);
+	  continue;
+	}
 	sprintf(buf, "Page %4d MediaBox: ", pg);
 	printBox(buf, page->getMediaBox());
 	sprintf(buf, "Page %4d CropBox:  ", pg);
@@ -270,12 +274,16 @@ int main(int argc, char *argv[]) {
 	printBox(buf, page->getArtBox());
       }
     } else {
-      page = doc->getCatalog()->getPage(firstPage);
-      printBox("MediaBox:       ", page->getMediaBox());
-      printBox("CropBox:        ", page->getCropBox());
-      printBox("BleedBox:       ", page->getBleedBox());
-      printBox("TrimBox:        ", page->getTrimBox());
-      printBox("ArtBox:         ", page->getArtBox());
+      page = doc->getPage(firstPage);
+      if (!page) {
+        error(-1, "Failed to print boxes for page %d", firstPage);
+      } else {
+        printBox("MediaBox:       ", page->getMediaBox());
+        printBox("CropBox:        ", page->getCropBox());
+        printBox("BleedBox:       ", page->getBleedBox());
+        printBox("TrimBox:        ", page->getTrimBox());
+        printBox("ArtBox:         ", page->getArtBox());
+      }
     }
   }
 
-- 
1.6.4.2


From 8766d0369705e84f835eb80c795c3ac00d3111ff Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 11:19:53 +0200
Subject: [PATCH 05/15] Use PDFDoc::getPage() in pdffonts

---
 utils/pdffonts.cc |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/utils/pdffonts.cc b/utils/pdffonts.cc
index 81b20e4..30e25dc 100644
--- a/utils/pdffonts.cc
+++ b/utils/pdffonts.cc
@@ -166,7 +166,11 @@ int main(int argc, char *argv[]) {
   fonts = NULL;
   fontsLen = fontsSize = 0;
   for (pg = firstPage; pg <= lastPage; ++pg) {
-    page = doc->getCatalog()->getPage(pg);
+    page = doc->getPage(pg);
+    if (!page) {
+      error(-1, "Failed to read fonts from page %d", pg);
+      continue;
+    }
     if ((resDict = page->getResourceDict())) {
       scanFonts(resDict, doc);
     }
-- 
1.6.4.2


From 8ba03542b2f29feb184e17a88bedb211eaf23adb Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 15:52:20 +0200
Subject: [PATCH 06/15] Use PDFDoc::getPage() in glib

---
 glib/poppler-action.cc   |    4 ++--
 glib/poppler-document.cc |   17 ++++++++++-------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/glib/poppler-action.cc b/glib/poppler-action.cc
index ffc1842..a0f8576 100644
--- a/glib/poppler-action.cc
+++ b/glib/poppler-action.cc
@@ -422,13 +422,13 @@ find_annot_movie_for_action (PopplerDocument *document,
 
     xref->fetch (ref->num, ref->gen, &annotObj);
   } else if (link->hasAnnotTitle ()) {
-    Catalog *catalog = document->doc->getCatalog ();
     Object annots;
     GooString *title = link->getAnnotTitle ();
     int i;
 
     for (i = 1; i <= document->doc->getNumPages (); ++i) {
-      Page *p = catalog->getPage (i);
+      Page *p = document->doc->getPage (i);
+      if (!p) continue;
 
       if (p->getAnnots (&annots)->isArray ()) {
         int j;
diff --git a/glib/poppler-document.cc b/glib/poppler-document.cc
index cd6794a..ccb0f1c 100644
--- a/glib/poppler-document.cc
+++ b/glib/poppler-document.cc
@@ -379,15 +379,14 @@ PopplerPage *
 poppler_document_get_page (PopplerDocument  *document,
 			   int               index)
 {
-  Catalog *catalog;
   Page *page;
 
   g_return_val_if_fail (0 <= index &&
 			index < poppler_document_get_n_pages (document),
 			NULL);
 
-  catalog = document->doc->getCatalog();
-  page = catalog->getPage (index + 1);
+  page = document->doc->getPage (index + 1);
+  if (!page) return NULL;
 
   return _poppler_page_new (document, page, index);
 }
@@ -1909,18 +1908,22 @@ PopplerFormField *
 poppler_document_get_form_field (PopplerDocument *document,
 				 gint             id)
 {
-  Catalog *catalog = document->doc->getCatalog();
+  Page *page;
   unsigned pageNum;
   unsigned fieldNum;
   FormPageWidgets *widgets;
   FormWidget *field;
 
   FormWidget::decodeID (id, &pageNum, &fieldNum);
-  
-  widgets = catalog->getPage (pageNum)->getPageWidgets ();
+
+  page = document->doc->getPage (pageNum);
+  if (!page)
+    return NULL;
+
+  widgets = page->getPageWidgets ();
   if (!widgets)
     return NULL;
-  
+
   field = widgets->getWidget (fieldNum);
   if (field)
     return _poppler_form_field_new (document, field);
-- 
1.6.4.2


From 12c8bf93e3e8e38f357f3165e5a4517056a9af0b Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 17:59:01 +0200
Subject: [PATCH 07/15] Use PDFDoc::getPage() in qt4

Note API change: With this patch, Document::page(int index) can now return NULL
when poppler fails to create a page. Any application using these bindings
should check the return value.
---
 qt4/src/poppler-document.cc |    8 +++++++-
 qt4/src/poppler-link.cc     |    6 ++++--
 qt4/src/poppler-page.cc     |    3 ++-
 qt4/src/poppler-qt4.h       |    3 +++
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/qt4/src/poppler-document.cc b/qt4/src/poppler-document.cc
index 41d35b6..dc0ce97 100644
--- a/qt4/src/poppler-document.cc
+++ b/qt4/src/poppler-document.cc
@@ -98,7 +98,13 @@ namespace Poppler {
 
     Page *Document::page(int index) const
     {
-	return new Page(m_doc, index);
+	// Hand out the wrapper only if it reports itself usable via isOk().
+	Page *candidate = new Page(m_doc, index);
+	if (candidate->isOk())
+	  return candidate;
+
+	delete candidate;
+	return NULL;
     }
 
     bool Document::isLocked() const
diff --git a/qt4/src/poppler-link.cc b/qt4/src/poppler-link.cc
index de06242..4f54201 100644
--- a/qt4/src/poppler-link.cc
+++ b/qt4/src/poppler-link.cc
@@ -232,9 +232,11 @@ class LinkMoviePrivate : public LinkPrivate
 		
 		int leftAux = 0, topAux = 0, rightAux = 0, bottomAux = 0;
 		
-		if (d->pageNum > 0 && d->pageNum <= data.doc->doc->getNumPages())
+		::Page *page;
+		if (d->pageNum > 0 &&
+		    d->pageNum <= data.doc->doc->getNumPages() &&
+		    (page = data.doc->doc->getPage( d->pageNum )))
 		{
-			::Page *page = data.doc->doc->getCatalog()->getPage( d->pageNum );
 			cvtUserToDev( page, left, top, &leftAux, &topAux );
 			cvtUserToDev( page, right, bottom, &rightAux, &bottomAux );
 			
diff --git a/qt4/src/poppler-page.cc b/qt4/src/poppler-page.cc
index 6dbf50f..335f2ce 100644
--- a/qt4/src/poppler-page.cc
+++ b/qt4/src/poppler-page.cc
@@ -186,8 +186,9 @@ Page::Page(DocumentData *doc, int index) {
   m_page = new PageData();
   m_page->index = index;
   m_page->parentDoc = doc;
-  m_page->page = doc->doc->getCatalog()->getPage(m_page->index + 1);
+  m_page->page = doc->doc->getPage(m_page->index + 1);
   m_page->transition = 0;
+  ok = m_page->page ? true : false;
 }
 
 Page::~Page()
diff --git a/qt4/src/poppler-qt4.h b/qt4/src/poppler-qt4.h
index 117dc43..2e77f48 100644
--- a/qt4/src/poppler-qt4.h
+++ b/qt4/src/poppler-qt4.h
@@ -587,11 +587,14 @@ delete it;
 	**/
 	QString label() const;
 	
+	bool isOk() { return ok; };
+
     private:
 	Q_DISABLE_COPY(Page)
 
 	Page(DocumentData *doc, int index);
 	PageData *m_page;
+        bool ok;
     };
 
 /**
-- 
1.6.4.2


From b9e850e99b987d15c569ad8a7ee2bb76aae413e3 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Fri, 23 Apr 2010 09:21:23 +0200
Subject: [PATCH 08/15] Use PDFDoc::getPage() in qt

Note API change: With this patch, Document::getPage(int index) can now
return NULL when poppler fails to create a page. Any application using
these bindings should check the return value.
---
 qt/poppler-document.cc |   11 +++++++++++
 qt/poppler-page.cc     |   11 +++++++----
 qt/poppler-qt.h        |    6 +++++-
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/qt/poppler-document.cc b/qt/poppler-document.cc
index bade1d1..1a5892b 100644
--- a/qt/poppler-document.cc
+++ b/qt/poppler-document.cc
@@ -113,6 +113,17 @@ int Document::getNumPages() const
   return data->doc.getNumPages();
 }
 
+Page *Document::getPage(int index) const
+{
+  // Build the wrapper first; its isOk() tells whether the underlying
+  // poppler page could be obtained.
+  Page *candidate = new Page(this, index);
+  if (candidate->isOk())
+    return candidate;
+  delete candidate;
+  return NULL;
+}
+
 QValueList<FontInfo> Document::fonts() const
 {
   QValueList<FontInfo> ourList;
diff --git a/qt/poppler-page.cc b/qt/poppler-page.cc
index a42aa15..ef077a7 100644
--- a/qt/poppler-page.cc
+++ b/qt/poppler-page.cc
@@ -47,6 +47,7 @@ class PageData {
   const Document *doc;
   int index;
   PageTransition *transition;
+  ::Page *page;
 };
 
 Page::Page(const Document *doc, int index) {
@@ -54,6 +55,8 @@ Page::Page(const Document *doc, int index) {
   data->index = index;
   data->doc = doc;
   data->transition = 0;
+  data->page = doc->data->doc.getPage(data->index + 1);
+  ok = data->page ? true : false;
 }
 
 Page::~Page()
@@ -132,7 +135,7 @@ QString Page::getText(const Rectangle &r) const
   output_dev = new TextOutputDev(0, gFalse, gFalse, gFalse);
   data->doc->data->doc.displayPageSlice(output_dev, data->index + 1, 72, 72,
       0, false, false, false, -1, -1, -1, -1);
-  p = data->doc->data->doc.getCatalog()->getPage(data->index + 1);
+  p = data->page;
   if (r.isNull())
   {
     rect = p->getCropBox();
@@ -197,7 +200,7 @@ PageTransition *Page::getTransition() const
   {
     Object o;
     PageTransitionParams params;
-    params.dictObj = data->doc->data->doc.getCatalog()->getPage(data->index + 1)->getTrans(&o);
+    params.dictObj = data->page->getTrans(&o);
     data->transition = new PageTransition(params);
     o.free();
   }
@@ -208,7 +211,7 @@ QSize Page::pageSize() const
 {
   ::Page *p;
 
-  p = data->doc->data->doc.getCatalog()->getPage(data->index + 1);
+  p = data->page;
   if ( ( Page::Landscape == orientation() ) || (Page::Seascape == orientation() ) ) {
     return QSize( (int)p->getCropHeight(), (int)p->getCropWidth() );
   } else {
@@ -218,7 +221,7 @@ QSize Page::pageSize() const
 
 Page::Orientation Page::orientation() const
 {
-  ::Page *p = data->doc->data->doc.getCatalog()->getPage(data->index + 1);
+  ::Page *p = data->page;
 
   int rotation = p->getRotate();
   switch (rotation) {
diff --git a/qt/poppler-qt.h b/qt/poppler-qt.h
index a6b1e6e..549ffd2 100644
--- a/qt/poppler-qt.h
+++ b/qt/poppler-qt.h
@@ -31,6 +31,7 @@
 #include <qdom.h>
 #include <qpixmap.h>
 
+
 namespace Poppler {
 
 class Document;
@@ -198,9 +199,12 @@ class Page {
     */
     QValueList<Link*> links() const;
 
+    bool isOk() { return ok; };
+
   private:
     Page(const Document *doc, int index);
     PageData *data;
+    bool ok;
 };
 
 class DocumentData;
@@ -219,7 +223,7 @@ public:
   
   static Document *load(const QString & filePath);
   
-  Page *getPage(int index) const{ return new Page(this, index); }
+  Page *getPage(int index) const;
   
   int getNumPages() const;
   
-- 
1.6.4.2


From 0cf7f1d38625e278938880bc5cd61841533a97d1 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Fri, 23 Apr 2010 12:07:39 +0200
Subject: [PATCH 09/15] Use PDFDoc::getPage() in PSOutputDev

---
 glib/poppler-page.cc            |    1 +
 poppler/PSOutputDev.cc          |   37 ++++++++++++++++++++++---------------
 poppler/PSOutputDev.h           |   13 ++++++++-----
 qt/poppler-document.cc          |    2 +-
 qt4/src/poppler-ps-converter.cc |    1 +
 utils/pdftohtml.cc              |    2 +-
 utils/pdftops.cc                |    2 +-
 7 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 39645bd..106b636 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1161,6 +1161,7 @@ poppler_page_render_to_ps (PopplerPage   *page,
 
   if (!ps_file->out)
     ps_file->out = new PSOutputDev (ps_file->filename,
+                                    ps_file->document->doc,
                                     ps_file->document->doc->getXRef(),
                                     ps_file->document->doc->getCatalog(),
                                     NULL,
diff --git a/poppler/PSOutputDev.cc b/poppler/PSOutputDev.cc
index 179a494..5e5d3d0 100644
--- a/poppler/PSOutputDev.cc
+++ b/poppler/PSOutputDev.cc
@@ -70,6 +70,7 @@
 #  include "SplashOutputDev.h"
 #endif
 #include "PSOutputDev.h"
+#include "PDFDoc.h"
 
 #ifdef MACOS
 // needed for setting type/creator of MacOS files
@@ -972,7 +973,7 @@ static void outputToFile(void *stream, char *data, int len) {
   fwrite(data, 1, len, (FILE *)stream);
 }
 
-PSOutputDev::PSOutputDev(const char *fileName, XRef *xrefA, Catalog *catalog,
+PSOutputDev::PSOutputDev(const char *fileName, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 			 char *psTitle,
 			 int firstPage, int lastPage, PSOutMode modeA,
 			 int paperWidthA, int paperHeightA, GBool duplexA,
@@ -1033,13 +1034,14 @@ PSOutputDev::PSOutputDev(const char *fileName, XRef *xrefA, Catalog *catalog,
   }
 
   init(outputToFile, f, fileTypeA, psTitle,
-       xrefA, catalog, firstPage, lastPage, modeA,
+       doc, xrefA, catalog, firstPage, lastPage, modeA,
        imgLLXA, imgLLYA, imgURXA, imgURYA, manualCtrlA,
        paperWidthA, paperHeightA, duplexA);
 }
 
 PSOutputDev::PSOutputDev(PSOutputFunc outputFuncA, void *outputStreamA,
 			 char *psTitle,
+			 PDFDoc *doc,
 			 XRef *xrefA, Catalog *catalog,
 			 int firstPage, int lastPage, PSOutMode modeA,
 			 int paperWidthA, int paperHeightA, GBool duplexA,
@@ -1068,18 +1070,17 @@ PSOutputDev::PSOutputDev(PSOutputFunc outputFuncA, void *outputStreamA,
   forceRasterize = forceRasterizeA;
 
   init(outputFuncA, outputStreamA, psGeneric, psTitle,
-       xrefA, catalog, firstPage, lastPage, modeA,
+       doc, xrefA, catalog, firstPage, lastPage, modeA,
        imgLLXA, imgLLYA, imgURXA, imgURYA, manualCtrlA,
        paperWidthA, paperHeightA, duplexA);
 }
 
 void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
-		       PSFileType fileTypeA, char *pstitle, XRef *xrefA, Catalog *catalog,
+		       PSFileType fileTypeA, char *pstitle, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 		       int firstPage, int lastPage, PSOutMode modeA,
 		       int imgLLXA, int imgLLYA, int imgURXA, int imgURYA,
 		       GBool manualCtrlA, int paperWidthA, int paperHeightA,
 		       GBool duplexA) {
-  Page *page;
   PDFRectangle *box;
 
   // initialize
@@ -1099,12 +1100,12 @@ void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
   imgURX = imgURXA;
   imgURY = imgURYA;
   if (paperWidth < 0 || paperHeight < 0) {
-    // this check is needed in case the document has zero pages
-    if (firstPage > 0 && firstPage <= catalog->getNumPages()) {
-      page = catalog->getPage(firstPage);
+    Page *page;
+    if ((page = doc->getPage(firstPage))) {
       paperWidth = (int)ceil(page->getMediaWidth());
       paperHeight = (int)ceil(page->getMediaHeight());
     } else {
+      error(-1, "Invalid page %d", firstPage);
       paperWidth = 1;
       paperHeight = 1;
     }
@@ -1170,14 +1171,16 @@ void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
   embFontList = new GooString();
 
   if (!manualCtrl) {
+    Page *page;
     // this check is needed in case the document has zero pages
-    if (firstPage > 0 && firstPage <= catalog->getNumPages()) {
+    if ((page = doc->getPage(firstPage))) {
       writeHeader(firstPage, lastPage,
-		  catalog->getPage(firstPage)->getMediaBox(),
-		  catalog->getPage(firstPage)->getCropBox(),
-		  catalog->getPage(firstPage)->getRotate(),
+		  page->getMediaBox(),
+		  page->getCropBox(),
+		  page->getRotate(),
 		  pstitle);
     } else {
+      error(-1, "Invalid page %d", firstPage);
       box = new PDFRectangle(0, 0, 1, 1);
       writeHeader(firstPage, lastPage, box, box, 0, pstitle);
       delete box;
@@ -1190,7 +1193,7 @@ void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
       writePS("%%EndProlog\n");
       writePS("%%BeginSetup\n");
     }
-    writeDocSetup(catalog, firstPage, lastPage, duplexA);
+    writeDocSetup(doc, catalog, firstPage, lastPage, duplexA);
     if (mode != psModeForm) {
       writePS("%%EndSetup\n");
     }
@@ -1400,7 +1403,7 @@ void PSOutputDev::writeXpdfProcset() {
   }
 }
 
-void PSOutputDev::writeDocSetup(Catalog *catalog,
+void PSOutputDev::writeDocSetup(PDFDoc *doc, Catalog *catalog,
 				int firstPage, int lastPage,
                                 GBool duplexA) {
   Page *page;
@@ -1416,7 +1419,11 @@ void PSOutputDev::writeDocSetup(Catalog *catalog,
     writePS("xpdf begin\n");
   }
   for (pg = firstPage; pg <= lastPage; ++pg) {
-    page = catalog->getPage(pg);
+    page = doc->getPage(pg);
+    if (!page) {
+      error(-1, "Failed writing resources for page %d", pg);
+      continue;
+    }
     if ((resDict = page->getResourceDict())) {
       setupResources(resDict);
     }
diff --git a/poppler/PSOutputDev.h b/poppler/PSOutputDev.h
index 38c838c..a84a638 100644
--- a/poppler/PSOutputDev.h
+++ b/poppler/PSOutputDev.h
@@ -50,6 +50,7 @@ struct PSFont8Info;
 struct PSFont16Enc;
 class PSOutCustomColor;
 class Function;
+class PDFDoc;
 
 //------------------------------------------------------------------------
 // PSOutputDev
@@ -75,7 +76,7 @@ class PSOutputDev: public OutputDev {
 public:
 
   // Open a PostScript output file, and write the prolog.
-  PSOutputDev(const char *fileName, XRef *xrefA, Catalog *catalog,
+  PSOutputDev(const char *fileName, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 	      char *psTitle,
 	      int firstPage, int lastPage, PSOutMode modeA,
 	      int paperWidthA = -1, int paperHeightA = -1,
@@ -88,6 +89,7 @@ public:
   // Open a PSOutputDev that will write to a generic stream.
   PSOutputDev(PSOutputFunc outputFuncA, void *outputStreamA,
 	      char *psTitle,
+	      PDFDoc *doc,
 	      XRef *xrefA, Catalog *catalog,
 	      int firstPage, int lastPage, PSOutMode modeA,
 	      int paperWidthA = -1, int paperHeightA = -1,
@@ -145,9 +147,6 @@ public:
   // Write the Xpdf procset.
   void writeXpdfProcset();
 
-  // Write the document-level setup.
-  void writeDocSetup(Catalog *catalog, int firstPage, int lastPage, GBool duplexA);
-
   // Write the trailer for the current page.
   void writePageTrailer();
 
@@ -287,7 +286,7 @@ public:
 private:
 
   void init(PSOutputFunc outputFuncA, void *outputStreamA,
-	    PSFileType fileTypeA, char *pstitle, XRef *xrefA, Catalog *catalog,
+	    PSFileType fileTypeA, char *pstitle, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 	    int firstPage, int lastPage, PSOutMode modeA,
 	    int imgLLXA, int imgLLYA, int imgURXA, int imgURYA,
 	    GBool manualCtrlA, int paperWidthA, int paperHeightA,
@@ -341,6 +340,10 @@ private:
 		    double *x1, double *y1);
 #endif
   void cvtFunction(Function *func);
+
+  // Write the document-level setup.
+  void writeDocSetup(PDFDoc *doc, Catalog *catalog, int firstPage, int lastPage, GBool duplexA);
+
   void writePSChar(char c);
   void writePS(char *s);
   void writePSFmt(const char *fmt, ...);
diff --git a/qt/poppler-document.cc b/qt/poppler-document.cc
index 1a5892b..03d01fa 100644
--- a/qt/poppler-document.cc
+++ b/qt/poppler-document.cc
@@ -325,7 +325,7 @@ bool Document::print(const QString &fileName, QValueList<int> pageList, double h
 
 bool Document::print(const QString &file, QValueList<int> pageList, double hDPI, double vDPI, int rotate, int paperWidth, int paperHeight)
 {
-  PSOutputDev *psOut = new PSOutputDev(file.latin1(), data->doc.getXRef(), data->doc.getCatalog(), NULL, 1, data->doc.getNumPages(), psModePS, paperWidth, paperHeight);
+  PSOutputDev *psOut = new PSOutputDev(file.latin1(), &(data->doc), data->doc.getXRef(), data->doc.getCatalog(), NULL, 1, data->doc.getNumPages(), psModePS, paperWidth, paperHeight);
   
   if (psOut->isOk()) {
     QValueList<int>::iterator it;
diff --git a/qt4/src/poppler-ps-converter.cc b/qt4/src/poppler-ps-converter.cc
index 7a1957b..9dc82ec 100644
--- a/qt4/src/poppler-ps-converter.cc
+++ b/qt4/src/poppler-ps-converter.cc
@@ -195,6 +195,7 @@ bool PSConverter::convert()
 	
 	PSOutputDev *psOut = new PSOutputDev(outputToQIODevice, dev,
 	                                     pstitlechar,
+	                                     d->document->doc,
 	                                     d->document->doc->getXRef(),
 	                                     d->document->doc->getCatalog(),
 	                                     1,
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index 3c74c6e..0558e5c 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -350,7 +350,7 @@ int main(int argc, char *argv[]) {
     psFileName = new GooString(htmlFileName->getCString());
     psFileName->append(".ps");
 
-    psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
+    psOut = new PSOutputDev(psFileName->getCString(), doc, doc->getXRef(),
 			    doc->getCatalog(), NULL, firstPage, lastPage, psModePS, w, h);
     psOut->setDisplayText(gFalse);
     doc->displayPages(psOut, firstPage, lastPage, 72, 72, 0,
diff --git a/utils/pdftops.cc b/utils/pdftops.cc
index 0bc43a1..8231458 100644
--- a/utils/pdftops.cc
+++ b/utils/pdftops.cc
@@ -359,7 +359,7 @@ int main(int argc, char *argv[]) {
   }
 
   // write PostScript file
-  psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
+  psOut = new PSOutputDev(psFileName->getCString(), doc, doc->getXRef(),
 			  doc->getCatalog(), NULL, firstPage, lastPage, mode,
 			  paperWidth,
 			  paperHeight,
-- 
1.6.4.2


From 5fb2cd2ba2da29ec6b062d43701481c25f4402dc Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Sat, 24 Apr 2010 10:17:56 +0200
Subject: [PATCH 10/15] Use PDFDoc::getPage() in HtmlOutputDev

---
 utils/HtmlOutputDev.cc |    2 +-
 utils/HtmlOutputDev.h  |    2 ++
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 81f8b88..4f7dff6 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -1093,7 +1093,7 @@ void HtmlOutputDev::startPage(int pageNum, GfxState *state) {
 
 
 void HtmlOutputDev::endPage() {
-  Links *linksList = catalog->getPage(pageNum)->getLinks(catalog);
+  Links *linksList = docPage->getLinks(catalog);
   for (int i = 0; i < linksList->getNumLinks(); ++i)
   {
       doProcessLink(linksList->getLink(i));
diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h
index 24ccfd1..48b04c6 100644
--- a/utils/HtmlOutputDev.h
+++ b/utils/HtmlOutputDev.h
@@ -256,6 +256,7 @@ public:
                                GBool (* abortCheckCbk)(void *data) = NULL,
                                void * abortCheckCbkData = NULL)
   {
+   docPage = page;
    catalog = catalogA;
    return gTrue;
   }
@@ -323,6 +324,7 @@ private:
   GooString *docTitle;
   GooList *glMetaVars;
   Catalog *catalog;
+  Page *docPage;
   friend class HtmlPage;
 };
 
-- 
1.6.4.2


From ecbc8526bf49c512578f74940c2be1c1d221a085 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 31 Mar 2010 14:39:57 +0200
Subject: [PATCH 11/15] Parse page tree on demand

---
 poppler/Catalog.cc |  266 ++++++++++++++++++++++++++++++++++-----------------
 poppler/Catalog.h  |   12 ++-
 2 files changed, 185 insertions(+), 93 deletions(-)

diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc
index 900cdd7..416fb66 100644
--- a/poppler/Catalog.cc
+++ b/poppler/Catalog.cc
@@ -59,9 +59,6 @@ Catalog::Catalog(XRef *xrefA) {
   Object catDict, pagesDict, pagesDictRef;
   Object obj, obj2;
   Object optContentProps;
-  char *alreadyRead;
-  int numPages0;
-  int i;
 
   ok = gTrue;
   xref = xrefA;
@@ -78,6 +75,12 @@ Catalog::Catalog(XRef *xrefA) {
   embeddedFileNameTree = NULL;
   jsNameTree = NULL;
 
+  pagesList = NULL;
+  pagesRefList = NULL;
+  attrsList = NULL;
+  kidsIdxList = NULL;
+  lastCachedPage = 0;
+
   xref->getCatalog(&catDict);
   if (!catDict.isDict()) {
     error(-1, "Catalog object is wrong type (%s)", catDict.getTypeName());
@@ -100,31 +103,11 @@ Catalog::Catalog(XRef *xrefA) {
   if (!obj.isNum()) {
     error(-1, "Page count in top-level pages object is wrong type (%s)",
 	  obj.getTypeName());
-    pagesSize = numPages0 = 0;
+    numPages = 0;
   } else {
-    pagesSize = numPages0 = (int)obj.getNum();
+    numPages = (int)obj.getNum();
   }
   obj.free();
-  pages = (Page **)gmallocn(pagesSize, sizeof(Page *));
-  pageRefs = (Ref *)gmallocn(pagesSize, sizeof(Ref));
-  for (i = 0; i < pagesSize; ++i) {
-    pages[i] = NULL;
-    pageRefs[i].num = -1;
-    pageRefs[i].gen = -1;
-  }
-  alreadyRead = (char *)gmalloc(xref->getNumObjects());
-  memset(alreadyRead, 0, xref->getNumObjects());
-  if (catDict.dictLookupNF("Pages", &pagesDictRef)->isRef() &&
-      pagesDictRef.getRefNum() >= 0 &&
-      pagesDictRef.getRefNum() < xref->getNumObjects()) {
-    alreadyRead[pagesDictRef.getRefNum()] = 1;
-  }
-  pagesDictRef.free();
-  numPages = readPageTree(pagesDict.getDict(), NULL, 0, alreadyRead);
-  gfree(alreadyRead);
-  if (numPages != numPages0) {
-    error(-1, "Page count in top-level pages object is incorrect");
-  }
   pagesDict.free();
 
   // read base URI
@@ -163,6 +146,10 @@ Catalog::Catalog(XRef *xrefA) {
 Catalog::~Catalog() {
   int i;
 
+  delete kidsIdxList;
+  delete attrsList;
+  delete pagesRefList;
+  delete pagesList;
   if (pages) {
     for (i = 0; i < pagesSize; ++i) {
       if (pages[i]) {
@@ -225,91 +212,192 @@ GooString *Catalog::readMetadata() {
   return s;
 }
 
-int Catalog::readPageTree(Dict *pagesDict, PageAttrs *attrs, int start,
-			  char *alreadyRead) {
-  Object kids;
-  Object kid;
-  Object kidRef;
-  PageAttrs *attrs1, *attrs2;
-  Page *page;
-  int i, j;
-
-  attrs1 = new PageAttrs(attrs, pagesDict);
-  pagesDict->lookup("Kids", &kids);
-  if (!kids.isArray()) {
-    error(-1, "Kids object (page %d) is wrong type (%s)",
-	  start+1, kids.getTypeName());
-    return start;
-  }
-  for (i = 0; i < kids.arrayGetLength(); ++i) {
-    kids.arrayGetNF(i, &kidRef);
-    if (kidRef.isRef() &&
-	kidRef.getRefNum() >= 0 &&
-	kidRef.getRefNum() < xref->getNumObjects()) {
-      if (alreadyRead[kidRef.getRefNum()]) {
-	error(-1, "Loop in Pages tree");
-	kidRef.free();
-	continue;
+Page *Catalog::getPage(int i)
+{
+  // Page numbers are 1-based; pages beyond the cached prefix are read
+  // from the page tree on demand. NULL means the page is unavailable.
+  const GBool needsCaching = i > lastCachedPage;
+  if (i < 1 || (needsCaching && !cachePageTree(i)))
+    return NULL;
+  return pages[i-1];
+}
+
+Ref *Catalog::getPageRef(int i)
+{
+  // Returns the object reference of 1-based page i, caching the page
+  // tree up to i first if needed; NULL when i < 1 or caching fails.
+  const GBool needsCaching = i > lastCachedPage;
+  if (i < 1 || (needsCaching && !cachePageTree(i)))
+    return NULL;
+  return &pageRefs[i-1];
+}
+
+GBool Catalog::cachePageTree(int page)
+{
+  // Resumes the page tree walk until <page> pages are cached; gFalse on failure.
+  Dict *pagesDict;
+  if (pagesList == NULL) {
+    Object catDict;
+    Ref pagesRef;
+
+    xref->getCatalog(&catDict);
+    Object pagesDictRef;
+    if (catDict.dictLookupNF("Pages", &pagesDictRef)->isRef() &&
+        pagesDictRef.getRefNum() >= 0 &&
+        pagesDictRef.getRefNum() < xref->getNumObjects()) {
+      pagesRef = pagesDictRef.getRef();
+      pagesDictRef.free();
+    } else {
+       error(-1, "Catalog dictionary does not contain a valid \"Pages\" entry");
+       pagesDictRef.free();
+       catDict.free();
+       return gFalse;
+    }
+
+    Object obj;
+    catDict.dictLookup("Pages", &obj);
+    catDict.free(); // the catalog dictionary is not needed past this lookup
+    // This should really be isDict("Pages"), but I've seen at least one
+    // PDF file where the /Type entry is missing.
+    if (obj.isDict()) {
+      obj.getDict()->incRef();
+      pagesDict = obj.getDict();
+      obj.free();
+    }
+    else {
+      error(-1, "Top-level pages object is wrong type (%s)", obj.getTypeName());
+      obj.free();
+      return gFalse;
+    }
+
+    pagesSize = numPages;
+    pages = (Page **)gmallocn(pagesSize, sizeof(Page *));
+    pageRefs = (Ref *)gmallocn(pagesSize, sizeof(Ref));
+    for (int i = 0; i < pagesSize; ++i) {
+      pages[i] = NULL;
+      pageRefs[i].num = -1;
+      pageRefs[i].gen = -1;
+    }
+
+    pagesList = new GooVector<Dict *>();
+    pagesList->push_back(pagesDict);
+    pagesRefList = new GooVector<Ref>();
+    pagesRefList->push_back(pagesRef);
+    attrsList = new GooVector<PageAttrs *>();
+    attrsList->push_back(new PageAttrs(NULL, pagesDict));
+    kidsIdxList = new GooVector<int>();
+    kidsIdxList->push_back(0);
+    lastCachedPage = 0;
+
+  }
+
+  while(1) {
+
+    if (page <= lastCachedPage) return gTrue;
+
+    if (pagesList->empty()) return gFalse;
+
+    pagesDict = pagesList->back();
+    Object kids;
+    pagesDict->lookup("Kids", &kids);
+    if (!kids.isArray()) {
+      error(-1, "Kids object (page %d) is wrong type (%s)",
+            lastCachedPage+1, kids.getTypeName());
+      kids.free();
+      return gFalse;
+    }
+
+    int kidsIdx = kidsIdxList->back();
+    if (kidsIdx >= kids.arrayGetLength()) {
+       delete pagesList->back();
+       pagesList->pop_back();
+       pagesRefList->pop_back();
+       delete attrsList->back();
+       attrsList->pop_back();
+       kidsIdxList->pop_back();
+       if (!kidsIdxList->empty()) kidsIdxList->back()++;
+       kids.free();
+       continue;
+    }
+
+    Object kidRef;
+    kids.arrayGetNF(kidsIdx, &kidRef);
+    if (!kidRef.isRef()) {
+      error(-1, "Kid object (page %d) is not an indirect reference (%s)",
+            lastCachedPage+1, kidRef.getTypeName());
+      kidRef.free();
+      kids.free();
+      return gFalse;
+    }
+
+    for (size_t i = 0; i < pagesRefList->size(); i++) {
+      if (((*pagesRefList)[i]).num == kidRef.getRefNum()) {
+         error(-1, "Loop in Pages tree");
+         kidRef.free();
+         kids.free();
+         kidsIdxList->back()++;
+         return gFalse; // 'continue' here would only advance this for loop and reuse the freed objects
       }
-      alreadyRead[kidRef.getRefNum()] = 1;
     }
-    kids.arrayGet(i, &kid);
+
+    Object kid;
+    kids.arrayGet(kidsIdx, &kid);
+    kids.free();
     if (kid.isDict("Page")) {
-      attrs2 = new PageAttrs(attrs1, kid.getDict());
-      page = new Page(xref, start+1, kid.getDict(), kidRef.getRef(), attrs2, getForm());
-      if (!page->isOk()) {
-	++start;
-	goto err3;
-      }
-      if (start >= pagesSize) {
-	pagesSize += 32;
-	pages = (Page **)greallocn(pages, pagesSize, sizeof(Page *));
-	pageRefs = (Ref *)greallocn(pageRefs, pagesSize, sizeof(Ref));
-	for (j = pagesSize - 32; j < pagesSize; ++j) {
-	  pages[j] = NULL;
-	  pageRefs[j].num = -1;
-	  pageRefs[j].gen = -1;
-	}
+      PageAttrs *attrs = new PageAttrs(attrsList->back(), kid.getDict());
+      Page *p = new Page(xref, lastCachedPage+1, kid.getDict(),
+                     kidRef.getRef(), attrs, form);
+      if (!p->isOk()) {
+        error(-1, "Failed to create page (page %d)", lastCachedPage+1);
+        delete p;
+        kidRef.free();
+        kid.free();
+        return gFalse;
       }
-      pages[start] = page;
-      if (kidRef.isRef()) {
-	pageRefs[start].num = kidRef.getRefNum();
-	pageRefs[start].gen = kidRef.getRefGen();
+      if (lastCachedPage >= numPages) {
+        error(-1, "Page count in top-level pages object is incorrect");
+        delete p; // not stored in pages[], so it must be released here
+        kidRef.free();
+        kid.free();
+        return gFalse;
       }
-      ++start;
+
+      pages[lastCachedPage] = p;
+      pageRefs[lastCachedPage].num = kidRef.getRefNum();
+      pageRefs[lastCachedPage].gen = kidRef.getRefGen();
+
+      lastCachedPage++;
+      kidsIdxList->back()++;
+
     // This should really be isDict("Pages"), but I've seen at least one
     // PDF file where the /Type entry is missing.
     } else if (kid.isDict()) {
-      if ((start = readPageTree(kid.getDict(), attrs1, start, alreadyRead))
-	  < 0)
-	goto err2;
+      attrsList->push_back(new PageAttrs(attrsList->back(), kid.getDict()));
+      pagesRefList->push_back(kidRef.getRef());
+      kid.getDict()->incRef();
+      pagesList->push_back(kid.getDict());
+      kidsIdxList->push_back(0);
     } else {
       error(-1, "Kid object (page %d) is wrong type (%s)",
-	    start+1, kid.getTypeName());
+            lastCachedPage+1, kid.getTypeName());
+      kidRef.free();
+      kid.free();
+      return gFalse;
     }
-    kid.free();
     kidRef.free();
+    kid.free();
+
   }
-  delete attrs1;
-  kids.free();
-  return start;
 
- err3:
-  delete page;
- err2:
-  kid.free();
-  kidRef.free();
-  kids.free();
-  delete attrs1;
-  ok = gFalse;
-  return -1;
+  return gFalse;
 }
 
 int Catalog::findPage(int num, int gen) {
   int i;
 
   for (i = 0; i < numPages; ++i) {
-    if (pageRefs[i].num == num && pageRefs[i].gen == gen)
+    Ref *ref = getPageRef(i+1);
+    if (ref->num == num && ref->gen == gen)
       return i + 1;
   }
   return 0;
diff --git a/poppler/Catalog.h b/poppler/Catalog.h
index 2cab80a..5a25109 100644
--- a/poppler/Catalog.h
+++ b/poppler/Catalog.h
@@ -151,10 +151,10 @@ public:
   int getNumPages() { return numPages; }
 
   // Get a page.
-  Page *getPage(int i) { return pages[i-1]; }
+  Page *getPage(int i);
 
   // Get the reference for a page object.
-  Ref *getPageRef(int i) { return &pageRefs[i-1]; }
+  Ref *getPageRef(int i);
 
   // Return base URI, or NULL if none.
   GooString *getBaseURI() { return baseURI; }
@@ -232,6 +232,11 @@ private:
   XRef *xref;			// the xref table for this PDF file
   Page **pages;			// array of pages
   Ref *pageRefs;		// object ID for each page
+  int lastCachedPage;
+  GooVector<Dict *> *pagesList;
+  GooVector<Ref> *pagesRefList;
+  GooVector<PageAttrs *> *attrsList;
+  GooVector<int> *kidsIdxList;
   Form *form;
   int numPages;			// number of pages
   int pagesSize;		// size of pages array
@@ -251,8 +256,7 @@ private:
   PageMode pageMode;		// page mode
   PageLayout pageLayout;	// page layout
 
-  int readPageTree(Dict *pages, PageAttrs *attrs, int start,
-		   char *alreadyRead);
+  GBool cachePageTree(int page); // Cache first <page> pages.
   Object *findDestInTree(Object *tree, GooString *name, Object *obj);
 
   Object *getNames();
-- 
1.6.4.2


From 90f6562d9f148cfd28bc29a71fbfc0d2cdbb380a Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 22:01:41 +0100
Subject: [PATCH 12/15] Parse number of pages on demand

---
 poppler/Catalog.cc |   70 +++++++++++++++++++++++++++++++--------------------
 poppler/Catalog.h  |    2 +-
 2 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc
index 416fb66..536e474 100644
--- a/poppler/Catalog.cc
+++ b/poppler/Catalog.cc
@@ -64,7 +64,8 @@ Catalog::Catalog(XRef *xrefA) {
   xref = xrefA;
   pages = NULL;
   pageRefs = NULL;
-  numPages = pagesSize = 0;
+  numPages = -1;
+  pagesSize = 0;
   baseURI = NULL;
   pageLabelInfo = NULL;
   form = NULL;
@@ -89,27 +90,6 @@ Catalog::Catalog(XRef *xrefA) {
   // get the AcroForm dictionary
   catDict.dictLookup("AcroForm", &acroForm);
 
-  // read page tree
-  catDict.dictLookup("Pages", &pagesDict);
-  // This should really be isDict("Pages"), but I've seen at least one
-  // PDF file where the /Type entry is missing.
-  if (!pagesDict.isDict()) {
-    error(-1, "Top-level pages object is wrong type (%s)",
-	  pagesDict.getTypeName());
-    goto err2;
-  }
-  pagesDict.dictLookup("Count", &obj);
-  // some PDF files actually use real numbers here ("/Count 9.0")
-  if (!obj.isNum()) {
-    error(-1, "Page count in top-level pages object is wrong type (%s)",
-	  obj.getTypeName());
-    numPages = 0;
-  } else {
-    numPages = (int)obj.getNum();
-  }
-  obj.free();
-  pagesDict.free();
-
   // read base URI
   if (catDict.dictLookup("URI", &obj)->isDict()) {
     if (obj.dictLookup("Base", &obj2)->isString()) {
@@ -136,8 +116,6 @@ Catalog::Catalog(XRef *xrefA) {
   catDict.free();
   return;
 
- err2:
-  pagesDict.free();
  err1:
   catDict.free();
   ok = gFalse;
@@ -270,7 +248,7 @@ GBool Catalog::cachePageTree(int page)
       return gFalse;
     }
 
-    pagesSize = numPages;
+    pagesSize = getNumPages();
     pages = (Page **)gmallocn(pagesSize, sizeof(Page *));
     pageRefs = (Ref *)gmallocn(pagesSize, sizeof(Ref));
     for (int i = 0; i < pagesSize; ++i) {
@@ -395,7 +373,7 @@ GBool Catalog::cachePageTree(int page)
 int Catalog::findPage(int num, int gen) {
   int i;
 
-  for (i = 0; i < numPages; ++i) {
+  for (i = 0; i < getNumPages(); ++i) {
     Ref *ref = getPageRef(i+1);
     if (ref->num == num && ref->gen == gen)
       return i + 1;
@@ -719,7 +697,7 @@ GBool Catalog::labelToIndex(GooString *label, int *index)
       return gFalse;
   }
 
-  if (*index < 0 || *index >= numPages)
+  if (*index < 0 || *index >= getNumPages())
     return gFalse;
 
   return gTrue;
@@ -729,7 +707,7 @@ GBool Catalog::indexToLabel(int index, GooString *label)
 {
   char buffer[32];
 
-  if (index < 0 || index >= numPages)
+  if (index < 0 || index >= getNumPages())
     return gFalse;
 
   PageLabelInfo *pli = getPageLabelInfo();
@@ -845,6 +823,42 @@ EmbFile::EmbFile(Object *efDict, GooString *description)
     m_mimetype = new GooString();
 }
 
+int Catalog::getNumPages()
+{
+  // Parsed lazily: numPages is -1 until the count has been read from the file.
+  if (numPages == -1)
+  {
+    Object catDict, pagesDict, obj;
+    xref->getCatalog(&catDict);
+    catDict.dictLookup("Pages", &pagesDict);
+    catDict.free();
+    // This should really be isDict("Pages"), but I've seen at least one
+    // PDF file where the /Type entry is missing.
+    if (!pagesDict.isDict()) {
+      error(-1, "Top-level pages object is wrong type (%s)",
+          pagesDict.getTypeName());
+      pagesDict.free();
+      return 0;
+    }
+
+    pagesDict.dictLookup("Count", &obj);
+    // some PDF files actually use real numbers here ("/Count 9.0")
+    if (!obj.isNum()) {
+      error(-1, "Page count in top-level pages object is wrong type (%s)",
+         obj.getTypeName());
+      numPages = 0;
+    } else if (obj.getNum() < 0) { // bogus; -1 would also read as "not parsed yet"
+      error(-1, "Page count in top-level pages object is negative");
+      numPages = 0;
+    } else {
+      numPages = (int)obj.getNum();
+    }
+    obj.free();
+    pagesDict.free();
+  }
+  return numPages;
+}
+
 PageLabelInfo *Catalog::getPageLabelInfo()
 {
   if (!pageLabelInfo) {
diff --git a/poppler/Catalog.h b/poppler/Catalog.h
index 5a25109..8bca80b 100644
--- a/poppler/Catalog.h
+++ b/poppler/Catalog.h
@@ -148,7 +148,7 @@ public:
   GBool isOk() { return ok; }
 
   // Get number of pages.
-  int getNumPages() { return numPages; }
+  int getNumPages();
 
   // Get a page.
   Page *getPage(int i);
-- 
1.6.4.2


From 244b93adc29781136fab10b31ac7370c392f53e5 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 25 Mar 2010 18:53:54 +0100
Subject: [PATCH 13/15] Get number of pages from linearization table

---
 poppler/PDFDoc.cc |    9 +++++++++
 poppler/PDFDoc.h  |    2 +-
 2 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 89dba6f..8f105fd 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -1033,6 +1033,15 @@ Guint PDFDoc::getMainXRefEntriesOffset()
   return mainXRefEntriesOffset;
 }
 
+int PDFDoc::getNumPages()
+{
+  // For a linearized file the count comes from the linearization
+  // data; otherwise the catalog supplies it.
+  if (!isLinearized())
+    return catalog->getNumPages();
+  return getLinearization()->getNumPages();
+}
+
 Page *PDFDoc::getPage(int page)
 {
   if ((page < 1) || page > getNumPages()) return NULL;
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 8de139f..9069698 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -118,7 +118,7 @@ public:
     { return getPage(page) ? getPage(page)->getRotate() : 0 ; }
 
   // Get number of pages.
-  int getNumPages() { return catalog->getNumPages(); }
+  int getNumPages();
 
   // Return the contents of the metadata stream, or NULL if there is
   // no metadata.
-- 
1.6.4.2


From de1909113f3389f8df15e044470bb56c5a1d122c Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 22:03:27 +0100
Subject: [PATCH 14/15] Add hint tables support

---
 CMakeLists.txt      |    2 +
 poppler/Hints.cc    |  337 +++++++++++++++++++++++++++++++++++++++++++++++++++
 poppler/Hints.h     |   89 ++++++++++++++
 poppler/Makefile.am |    2 +
 poppler/PDFDoc.cc   |   14 ++
 poppler/PDFDoc.h    |    5 +
 6 files changed, 449 insertions(+), 0 deletions(-)
 create mode 100644 poppler/Hints.cc
 create mode 100644 poppler/Hints.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a119a6d..6d43826 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -245,6 +245,7 @@ set(poppler_SRCS
   poppler/GfxFont.cc
   poppler/GfxState.cc
   poppler/GlobalParams.cc
+  poppler/Hints.cc
   poppler/JArithmeticDecoder.cc
   poppler/JBIG2Stream.cc
   poppler/Lexer.cc
@@ -391,6 +392,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/GfxState.h
     poppler/GfxState_helpers.h
     poppler/GlobalParams.h
+    poppler/Hints.h
     poppler/JArithmeticDecoder.h
     poppler/JBIG2Stream.h
     poppler/Lexer.h
diff --git a/poppler/Hints.cc b/poppler/Hints.cc
new file mode 100644
index 0000000..9b488ba
--- /dev/null
+++ b/poppler/Hints.cc
@@ -0,0 +1,337 @@
+//========================================================================
+//
+// Hints.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#include <config.h>
+
+#include "Hints.h"
+
+#include "Linearization.h"
+#include "Object.h"
+#include "Stream.h"
+#include "XRef.h"
+#include "Parser.h"
+#include "Lexer.h"
+
+//------------------------------------------------------------------------
+// Hints
+//------------------------------------------------------------------------
+
+// Collects the layout parameters from the linearization dictionary,
+// allocates the per-page arrays and parses the hint tables from str.
+Hints::Hints(BaseStream *str, Linearization *linearization, XRef *xref)
+{
+  hintsOffset = linearization->getHintsOffset();
+  hintsLength = linearization->getHintsLength();
+  mainXRefEntriesOffset = linearization->getMainXRefEntriesOffset();
+  nPages = linearization->getNumPages();
+  if (nPages < 0) nPages = 0;
+  pageFirst = linearization->getPageFirst();
+  pageFirst = (pageFirst > 0) && (pageFirst <= nPages) ? pageFirst : 1;
+  objectNumberFirst = linearization->getObjectNumberFirst();
+  pageOffsetFirst = xref->getEntry(objectNumberFirst)->offset;
+  pageEndFirst = linearization->getEndFirst();
+  // One slot per page. sharedObjectId holds pointers, so size it as such.
+  nObjects = (Guint *) gmallocn(nPages, sizeof(Guint));
+  xRefOffset = (Guint *) gmallocn(nPages, sizeof(Guint));
+  pageLength = (Guint *) gmallocn(nPages, sizeof(Guint));
+  pageOffset = (Guint *) gmallocn(nPages, sizeof(Guint));
+  numSharedObject = (Guint *) gmallocn(nPages, sizeof(Guint));
+  sharedObjectId = (Guint **) gmallocn(nPages, sizeof(Guint *));
+  // ~Hints() frees these unconditionally; NULL them in case parsing fails.
+  for (int i = 0; i < nPages; i++) sharedObjectId[i] = NULL;
+  groupLength = groupOffset = groupHasSignature = NULL;
+  groupNumObjects = groupXRefOffset = NULL;
+  readTables(str, linearization);
+}
+
+Hints::~Hints()   // releases the arrays allocated for the hint tables
+{
+  gfree(groupXRefOffset);   // shared object group arrays
+  gfree(groupNumObjects);
+  gfree(groupHasSignature);
+  gfree(groupOffset);
+  gfree(groupLength);
+  for (int page = 0; page < nPages; ++page) gfree(sharedObjectId[page]);
+  gfree(sharedObjectId);    // only after the id lists it holds
+  gfree(numSharedObject);   // remaining per-page arrays
+  gfree(pageOffset);
+  gfree(pageLength);
+  gfree(xRefOffset);
+  gfree(nObjects);
+}
+
+// Parse the hint stream object found at hintsOffset in str: expect
+// "<int> <int> obj <stream>", then decode the page offset table (stream
+// start) and the shared objects table (at the dictionary's S offset).
+void Hints::readTables(BaseStream *str, Linearization *linearization)
+{
+  Parser *parser;
+  Object obj;
+  //TODO: overflow hint table?
+  int hintsEnd = hintsOffset + hintsLength - 1;
+  obj.initNull();
+  parser = new Parser(NULL,
+     new Lexer(NULL,
+               str->makeSubStream(hintsOffset, gFalse, hintsEnd, &obj)),
+     gTrue);
+  if (parser->getObj(&obj)->isInt() &&
+     (obj.free(), parser->getObj(&obj)->isInt()) &&
+     (obj.free(), parser->getObj(&obj)->isCmd("obj")) &&
+     (obj.free(), parser->getObj(&obj)->isStream())){
+    Stream *hintsStream = obj.getStream();
+    Dict *hintsDict = obj.streamGetDict();
+    int sharedStreamOffset;
+    if (hintsDict->lookupInt("S", NULL, &sharedStreamOffset)) {
+        hintsStream->reset();
+        readPageOffsetTable(hintsStream);
+        // Rewind, then skip ahead to where the shared objects table begins.
+        hintsStream->reset();
+        for (int i=0; i<sharedStreamOffset; i++) hintsStream->getChar();
+        readSharedObjectsTable(hintsStream);
+    }
+  }
+  // Free the last parsed object on every path: when the header check
+  // stops early, obj still holds the token that failed it.
+  obj.free();
+  delete parser;
+}
+
+void Hints::readPageOffsetTable(Stream *str)
+{
+  if (nPages < 1) return;
+  // Decodes the page offset table from str into the per-page arrays.
+  inputBits = 0; // reset on byte boundary.
+  // Header. 32 bits: base object count; per-page deltas are added below.
+  nObjectLeast = readBits(32, str);
+  // 32 bits: an offset, raised by hintsLength when at or past hintsOffset.
+  objectOffsetFirst = readBits(32, str);
+  if (objectOffsetFirst >= hintsOffset) objectOffsetFirst += hintsLength;
+  // 16 bits: width of each per-page object count delta.
+  nBitsDiffObjects = readBits(16, str);
+  // 32 bits: base page length; per-page deltas are added below.
+  pageLengthLeast = readBits(32, str);
+  // 16 bits: width of each per-page length delta.
+  nBitsDiffPageLength = readBits(16, str);
+  // The next four fields are stored but not used in this function.
+  OffsetStreamLeast = readBits(32, str);
+
+  nBitsOffsetStream = readBits(16, str);
+
+  lengthStreamLeast = readBits(32, str);
+
+  nBitsLengthStream = readBits(16, str);
+  // 16 bits: width of each per-page shared object count.
+  nBitsNumShared = readBits(16, str);
+  // 16 bits: width of each shared object id.
+  nBitsShared = readBits(16, str);
+  // The last two header fields are stored but not used in this function.
+  nBitsNumerator = readBits(16, str);
+
+  denominator = readBits(16, str);
+  // Per-page object counts: base plus delta.
+  for (int i=0; i<nPages; i++) {
+    nObjects[i] = nObjectLeast + readBits(nBitsDiffObjects, str);
+  }
+  // Running xref offsets, 20 bytes per object; slot 0 counts as 0 objects.
+  nObjects[0] = 0;
+  xRefOffset[0] = mainXRefEntriesOffset + 20;
+  for (int i=1; i<nPages; i++) {
+    xRefOffset[i] = xRefOffset[i-1] + 20*nObjects[i-1];
+  }
+  // Per-page lengths: base plus delta.
+  for (int i=0; i<nPages; i++) {
+    pageLength[i] = pageLengthLeast + readBits(nBitsDiffPageLength, str);
+  }
+  // Per-page shared object counts.
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (int i=0; i<nPages; i++) {
+    numSharedObject[i] = readBits(nBitsNumShared, str);
+  }
+  numSharedObject[0] = 0; // Do not trust the read value to be 0.
+  // Shared object ids for every slot after the first.
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  sharedObjectId[0] = NULL;
+  for (int i=1; i<nPages; i++) {
+    sharedObjectId[i] = (Guint *) gmallocn(numSharedObject[i], sizeof(Guint));
+    for (Guint j=0; j < numSharedObject[i]; j++) {
+      sharedObjectId[i][j] = readBits(nBitsShared, str);
+    }
+  }
+  // Slot 0 uses the first-page values gathered in the constructor.
+  pageOffset[0] = pageOffsetFirst;
+  pageLength[0] = pageEndFirst - pageOffsetFirst;
+  // set fake pageOffset[0] to correct for hint table.
+  if (pageOffset[0] < hintsOffset) {
+    pageOffset[0] += hintsLength;
+  }
+  // find pageOffsets.
+  for (int i=1; i<nPages; i++) {
+    pageOffset[i] = pageOffset[i-1] + pageLength[i-1];
+  }
+  // restore correct pageOffset[0].
+  pageOffset[0] = pageOffsetFirst;
+
+}
+
+// Decode the shared objects table from str into the group* arrays.
+// Counts read from the file are checked before they index an array.
+void Hints::readSharedObjectsTable(Stream *str)
+{
+  inputBits = 0; // reset on byte boundary.
+  Guint firstSharedObjectNumber = readBits(32, str);
+  Guint firstSharedObjectOffset = readBits(32, str);
+  firstSharedObjectOffset += hintsLength;
+  Guint numSharedObjectsFirst = readBits(32, str);
+  Guint numSharedObjects = readBits(32, str);
+  Guint nBitsNumObjects = readBits(16, str);
+  Guint groupLengthLeast = readBits(32, str);
+  Guint nBitsDiffGroupLength = readBits(16, str);
+  // Groups [0, numSharedObjectsFirst) are handled apart from the rest; a
+  // count above the total would make the loops below overrun the arrays.
+  if (numSharedObjectsFirst > numSharedObjects)
+    numSharedObjectsFirst = numSharedObjects;
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  groupLength = (Guint *) gmallocn(numSharedObjects, sizeof(Guint));
+  for (Guint i=0; i<numSharedObjects; i++) {
+    groupLength[i] = groupLengthLeast + readBits(nBitsDiffGroupLength, str);
+  }
+  // First-range offsets accumulate from objectOffsetFirst, the rest restart.
+  groupOffset = (Guint *) gmallocn(numSharedObjects, sizeof(Guint));
+  if (numSharedObjects > 0) groupOffset[0] = objectOffsetFirst;
+  for (Guint i=1; i<numSharedObjectsFirst; i++) {
+    groupOffset[i] = groupOffset[i-1] + groupLength[i-1];
+  }
+  if (numSharedObjectsFirst < numSharedObjects)
+    groupOffset[numSharedObjectsFirst] = firstSharedObjectOffset;
+  for (Guint i=numSharedObjectsFirst+1; i<numSharedObjects; i++) {
+    groupOffset[i] = groupOffset[i-1] + groupLength[i-1];
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  groupHasSignature = (Guint *) gmallocn(numSharedObjects, sizeof(Guint));
+  for (Guint i=0; i<numSharedObjects; i++) {
+    groupHasSignature[i] = readBits(1, str);
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (Guint i=0; i<numSharedObjects; i++) {
+    if (groupHasSignature[i]) {
+      for (int w=0; w<4; w++) readBits(32, str); // skip 128 bits, 32 at a time
+    }
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  groupNumObjects = (Guint *) gmallocn(numSharedObjects, sizeof(Guint));
+  for (Guint i=0; i<numSharedObjects; i++) {
+    groupNumObjects[i] =
+       nBitsNumObjects ? 1 + readBits(nBitsNumObjects, str) : 1;
+  }
+
+  groupXRefOffset = (Guint *) gmallocn(numSharedObjects, sizeof(Guint));
+  for (Guint i=0; i<numSharedObjectsFirst; i++) {
+    groupNumObjects[i] = 0;
+    groupXRefOffset[i] = 0;
+  }
+  if (numSharedObjectsFirst < numSharedObjects)
+    groupXRefOffset[numSharedObjectsFirst] =
+        mainXRefEntriesOffset + 20*firstSharedObjectNumber;
+  for (Guint i=numSharedObjectsFirst+1; i<numSharedObjects; i++) {
+    groupXRefOffset[i] = groupXRefOffset[i-1] + 20*groupNumObjects[i-1];
+  }
+}
+
+// Byte offset of the given 1-based page, or 0 when page is out of range.
+Guint Hints::getPageOffset(int page)
+{
+  if (page < 1 || page > nPages)
+    return 0;
+  // Slot 0 belongs to pageFirst; pages before it sit one slot higher.
+  int slot = 0;
+  if (page != pageFirst)
+    slot = (page > pageFirst) ? page - 1 : page;
+  return pageOffset[slot];
+}
+
+// Byte ranges recorded for a 1-based page: its own data, its xref
+// entries, then the data and xref entries of every shared object group
+// it references.  Returns NULL for an out-of-range page; otherwise a
+// vector allocated with new.
+GooVector<ByteRange>* Hints::getPageRanges(int page)
+{
+  if (page < 1 || page > nPages)
+    return NULL;
+
+  // Slot 0 belongs to pageFirst; pages before it sit one slot higher.
+  int slot = 0;
+  if (page != pageFirst)
+    slot = (page > pageFirst) ? page - 1 : page;
+
+  GooVector<ByteRange> *ranges = new GooVector<ByteRange>;
+  ByteRange r;
+
+  r.offset = pageOffset[slot];
+  r.length = pageLength[slot];
+  ranges->push_back(r);
+  r.offset = xRefOffset[slot];
+  r.length = 20*nObjects[slot];
+  ranges->push_back(r);
+
+  for (Guint n = 0; n < numSharedObject[slot]; n++) {
+    int group = sharedObjectId[slot][n];
+    r.offset = groupOffset[group];
+    r.length = groupLength[group];
+    ranges->push_back(r);
+    r.offset = groupXRefOffset[group];
+    r.length = 20*groupNumObjects[group];
+    ranges->push_back(r);
+  }
+
+  return ranges;
+}
+
+// Next bit of str, taking the most significant bit of each byte
+// first, or (Guint)-1 once getChar() reports EOF.
+Guint Hints::readBit(Stream *str)
+{
+  if (inputBits == 0) {
+    // Buffer drained: fetch one more byte.
+    int next = str->getChar();
+    if (next == EOF)
+      return (Guint) -1;
+    bitsBuffer = next;
+    inputBits = 8;
+  }
+  --inputBits;
+  Guint value = (bitsBuffer >> inputBits) & 1;
+  return value;
+}
+
+// Read an n-bit unsigned value from str, most significant bit first.
+// A zero-width field reads as 0 without consuming input; a negative
+// width or running out of input yields (Guint)-1.  Only the low bits
+// that fit in a Guint are kept when n exceeds its width.
+Guint Hints::readBits(int n, Stream *str)
+{
+  if (n < 0) return -1;
+
+  // Accumulate iteratively: no recursion depth tied to n and no shift
+  // count beyond the width of Guint.
+  Guint bits = 0;
+  for (int i = 0; i < n; i++) {
+    Guint bit = readBit(str);
+    // Test for end of input before the bit is merged in.
+    if (bit == (Guint) -1) return -1;
+    bits = (bits << 1) | bit;
+  }
+  return bits;
+}
+
+
diff --git a/poppler/Hints.h b/poppler/Hints.h
new file mode 100644
index 0000000..7cacf00
--- /dev/null
+++ b/poppler/Hints.h
@@ -0,0 +1,89 @@
+//========================================================================
+//
+// Hints.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#ifndef HINTS_H
+#define HINTS_H
+
+#include <string.h>
+#include "goo/gtypes.h"
+#include "goo/GooVector.h"
+//#include <vector>
+#include "PDFDoc.h"
+
+class Stream;
+class BaseStream;
+class Linearization;
+class XRef;
+
+//------------------------------------------------------------------------
+// Hints
+//------------------------------------------------------------------------
+
+class Hints {
+public:
+  // Hint tables of a linearized document, parsed at construction.
+  Hints(BaseStream *str, Linearization *linearization, XRef *xref);
+  ~Hints();
+  // Lookups by 1-based page number.
+  Guint getPageOffset(int page);
+  GooVector<ByteRange>* getPageRanges(int page);
+
+private:
+  // Parsing of the hint stream and its two tables.
+  void readTables(BaseStream *str, Linearization *linearization);
+  void readPageOffsetTable(Stream *str);
+  void readSharedObjectsTable(Stream *str);
+  // Bit-level reads from the hint stream.
+  Guint readBit(Stream *str);
+  Guint readBits(int n, Stream *str);
+  // Values copied from the linearization dictionary.
+  Guint hintsOffset;
+  Guint hintsLength;
+  Guint mainXRefEntriesOffset;
+  // Page count and first-page layout, gathered in the constructor.
+  int nPages;
+  int pageFirst;
+  Guint pageOffsetFirst;
+  Guint pageEndFirst;
+  int objectNumberFirst;
+  // Header fields of the page offset table, set by readPageOffsetTable().
+  Guint nObjectLeast;
+  Guint objectOffsetFirst;
+  Guint nBitsDiffObjects;
+  Guint pageLengthLeast;
+  Guint nBitsDiffPageLength;
+  Guint OffsetStreamLeast;
+  Guint nBitsOffsetStream;
+  Guint lengthStreamLeast;
+  Guint nBitsLengthStream;
+  Guint nBitsNumShared;
+  Guint nBitsShared;
+  Guint nBitsNumerator;
+  Guint denominator;
+  // Per-page arrays, allocated with nPages entries in the constructor.
+  Guint *nObjects;
+  Guint *xRefOffset;
+  Guint *pageLength;
+  Guint *pageOffset;
+  Guint *numSharedObject;
+  Guint **sharedObjectId;
+  // Shared object group arrays, allocated by readSharedObjectsTable().
+  Guint *groupLength;
+  Guint *groupOffset;
+  Guint *groupHasSignature;
+  Guint *groupNumObjects;
+  Guint *groupXRefOffset;
+  // Bit reader state: bits left in bitsBuffer, and the current byte.
+  int inputBits;
+  char bitsBuffer;
+
+};
+
+#endif
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 8c1e019..a6b7990 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -206,6 +206,7 @@ poppler_include_HEADERS =	\
 	GfxState.h		\
 	GfxState_helpers.h	\
 	GlobalParams.h		\
+	Hints.h			\
 	JArithmeticDecoder.h	\
 	JBIG2Stream.h		\
 	Lexer.h			\
@@ -285,6 +286,7 @@ libpoppler_la_SOURCES =		\
 	GfxFont.cc 		\
 	GfxState.cc		\
 	GlobalParams.cc		\
+	Hints.cc		\
 	JArithmeticDecoder.cc	\
 	JBIG2Stream.cc		\
 	Lexer.cc 		\
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 8f105fd..37e5d20 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -67,6 +67,7 @@
 #include "Outline.h"
 #endif
 #include "PDFDoc.h"
+#include "Hints.h"
 
 //------------------------------------------------------------------------
 
@@ -94,6 +95,7 @@ void PDFDoc::init()
   xref = NULL;
   linearization = NULL;
   catalog = NULL;
+  hints = NULL;
 #ifndef DISABLE_OUTLINE
   outline = NULL;
 #endif
@@ -268,6 +270,9 @@ PDFDoc::~PDFDoc() {
   if (xref) {
     delete xref;
   }
+  if (hints) {
+    delete hints;
+  }
   if (linearization) {
     delete linearization;
   }
@@ -471,6 +476,15 @@ GBool PDFDoc::isLinearized() {
     return gFalse;
 }
 
+// Hint tables of a linearized document, created on first use.  Stays
+// NULL for a document that is not linearized.
+Hints *PDFDoc::getHints()
+{
+  if (hints == NULL && isLinearized())
+    hints = new Hints(str, getLinearization(), xref);
+  return hints;
+}
+
 int PDFDoc::saveAs(GooString *name, PDFWriteMode mode) {
   FILE *f;
   OutStream *outStr;
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 9069698..b2f40c9 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -49,6 +49,7 @@ class LinkAction;
 class LinkDest;
 class Outline;
 class Linearization;
+class Hints;
 
 enum PDFWriteMode {
   writeStandard,
@@ -236,6 +237,9 @@ private:
   void saveIncrementalUpdate (OutStream* outStr);
   void saveCompleteRewrite (OutStream* outStr);
 
+  // Get hints.
+  Hints *getHints();
+
   PDFDoc();
   void init();
   GBool setup(GooString *ownerPassword, GooString *userPassword);
@@ -258,6 +262,7 @@ private:
   Linearization *linearization;
   XRef *xref;
   Catalog *catalog;
+  Hints *hints;
 #ifndef DISABLE_OUTLINE
   Outline *outline;
 #endif
-- 
1.6.4.2


From 99fa4a4f91713c28de8c17b05dd269dbd5e89324 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 19:06:02 +0200
Subject: [PATCH 15/15] Use hint tables for PDFDoc::getPage()

---
 poppler/PDFDoc.cc |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 poppler/PDFDoc.h  |    4 +++
 2 files changed, 76 insertions(+), 1 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 37e5d20..1441428 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -100,6 +100,7 @@ void PDFDoc::init()
   outline = NULL;
 #endif
   startXRefPos = ~(Guint)0;
+  pageCache = NULL;
 }
 
 PDFDoc::PDFDoc()
@@ -259,6 +260,14 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
 }
 
 PDFDoc::~PDFDoc() {
+  if (pageCache) {
+    for (int i = 0; i < getNumPages(); i++) {
+      if (pageCache[i]) {
+        delete pageCache[i];
+      }
+    }
+    gfree(pageCache);
+  }
 #ifndef DISABLE_OUTLINE
   if (outline) {
     delete outline;
@@ -1056,11 +1065,73 @@ int PDFDoc::getNumPages()
   }
 }
 
+// Offset of the given page as recorded in the hint tables.  Logs an
+// error and returns 0 when no hints exist or they yield no offset.
+Guint PDFDoc::getPageOffset(int page)
+{
+  Hints *pageHints = getHints();
+  Guint offset = pageHints ? pageHints->getPageOffset(page) : 0;
+  if (offset == 0) {
+    error(-1, "Failed getting page offset from hint table");
+  }
+  return offset;
+}
+
+// Parse the indirect object at byte offset `offset` and, if it is a
+// "Page" dictionary, wrap it in a Page numbered `page`.  Returns NULL
+// when offset is 0, the object is not a page, or the Page is not ok.
+Page *PDFDoc::parsePage(Guint offset, int page)
+{
+  if (offset == 0) return NULL;  // getPageOffset() reports failure as 0
+  Page *p = NULL;
+  Object obj;
+  obj.initNull();
+  Stream *stream = str->makeSubStream(offset, gFalse, 0, &obj);
+  Parser parser(xref, new Lexer(xref, stream), gTrue); // no temporary copy
+
+  Object obj1, obj2, obj3, obj4;
+  if (parser.getObj(&obj1)->isInt() &&
+      parser.getObj(&obj2)->isInt() &&
+      parser.getObj(&obj3)->isCmd("obj") &&
+      parser.getObj(&obj4)->isDict("Page")) {
+    Ref pageRef;
+    pageRef.num = obj1.getInt();
+    pageRef.gen = obj2.getInt();
+    Dict *pageDict = obj4.getDict();
+    p = new Page(xref, page, pageDict, pageRef,
+                 new PageAttrs(NULL, pageDict), catalog->getForm());
+    if (!p->isOk()) {
+      delete p;
+      p = NULL;
+    }
+  }
+  obj4.free();
+  obj3.free();
+  obj2.free();
+  obj1.free();
+  return p;
+}
+
 Page *PDFDoc::getPage(int page)
 {
   if ((page < 1) || page > getNumPages()) return NULL;
 
-  {
+  if (isLinearized()) {
+    // Linearized: pages are parsed on first request from the offset
+    // in the hint tables and kept in pageCache[page-1].
+    if (!pageCache) {
+      pageCache = (Page **) gmallocn(getNumPages(), sizeof(Page *));
+      for (int i = 0; i < getNumPages(); i++) pageCache[i] = NULL;
+    }
+    if (!pageCache[page-1]) {
+      Guint offset = getPageOffset(page); // looked up (and logged) once
+      pageCache[page-1] = parsePage(offset, page);
+      if (!pageCache[page-1]) {
+         error(-1, "Failed parsing page %d at offset %u", page, offset);
+      }
+    }
+    return pageCache[page-1];
+  } else {
     return catalog->getPage(page);
   }
 }
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index b2f40c9..99c005e 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -237,6 +237,9 @@ private:
   void saveIncrementalUpdate (OutStream* outStr);
   void saveCompleteRewrite (OutStream* outStr);
 
+  Guint getPageOffset(int page);
+  Page *parsePage(Guint offset, int page);
+
   // Get hints.
   Hints *getHints();
 
@@ -266,6 +269,7 @@ private:
 #ifndef DISABLE_OUTLINE
   Outline *outline;
 #endif
+  Page **pageCache;
 
   GBool ok;
   int errCode;
-- 
1.6.4.2


More information about the poppler mailing list