[poppler] Linearization support

Hib Eris hib at hiberis.nl
Mon May 3 06:13:07 PDT 2010


Hi,

On Sun, May 2, 2010 at 9:34 PM, Hib Eris <hib at hiberis.nl> wrote:
> Hi again,
>
> On Thu, Apr 29, 2010 at 1:52 PM, Hib Eris <hib at hiberis.nl> wrote:
>> Hi all,
>>
>> I have two series of patches that allow poppler to handle linearized
>> documents more efficiently.
>
> I have updated my patches to improve the handling of malformed
> documents. Here are the new patches.
>
> Hib
>

I noticed that the last patch I send yesterday does not work well for
the first page in some documents.
The cause is that the offset data in the hints tables of some
documents does not seem to follow the PDF specfication. Fortunately,
for the first page you can get the offset from the linearization
dictionary and the first page xref table. This is actually better,
since you won't need to read the hints tables to render the first
page.

I have updated the patches accordingly. My apologies for all the noise.

Hib
-------------- next part --------------
From a9f2d19d18dd509d3f495c4c2fbb830516fa0527 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 6 Apr 2010 19:24:42 +0200
Subject: [PATCH 01/12] Cleanup XRef constructors

---
 poppler/XRef.cc |   14 ++++++--------
 poppler/XRef.h  |    1 +
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 3ab23d9..8ae30b2 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -226,7 +226,7 @@ Object *ObjectStream::getObject(int objIdx, int objNum, Object *obj) {
 // XRef
 //------------------------------------------------------------------------
 
-XRef::XRef() {
+void XRef::init() {
   ok = gTrue;
   errCode = errNone;
   entries = NULL;
@@ -236,17 +236,15 @@ XRef::XRef() {
   objStr = NULL;
 }
 
+XRef::XRef() {
+  init();
+}
+
 XRef::XRef(BaseStream *strA) {
   Guint pos;
   Object obj;
 
-  ok = gTrue;
-  errCode = errNone;
-  size = 0;
-  entries = NULL;
-  streamEnds = NULL;
-  streamEndsLen = 0;
-  objStr = NULL;
+  init();
 
   encrypted = gFalse;
   permFlags = defPermFlags;
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 2dbd469..98db234 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -155,6 +155,7 @@ private:
   Guchar fileKey[16];		// file decryption key
   GBool ownerPasswordOk;	// true if owner password is correct
 
+  void init();
   Guint getStartXref();
   GBool readXRef(Guint *pos);
   GBool readXRefTable(Parser *parser, Guint *pos);
-- 
1.6.4.2


From f1bf4283fce1793d5d0a07810c7de4bfd0389562 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 6 Apr 2010 19:16:45 +0200
Subject: [PATCH 02/12] Create no more XRef entries than specified

---
 poppler/XRef.cc |  126 +++++++++++++++++++++++++++---------------------------
 poppler/XRef.h  |    5 ++-
 2 files changed, 67 insertions(+), 64 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 8ae30b2..49ff809 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -230,6 +230,7 @@ void XRef::init() {
   ok = gTrue;
   errCode = errNone;
   entries = NULL;
+  capacity = 0;
   size = 0;
   streamEnds = NULL;
   streamEndsLen = 0;
@@ -311,6 +312,50 @@ XRef::~XRef() {
   }
 }
 
+int XRef::reserve(int newSize)
+{
+  if (newSize > capacity) {
+
+    int realNewSize;
+    for (realNewSize = capacity ? 2 * capacity : 1024;
+          newSize > realNewSize && realNewSize > 0;
+          realNewSize <<= 1) ;
+    if ((realNewSize < 0) ||
+        (realNewSize >= INT_MAX / (int)sizeof(XRefEntry))) {
+      return 0;
+    }
+
+    entries = (XRefEntry *)greallocn(entries, realNewSize, sizeof(XRefEntry));
+    capacity = realNewSize;
+  }
+
+  return capacity;
+}
+
+int XRef::resize(int newSize)
+{
+  if (newSize > size) {
+
+    if (reserve(newSize) < newSize) return size;
+
+    for (int i = size; i < newSize; ++i) {
+      entries[i].offset = 0xffffffff;
+      entries[i].type = xrefEntryFree;
+      entries[i].obj.initNull ();
+      entries[i].updated = false;
+      entries[i].gen = 0;
+    }
+  } else {
+    for (int i = newSize; i < size; i++) {
+      entries[i].obj.free ();
+    }
+  }
+
+  size = newSize;
+
+  return size;
+}
+
 // Read the 'startxref' position.
 Guint XRef::getStartXref() {
   char buf[xrefSearchSize+1];
@@ -398,7 +443,7 @@ GBool XRef::readXRefTable(Parser *parser, Guint *pos) {
   GBool more;
   Object obj, obj2;
   Guint pos2;
-  int first, n, newSize, i;
+  int first, n, i;
 
   while (1) {
     parser->getObj(&obj);
@@ -417,29 +462,13 @@ GBool XRef::readXRefTable(Parser *parser, Guint *pos) {
     n = obj.getInt();
     obj.free();
     if (first < 0 || n < 0 || first + n < 0) {
-      goto err1;
+      goto err0;
     }
     if (first + n > size) {
-      for (newSize = size ? 2 * size : 1024;
-	   first + n > newSize && newSize > 0;
-	   newSize <<= 1) ;
-      if (newSize < 0) {
-	goto err1;
-      }
-      if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
+      if (resize(first + n) != first + n) {
         error(-1, "Invalid 'obj' parameters'");
-        goto err1;
+        goto err0;
       }
- 
-      entries = (XRefEntry *)greallocn(entries, newSize, sizeof(XRefEntry));
-      for (i = size; i < newSize; ++i) {
-	entries[i].offset = 0xffffffff;
-	entries[i].type = xrefEntryFree;
-	entries[i].obj.initNull ();
-	entries[i].updated = false;
-	entries[i].gen = 0;
-      }
-      size = newSize;
     }
     for (i = first; i < first + n; ++i) {
       if (!parser->getObj(&obj)->isInt()) {
@@ -520,6 +549,7 @@ GBool XRef::readXRefTable(Parser *parser, Guint *pos) {
 
  err1:
   obj.free();
+ err0:
   ok = gFalse;
   return gFalse;
 }
@@ -542,19 +572,10 @@ GBool XRef::readXRefStream(Stream *xrefStr, Guint *pos) {
     goto err1;
   }
   if (newSize > size) {
-    if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
-      error(-1, "Invalid 'size' parameter.");
-      return gFalse;
-    }
-    entries = (XRefEntry *)greallocn(entries, newSize, sizeof(XRefEntry));
-    for (i = size; i < newSize; ++i) {
-      entries[i].offset = 0xffffffff;
-      entries[i].type = xrefEntryFree;
-      entries[i].obj.initNull ();
-      entries[i].updated = false;
-      entries[i].gen = 0;
+    if (resize(newSize) != newSize) {
+      error(-1, "Invalid 'size' parameter");
+      goto err0;
     }
-    size = newSize;
   }
 
   if (!dict->lookupNF("W", &obj)->isArray() ||
@@ -627,31 +648,16 @@ GBool XRef::readXRefStream(Stream *xrefStr, Guint *pos) {
 
 GBool XRef::readXRefStreamSection(Stream *xrefStr, int *w, int first, int n) {
   Guint offset;
-  int type, gen, c, newSize, i, j;
+  int type, gen, c, i, j;
 
   if (first + n < 0) {
     return gFalse;
   }
   if (first + n > size) {
-    for (newSize = size ? 2 * size : 1024;
-	 first + n > newSize && newSize > 0;
-	 newSize <<= 1) ;
-    if (newSize < 0) {
-      return gFalse;
-    }
-    if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
-      error(-1, "Invalid 'size' inside xref table.");
+    if (resize(first + n) != size) {
+      error(-1, "Invalid 'size' inside xref table");
       return gFalse;
     }
-    entries = (XRefEntry *)greallocn(entries, newSize, sizeof(XRefEntry));
-    for (i = size; i < newSize; ++i) {
-      entries[i].offset = 0xffffffff;
-      entries[i].type = xrefEntryFree;
-      entries[i].obj.initNull ();
-      entries[i].updated = false;
-      entries[i].gen = 0;
-    }
-    size = newSize;
   }
   for (i = first; i < first + n; ++i) {
     if (w[0] == 0) {
@@ -712,13 +718,13 @@ GBool XRef::constructXRef() {
   int newSize;
   int streamEndsSize;
   char *p;
-  int i;
   GBool gotRoot;
   char* token = NULL;
   bool oneCycle = true;
   int offset = 0;
 
   gfree(entries);
+  capacity = 0;
   size = 0;
   entries = NULL;
 
@@ -800,19 +806,10 @@ GBool XRef::constructXRef() {
 		      error(-1, "Bad object number");
 		      return gFalse;
 		    }
-		    if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
-		      error(-1, "Invalid 'obj' parameters.");
+		    if (resize(newSize) != newSize) {
+		      error(-1, "Invalid 'obj' parameters");
 		      return gFalse;
 		    }
-		    entries = (XRefEntry *)
-		        greallocn(entries, newSize, sizeof(XRefEntry));
-		    for (i = size; i < newSize; ++i) {
-		      entries[i].offset = 0xffffffff;
-		      entries[i].type = xrefEntryFree;
-		      entries[i].obj.initNull ();
-		      entries[i].updated = false;
-		    }
-		    size = newSize;
 		  }
 		  if (entries[num].type == xrefEntryFree ||
 		      gen >= entries[num].gen) {
@@ -1085,7 +1082,10 @@ Guint XRef::strToUnsigned(char *s) {
 
 void XRef::add(int num, int gen, Guint offs, GBool used) {
   if (num >= size) {
-    entries = (XRefEntry *)greallocn(entries, num + 1, sizeof(XRefEntry));
+    if (num >= capacity) {
+      entries = (XRefEntry *)greallocn(entries, num + 1, sizeof(XRefEntry));
+      capacity = num + 1;
+    }
     for (int i = size; i < num + 1; ++i) {
       entries[i].offset = 0xffffffff;
       entries[i].type = xrefEntryFree;
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 98db234..f86e5ee 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -136,7 +136,8 @@ private:
   Guint start;			// offset in file (to allow for garbage
 				//   at beginning of file)
   XRefEntry *entries;		// xref entries
-  int size;			// size of <entries> array
+  int capacity;			// size of <entries> array
+  int size;			// number of entries
   int rootNum, rootGen;		// catalog dict
   GBool ok;			// true if xref table is valid
   int errCode;			// error code (if <ok> is false)
@@ -156,6 +157,8 @@ private:
   GBool ownerPasswordOk;	// true if owner password is correct
 
   void init();
+  int reserve(int newSize);
+  int resize(int newSize);
   Guint getStartXref();
   GBool readXRef(Guint *pos);
   GBool readXRefTable(Parser *parser, Guint *pos);
-- 
1.6.4.2


From fa5a0beb6f86a6cea7ba98ef5ef3c04a53b7319d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 28 Apr 2010 12:45:42 +0200
Subject: [PATCH 03/12] Use XRef::add() in XRef::addIndirectObject()

---
 poppler/XRef.cc |    4 +---
 1 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 49ff809..012f91c 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -1127,10 +1127,8 @@ Ref XRef::addIndirectObject (Object* o) {
   XRefEntry *e;
   if (entryIndexToUse == -1) {
     entryIndexToUse = size;
-    size++;
-    entries = (XRefEntry *)greallocn(entries, size, sizeof(XRefEntry));
+    add(entryIndexToUse, 0, 0, gFalse);
     e = &entries[entryIndexToUse];
-    e->gen = 0;
   } else {
     //reuse a free entry
     e = &entries[entryIndexToUse];
-- 
1.6.4.2


From 7ce2bb1aac50315145ebc21ae34b5c37f00e0c35 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 14 Apr 2010 12:20:49 +0200
Subject: [PATCH 04/12] Use XRef::getEntry() to access entries

---
 poppler/XRef.cc |   49 +++++++++++++++++++++++++------------------------
 poppler/XRef.h  |    2 +-
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 012f91c..d615ec0 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -927,7 +927,7 @@ Object *XRef::fetch(int num, int gen, Object *obj) {
     goto err;
   }
 
-  e = &entries[num];
+  e = getEntry(num);
   if(!e->obj.isNull ()) { //check for updated object
     obj = e->obj.copy(obj);
     return obj;
@@ -1047,20 +1047,20 @@ GBool XRef::getStreamEnd(Guint streamStart, Guint *streamEnd) {
   return gTrue;
 }
 
-int XRef::getNumEntry(Guint offset) const
+int XRef::getNumEntry(Guint offset)
 {
   if (size > 0)
   {
     int res = 0;
-    Guint resOffset = entries[0].offset;
-    XRefEntry e;
+    Guint resOffset = getEntry(0)->offset;
+    XRefEntry *e;
     for (int i = 1; i < size; ++i)
     {
-      e = entries[i];
-      if (e.offset < offset && e.offset >= resOffset)
+      e = getEntry(i);
+      if (e->offset < offset && e->offset >= resOffset)
       {
         res = i;
-        resOffset = e.offset;
+        resOffset = e->offset;
       }
     }
     return res;
@@ -1095,7 +1095,7 @@ void XRef::add(int num, int gen, Guint offs, GBool used) {
     }
     size = num + 1;
   }
-  XRefEntry *e = &entries[num];
+  XRefEntry *e = getEntry(num);
   e->gen = gen;
   e->obj.initNull ();
   e->updated = false;
@@ -1113,25 +1113,26 @@ void XRef::setModifiedObject (Object* o, Ref r) {
     error(-1,"XRef::setModifiedObject on unknown ref: %i, %i\n", r.num, r.gen);
     return;
   }
-  entries[r.num].obj.free();
-  o->copy(&entries[r.num].obj);
-  entries[r.num].updated = true;
+  XRefEntry *e = getEntry(r.num);
+  e->obj.free();
+  o->copy(&(e->obj));
+  e->updated = true;
 }
 
 Ref XRef::addIndirectObject (Object* o) {
   int entryIndexToUse = -1;
   for (int i = 1; entryIndexToUse == -1 && i < size; ++i) {
-    if (entries[i].type == xrefEntryFree) entryIndexToUse = i;
+    if (getEntry(i)->type == xrefEntryFree) entryIndexToUse = i;
   }
 
   XRefEntry *e;
   if (entryIndexToUse == -1) {
     entryIndexToUse = size;
     add(entryIndexToUse, 0, 0, gFalse);
-    e = &entries[entryIndexToUse];
+    e = getEntry(entryIndexToUse);
   } else {
     //reuse a free entry
-    e = &entries[entryIndexToUse];
+    e = getEntry(entryIndexToUse);
     //we don't touch gen number, because it should have been 
     //incremented when the object was deleted
   }
@@ -1147,13 +1148,13 @@ Ref XRef::addIndirectObject (Object* o) {
 
 void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
   //create free entries linked-list
-  if (entries[0].gen != 65535) {
+  if (getEntry(0)->gen != 65535) {
     error(-1, "XRef::writeToFile, entry 0 of the XRef is invalid (gen != 65535)\n");
   }
   int lastFreeEntry = 0;
   for (int i=0; i<size; i++) {
-    if (entries[i].type == xrefEntryFree) {
-      entries[lastFreeEntry].offset = i;
+    if (getEntry(i)->type == xrefEntryFree) {
+      getEntry(lastFreeEntry)->offset = i;
       lastFreeEntry = i;
     }
   }
@@ -1163,10 +1164,10 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
     outStr->printf("xref\r\n");
     outStr->printf("%i %i\r\n", 0, size);
     for (int i=0; i<size; i++) {
-      XRefEntry &e = entries[i];
+      XRefEntry *e = getEntry(i);
 
-      if(e.gen > 65535) e.gen = 65535; //cap generation number to 65535 (required by PDFReference)
-      outStr->printf("%010i %05i %c\r\n", e.offset, e.gen, (e.type==xrefEntryFree)?'f':'n');
+      if(e->gen > 65535) e->gen = 65535; //cap generation number to 65535 (required by PDFReference)
+      outStr->printf("%010i %05i %c\r\n", e->offset, e->gen, (e->type==xrefEntryFree)?'f':'n');
     }
   } else {
     //write the new xref
@@ -1175,16 +1176,16 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
     while (i < size) {
       int j;
       for(j=i; j<size; j++) { //look for consecutive entries
-        if ((entries[j].type == xrefEntryFree) && (entries[j].gen == 0))
+        if ((getEntry(j)->type == xrefEntryFree) && (getEntry(j)->gen == 0))
           break;
       }
       if (j-i != 0)
       {
         outStr->printf("%i %i\r\n", i, j-i);
         for (int k=i; k<j; k++) {
-          XRefEntry &e = entries[k];
-          if(e.gen > 65535) e.gen = 65535; //cap generation number to 65535 (required by PDFReference)
-          outStr->printf("%010i %05i %c\r\n", e.offset, e.gen, (e.type==xrefEntryFree)?'f':'n');
+          XRefEntry *e = getEntry(k);
+          if(e->gen > 65535) e->gen = 65535; //cap generation number to 65535 (required by PDFReference)
+          outStr->printf("%010i %05i %c\r\n", e->offset, e->gen, (e->type==xrefEntryFree)?'f':'n');
         }
         i = j;
       }
diff --git a/poppler/XRef.h b/poppler/XRef.h
index f86e5ee..344b764 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -117,7 +117,7 @@ public:
   GBool getStreamEnd(Guint streamStart, Guint *streamEnd);
 
   // Retuns the entry that belongs to the offset
-  int getNumEntry(Guint offset) const;
+  int getNumEntry(Guint offset);
 
   // Direct access.
   int getSize() { return size; }
-- 
1.6.4.2


From d1849ee55fb37f20db86e7a5cf2f44e63478cd66 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 15 Apr 2010 17:34:13 +0200
Subject: [PATCH 05/12] Read XRef table sections on demand

---
 poppler/XRef.cc |   37 ++++++++++++++++++++++++++++++++-----
 poppler/XRef.h  |    6 ++++--
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index d615ec0..a0c77fc 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -242,7 +242,6 @@ XRef::XRef() {
 }
 
 XRef::XRef(BaseStream *strA) {
-  Guint pos;
   Object obj;
 
   init();
@@ -254,11 +253,11 @@ XRef::XRef(BaseStream *strA) {
   // read the trailer
   str = strA;
   start = str->getStart();
-  pos = getStartXref();
+  prevXRefOffset = pos;
 
   // if there was a problem with the 'startxref' position, try to
   // reconstruct the xref table
-  if (pos == 0) {
+  if (prevXRefOffset == 0) {
     if (!(ok = constructXRef())) {
       errCode = errDamaged;
       return;
@@ -266,7 +265,7 @@ XRef::XRef(BaseStream *strA) {
 
   // read the xref table
   } else {
-    while (readXRef(&pos)) ;
+     readXRef(&prevXRefOffset);
 
     // if there was a problem with the xref table,
     // try to reconstruct it
@@ -278,6 +277,18 @@ XRef::XRef(BaseStream *strA) {
     }
   }
 
+  // set size according to trailer dict
+  trailerDict.dictLookupNF("Size", &obj);
+  if (obj.isInt() && (resize(obj.getInt()) == obj.getInt())) {
+    obj.free();
+  } else {
+    obj.free();
+    if (!(ok = constructXRef())) {
+      errCode = errDamaged;
+      return;
+    }
+  }
+
   // get the root dictionary (catalog) object
   trailerDict.dictLookupNF("Root", &obj);
   if (obj.isRef()) {
@@ -340,7 +351,7 @@ int XRef::resize(int newSize)
 
     for (int i = size; i < newSize; ++i) {
       entries[i].offset = 0xffffffff;
-      entries[i].type = xrefEntryFree;
+      entries[i].type = xrefEntryNone;
       entries[i].obj.initNull ();
       entries[i].updated = false;
       entries[i].gen = 0;
@@ -1194,3 +1205,19 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
   }
 }
 
+XRefEntry *XRef::getEntry(int i)
+{
+  if (entries[i].type == xrefEntryNone) {
+
+    while (readXRef(&prevXRefOffset) && (entries[i].type == xrefEntryNone)) ;
+
+    if (entries[i].type == xrefEntryNone) {
+       error(-1, "Invalid XRef entry");
+       entries[i].type = xrefEntryFree;
+    }
+  }
+
+  return &entries[i];
+}
+
+
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 344b764..a013e5a 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -45,7 +45,8 @@ class ObjectStream;
 enum XRefEntryType {
   xrefEntryFree,
   xrefEntryUncompressed,
-  xrefEntryCompressed
+  xrefEntryCompressed,
+  xrefEntryNone
 };
 
 struct XRefEntry {
@@ -121,7 +122,7 @@ public:
 
   // Direct access.
   int getSize() { return size; }
-  XRefEntry *getEntry(int i) { return &entries[i]; }
+  XRefEntry *getEntry(int i);
   Object *getTrailerDict() { return &trailerDict; }
 
   // Write access
@@ -155,6 +156,7 @@ private:
   int permFlags;		// permission bits
   Guchar fileKey[16];		// file decryption key
   GBool ownerPasswordOk;	// true if owner password is correct
+  Guint prevXRefOffset;		// position of prev XRef section (= next to read)
 
   void init();
   int reserve(int newSize);
-- 
1.6.4.2


From 6932300b90334cc2c43a0dd3fe7f7e5da13f389c Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 18:26:17 +0100
Subject: [PATCH 06/12] Add Linearization dictionary support

---
 CMakeLists.txt           |    2 +
 poppler/Linearization.cc |  225 ++++++++++++++++++++++++++++++++++++++++++++++
 poppler/Linearization.h  |   45 +++++++++
 poppler/Makefile.am      |    2 +
 poppler/PDFDoc.cc        |   13 +++
 poppler/PDFDoc.h         |    5 +
 6 files changed, 292 insertions(+), 0 deletions(-)
 create mode 100644 poppler/Linearization.cc
 create mode 100644 poppler/Linearization.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1eba1fe..a119a6d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -249,6 +249,7 @@ set(poppler_SRCS
   poppler/JBIG2Stream.cc
   poppler/Lexer.cc
   poppler/Link.cc
+  poppler/Linearization.cc
   poppler/LocalPDFDocBuilder.cc
   poppler/NameToCharCode.cc
   poppler/Object.cc
@@ -394,6 +395,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/JBIG2Stream.h
     poppler/Lexer.h
     poppler/Link.h
+    poppler/Linearization.h
     poppler/LocalPDFDocBuilder.h
     poppler/Movie.h
     poppler/NameToCharCode.h
diff --git a/poppler/Linearization.cc b/poppler/Linearization.cc
new file mode 100644
index 0000000..d63e5ad
--- /dev/null
+++ b/poppler/Linearization.cc
@@ -0,0 +1,225 @@
+//========================================================================
+//
+// Linearization.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#include "Linearization.h"
+#include "Parser.h"
+#include "Lexer.h"
+
+//------------------------------------------------------------------------
+// Linearization
+//------------------------------------------------------------------------
+
+Linearization::Linearization (BaseStream *str)
+{
+  Parser *parser;
+  Object obj1, obj2, obj3, obj4, obj5;
+
+  linDict.initNull();
+
+  str->reset();
+  obj1.initNull();
+  parser = new Parser(NULL,
+      new Lexer(NULL, str->makeSubStream(str->getStart(), gFalse, 0, &obj1)),
+      gTrue);
+  parser->getObj(&obj1);
+  parser->getObj(&obj2);
+  parser->getObj(&obj3);
+  parser->getObj(&linDict);
+  parser->getObj(&obj4);
+  if (obj1.isInt() && obj2.isInt() && obj3.isCmd("obj") && linDict.isDict()) {
+    linDict.dictLookup("Linearized", &obj5);
+    if (!(obj5.isNum() && obj5.getNum() > 0)) {
+       linDict.free();
+       linDict.initNull();
+    }
+    obj5.free();
+  }
+  obj4.free();
+  obj4.free();
+  obj3.free();
+  obj2.free();
+  obj1.free();
+  delete parser;
+}
+
+Linearization:: ~Linearization()
+{
+  linDict.free();
+}
+
+Guint Linearization::getLength()
+{
+  if (!linDict.isDict()) return 0;
+
+  int length;
+  if (linDict.getDict()->lookupInt("L", NULL, &length) &&
+      length > 0) {
+    return length;
+  } else {
+    error(-1, "Length in linearization table is invalid");
+    return 0;
+  }
+}
+
+Guint Linearization::getHintsOffset()
+{
+  int hintsOffset;
+
+  Object obj1, obj2;
+  if (linDict.isDict() &&
+      linDict.dictLookup("H", &obj1)->isArray() &&
+      obj1.arrayGetLength()>=2 &&
+      obj1.arrayGet(0, &obj2)->isInt() &&
+      obj2.getInt() > 0) {
+    hintsOffset = obj2.getInt();
+  } else {
+    error(-1, "Hints table offset in linearization table is invalid");
+    hintsOffset = 0;
+  }
+  obj2.free();
+  obj1.free();
+
+  return hintsOffset;
+}
+
+Guint Linearization::getHintsLength()
+{
+  int hintsLength;
+
+  Object obj1, obj2;
+  if (linDict.isDict() &&
+      linDict.dictLookup("H", &obj1)->isArray() &&
+      obj1.arrayGetLength()>=2 &&
+      obj1.arrayGet(1, &obj2)->isInt() &&
+      obj2.getInt() > 0) {
+    hintsLength = obj2.getInt();
+  } else {
+    error(-1, "Hints table length in linearization table is invalid");
+    hintsLength = 0;
+  }
+  obj2.free();
+  obj1.free();
+
+  return hintsLength;
+}
+
+Guint Linearization::getHintsOffset2()
+{
+  int hintsOffset2 = 0; // default to 0
+
+  Object obj1, obj2;
+  if (linDict.isDict() &&
+      linDict.dictLookup("H", &obj1)->isArray() &&
+      obj1.arrayGetLength()>=4) {
+    if (obj1.arrayGet(2, &obj2)->isInt() &&
+        obj2.getInt() > 0) {
+      hintsOffset2 = obj2.getInt();
+    } else {
+      error(-1, "Second hints table offset in linearization table is invalid");
+      hintsOffset2 = 0;
+    }
+  }
+  obj2.free();
+  obj1.free();
+
+  return hintsOffset2;
+}
+
+Guint Linearization::getHintsLength2()
+{
+  int hintsLength2 = 0; // default to 0
+
+  Object obj1, obj2;
+  if (linDict.isDict() &&
+      linDict.dictLookup("H", &obj1)->isArray() &&
+      obj1.arrayGetLength()>=4) {
+    if (obj1.arrayGet(3, &obj2)->isInt() &&
+        obj2.getInt() > 0) {
+      hintsLength2 = obj2.getInt();
+    } else {
+      error(-1, "Second hints table length in linearization table is invalid");
+      hintsLength2 = 0;
+    }
+  }
+  obj2.free();
+  obj1.free();
+
+  return hintsLength2;
+}
+
+int Linearization::getObjectNumberFirst()
+{
+  int objectNumberFirst = 0;
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("O", NULL, &objectNumberFirst) &&
+      objectNumberFirst > 0) {
+    return objectNumberFirst;
+  } else {
+    error(-1, "Object number of first page in linearization table is invalid");
+    return 0;
+  }
+}
+
+Guint Linearization::getEndFirst()
+{
+  int pageEndFirst = 0;
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("E", NULL, &pageEndFirst) &&
+      pageEndFirst > 0) {
+    return pageEndFirst;
+  } else {
+    error(-1, "First page end offset in linearization table is invalid");
+    return 0;
+  }
+}
+
+int Linearization::getNumPages()
+{
+  int numPages = 0;
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("N", NULL, &numPages) &&
+      numPages > 0) {
+    return numPages;
+  } else {
+    error(-1, "Page count in linearization table is invalid");
+    return 0;
+  }
+}
+
+Guint Linearization::getMainXRefEntriesOffset()
+{
+  int mainXRefEntriesOffset = 0;
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("T", NULL, &mainXRefEntriesOffset) &&
+      mainXRefEntriesOffset > 0) {
+    return mainXRefEntriesOffset;
+  } else {
+    error(-1, "Main Xref offset in linearization table is invalid");
+    return 0;
+  }
+}
+
+int Linearization::getPageFirst()
+{
+  int pageFirst = 0; // Optional, defaults to 0.
+
+  if (linDict.isDict()) {
+    linDict.getDict()->lookupInt("P", NULL, &pageFirst);
+  }
+
+  if (pageFirst < 0) {
+    error(-1, "First page in linearization table is invalid");
+    return 0;
+  }
+
+  return pageFirst;
+}
+
+
diff --git a/poppler/Linearization.h b/poppler/Linearization.h
new file mode 100644
index 0000000..6728a75
--- /dev/null
+++ b/poppler/Linearization.h
@@ -0,0 +1,45 @@
+//========================================================================
+//
+// Linearization.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#ifndef LINEARIZATION_H
+#define LINEARIZATION_H
+
+#include "goo/gtypes.h"
+#include "Object.h"
+class BaseStream;
+
+//------------------------------------------------------------------------
+// Linearization
+//------------------------------------------------------------------------
+
+class Linearization {
+public:
+
+  Linearization(BaseStream *str);
+  ~Linearization();
+
+  Guint getLength();
+  Guint getHintsOffset();
+  Guint getHintsLength();
+  Guint getHintsOffset2();
+  Guint getHintsLength2();
+  int getObjectNumberFirst();
+  Guint getEndFirst();
+  int getNumPages();
+  Guint getMainXRefEntriesOffset();
+  int getPageFirst();
+
+private:
+
+  Object linDict;
+
+};
+
+#endif
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 5dd8082..8c1e019 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -209,6 +209,7 @@ poppler_include_HEADERS =	\
 	JArithmeticDecoder.h	\
 	JBIG2Stream.h		\
 	Lexer.h			\
+	Linearization.h 	\
 	Link.h			\
 	LocalPDFDocBuilder.h	\
 	Movie.h                 \
@@ -287,6 +288,7 @@ libpoppler_la_SOURCES =		\
 	JArithmeticDecoder.cc	\
 	JBIG2Stream.cc		\
 	Lexer.cc 		\
+	Linearization.cc 	\
 	Link.cc 		\
 	LocalPDFDocBuilder.cc	\
 	Movie.cc                \
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 2d1477d..fe568a0 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -52,6 +52,7 @@
 #include "Catalog.h"
 #include "Stream.h"
 #include "XRef.h"
+#include "Linearization.h"
 #include "Link.h"
 #include "OutputDev.h"
 #include "Error.h"
@@ -82,6 +83,7 @@ void PDFDoc::init()
   file = NULL;
   str = NULL;
   xref = NULL;
+  linearization = NULL;
   catalog = NULL;
 #ifndef DISABLE_OUTLINE
   outline = NULL;
@@ -242,6 +244,9 @@ PDFDoc::~PDFDoc() {
   if (xref) {
     delete xref;
   }
+  if (linearization) {
+    delete linearization;
+  }
   if (str) {
     delete str;
   }
@@ -412,6 +417,14 @@ void PDFDoc::processLinks(OutputDev *out, int page) {
   catalog->getPage(page)->processLinks(out, catalog);
 }
 
+Linearization *PDFDoc::getLinearization()
+{
+  if (!linearization) {
+    linearization = new Linearization(str);
+  }
+  return linearization;
+}
+
 GBool PDFDoc::isLinearized() {
   Parser *parser;
   Object obj1, obj2, obj3, obj4, obj5;
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 6d7dea2..011f4c0 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -48,6 +48,7 @@ class Links;
 class LinkAction;
 class LinkDest;
 class Outline;
+class Linearization;
 
 enum PDFWriteMode {
   writeStandard,
@@ -89,6 +90,9 @@ public:
   // Get file name.
   GooString *getFileName() { return fileName; }
 
+  // Get the linearization table.
+  Linearization *getLinearization();
+
   // Get the xref table.
   XRef *getXRef() { return xref; }
 
@@ -242,6 +246,7 @@ private:
   void *guiData;
   int pdfMajorVersion;
   int pdfMinorVersion;
+  Linearization *linearization;
   XRef *xref;
   Catalog *catalog;
 #ifndef DISABLE_OUTLINE
-- 
1.6.4.2


From b5c01257baded49e3b9ef7b57a66221ed4df36c8 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 13 Apr 2010 18:51:40 +0200
Subject: [PATCH 07/12] Add getLength() to BaseStream

---
 poppler/Stream.cc |   11 ++++++-----
 poppler/Stream.h  |   11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/poppler/Stream.cc b/poppler/Stream.cc
index 0771e25..f4f9351 100644
--- a/poppler/Stream.cc
+++ b/poppler/Stream.cc
@@ -363,8 +363,9 @@ void FileOutStream::printf(const char *format, ...)
 // BaseStream
 //------------------------------------------------------------------------
 
-BaseStream::BaseStream(Object *dictA) {
+BaseStream::BaseStream(Object *dictA, Guint lengthA) {
   dict = *dictA;
+  length = lengthA;
 }
 
 BaseStream::~BaseStream() {
@@ -677,7 +678,7 @@ GBool StreamPredictor::getNextLine() {
 
 FileStream::FileStream(FILE *fA, Guint startA, GBool limitedA,
 		       Guint lengthA, Object *dictA):
-    BaseStream(dictA) {
+    BaseStream(dictA, lengthA) {
   f = fA;
   start = startA;
   limited = limitedA;
@@ -802,7 +803,7 @@ void FileStream::moveStart(int delta) {
 
 CachedFileStream::CachedFileStream(CachedFile *ccA, Guint startA,
         GBool limitedA, Guint lengthA, Object *dictA)
-  : BaseStream(dictA)
+  : BaseStream(dictA, lengthA)
 {
   cc = ccA;
   start = startA;
@@ -900,7 +901,7 @@ void CachedFileStream::moveStart(int delta)
 //------------------------------------------------------------------------
 
 MemStream::MemStream(char *bufA, Guint startA, Guint lengthA, Object *dictA):
-    BaseStream(dictA) {
+    BaseStream(dictA, lengthA) {
   buf = bufA;
   start = startA;
   length = lengthA;
@@ -964,7 +965,7 @@ void MemStream::moveStart(int delta) {
 
 EmbedStream::EmbedStream(Stream *strA, Object *dictA,
 			 GBool limitedA, Guint lengthA):
-    BaseStream(dictA) {
+    BaseStream(dictA, lengthA) {
   str = strA;
   limited = limitedA;
   length = lengthA;
diff --git a/poppler/Stream.h b/poppler/Stream.h
index 49ae8fb..6896d20 100644
--- a/poppler/Stream.h
+++ b/poppler/Stream.h
@@ -240,7 +240,7 @@ private:
 class BaseStream: public Stream {
 public:
 
-  BaseStream(Object *dictA);
+  BaseStream(Object *dictA, Guint lengthA);
   virtual ~BaseStream();
   virtual Stream *makeSubStream(Guint start, GBool limited,
 				Guint length, Object *dict) = 0;
@@ -250,11 +250,16 @@ public:
   virtual Stream *getUndecodedStream() { return this; }
   virtual Dict *getDict() { return dict.getDict(); }
   virtual GooString *getFileName() { return NULL; }
+  virtual Guint getLength() { return length; }
 
   // Get/set position of first byte of stream within the file.
   virtual Guint getStart() = 0;
   virtual void moveStart(int delta) = 0;
 
+protected:
+
+  Guint length;
+
 private:
 
   Object dict;
@@ -401,7 +406,6 @@ private:
   FILE *f;
   Guint start;
   GBool limited;
-  Guint length;
   char buf[fileStreamBufSize];
   char *bufPtr;
   char *bufEnd;
@@ -446,7 +450,6 @@ private:
   CachedFile *cc;
   Guint start;
   GBool limited;
-  Guint length;
   char buf[cachedStreamBufSize];
   char *bufPtr;
   char *bufEnd;
@@ -490,7 +493,6 @@ private:
 
   char *buf;
   Guint start;
-  Guint length;
   char *bufEnd;
   char *bufPtr;
   GBool needFree;
@@ -530,7 +532,6 @@ private:
 
   Stream *str;
   GBool limited;
-  Guint length;
 };
 
 //------------------------------------------------------------------------
-- 
1.6.4.2


From 3c4f384d782db743f9f1d88834939ac717ad6a32 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 19:16:14 +0100
Subject: [PATCH 08/12] Pass size of file when creating FileStream

---
 poppler/PDFDoc.cc |   19 +++++++++++++++++--
 1 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index fe568a0..0a018fd 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -44,6 +44,7 @@
 #ifdef _WIN32
 #  include <windows.h>
 #endif
+#include <sys/stat.h>
 #include "goo/gstrtod.h"
 #include "goo/GooString.h"
 #include "poppler-config.h"
@@ -98,12 +99,18 @@ PDFDoc::PDFDoc()
 PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword,
 	       GooString *userPassword, void *guiDataA) {
   Object obj;
+  int size = 0;
 
   init();
 
   fileName = fileNameA;
   guiData = guiDataA;
 
+  struct stat buf;
+  if (stat(fileName->getCString(), &buf) == 0) {
+     size = buf.st_size;
+  }
+
   // try to open file
 #ifdef VMS
   file = fopen(fileName->getCString(), "rb", "ctx=stm");
@@ -123,7 +130,7 @@ PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword,
 
   // create stream
   obj.initNull();
-  str = new FileStream(file, 0, gFalse, 0, &obj);
+  str = new FileStream(file, 0, gFalse, size, &obj);
 
   ok = setup(ownerPassword, userPassword);
 }
@@ -154,11 +161,19 @@ PDFDoc::PDFDoc(wchar_t *fileNameA, int fileNameLen, GooString *ownerPassword,
 
   // try to open file
   // NB: _wfopen is only available in NT
+  struct stat buf;
+  int size;
   version.dwOSVersionInfoSize = sizeof(version);
   GetVersionEx(&version);
   if (version.dwPlatformId == VER_PLATFORM_WIN32_NT) {
+    if (_wstat(fileName2, &buf) == 0) {
+      size = buf.st_size;
+    }
     file = _wfopen(fileName2, L"rb");
   } else {
+    if (_wstat(fileName->getCString(), &buf) == 0) {
+      size = buf.st_size;
+    }
     file = fopen(fileName->getCString(), "rb");
   }
   if (!file) {
@@ -169,7 +184,7 @@ PDFDoc::PDFDoc(wchar_t *fileNameA, int fileNameLen, GooString *ownerPassword,
 
   // create stream
   obj.initNull();
-  str = new FileStream(file, 0, gFalse, 0, &obj);
+  str = new FileStream(file, 0, gFalse, size, &obj);
 
   ok = setup(ownerPassword, userPassword);
 }
-- 
1.6.4.2


From 0a1e5029769e3096d9f05cb586dac56e13d4a6f5 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 19:32:59 +0100
Subject: [PATCH 09/12] Improve linearization check

---
 poppler/PDFDoc.cc |   33 +++++----------------------------
 1 files changed, 5 insertions(+), 28 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 0a018fd..35f5cc9 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -441,34 +441,11 @@ Linearization *PDFDoc::getLinearization()
 }
 
 GBool PDFDoc::isLinearized() {
-  Parser *parser;
-  Object obj1, obj2, obj3, obj4, obj5;
-  GBool lin;
-
-  lin = gFalse;
-  obj1.initNull();
-  parser = new Parser(xref,
-	     new Lexer(xref,
-	       str->makeSubStream(str->getStart(), gFalse, 0, &obj1)),
-	     gTrue);
-  parser->getObj(&obj1);
-  parser->getObj(&obj2);
-  parser->getObj(&obj3);
-  parser->getObj(&obj4);
-  if (obj1.isInt() && obj2.isInt() && obj3.isCmd("obj") &&
-      obj4.isDict()) {
-    obj4.dictLookup("Linearized", &obj5);
-    if (obj5.isNum() && obj5.getNum() > 0) {
-      lin = gTrue;
-    }
-    obj5.free();
-  }
-  obj4.free();
-  obj3.free();
-  obj2.free();
-  obj1.free();
-  delete parser;
-  return lin;
+  if ((str->getLength()) &&
+      (getLinearization()->getLength() == str->getLength()))
+    return gTrue;
+  else
+    return gFalse;
 }
 
 int PDFDoc::saveAs(GooString *name, PDFWriteMode mode) {
-- 
1.6.4.2


From 3827314e6450b74eff2acd1e01db48eb144aa76d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 7 Apr 2010 12:05:56 +0200
Subject: [PATCH 10/12] Move getStartXref from XRef to PDFDoc

---
 poppler/PDFDoc.cc |   61 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 poppler/PDFDoc.h  |    5 ++++
 poppler/XRef.cc   |   50 +------------------------------------------
 poppler/XRef.h    |    6 +----
 4 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 35f5cc9..6c4159a 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -34,6 +34,7 @@
 #pragma implementation
 #endif
 
+#include <ctype.h>
 #include <locale.h>
 #include <stdio.h>
 #include <errno.h>
@@ -72,6 +73,9 @@
 #define headerSearchSize 1024	// read this many bytes at beginning of
 				//   file to look for '%PDF'
 
+#define xrefSearchSize 1024	// read this many bytes at end of file
+				//   to look for 'startxref'
+
 //------------------------------------------------------------------------
 // PDFDoc
 //------------------------------------------------------------------------
@@ -89,6 +93,7 @@ void PDFDoc::init()
 #ifndef DISABLE_OUTLINE
   outline = NULL;
 #endif
+  startXRefPos = ~(Guint)0;
 }
 
 PDFDoc::PDFDoc()
@@ -222,7 +227,7 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
   checkHeader();
 
   // read xref table
-  xref = new XRef(str);
+  xref = new XRef(str, getStartXRef());
   if (!xref->isOk()) {
     error(-1, "Couldn't read xref table");
     errCode = xref->getErrorCode();
@@ -889,7 +894,7 @@ void PDFDoc::writeTrailer (Guint uxrefOffset, int uxrefSize, OutStream* outStr,
   trailerDict->set("Root", &obj1);
 
   if (incrUpdate) { 
-    obj1.initInt(xref->getLastXRefPos());
+    obj1.initInt(getStartXRef());
     trailerDict->set("Prev", &obj1);
   }
   
@@ -927,3 +932,55 @@ PDFDoc *PDFDoc::ErrorPDFDoc(int errorCode, GooString *fileNameA)
 
   return doc;
 }
+
+Guint PDFDoc::strToUnsigned(char *s) {
+  Guint x;
+  char *p;
+  int i;
+
+  x = 0;
+  for (p = s, i = 0; *p && isdigit(*p) && i < 10; ++p, ++i) {
+    x = 10 * x + (*p - '0');
+  }
+  return x;
+}
+
+// Read the 'startxref' position.
+Guint PDFDoc::getStartXRef()
+{
+  if (startXRefPos == ~(Guint)0) {
+
+    {
+      char buf[xrefSearchSize+1];
+      char *p;
+      int c, n, i;
+
+      // read last xrefSearchSize bytes
+      str->setPos(xrefSearchSize, -1);
+      for (n = 0; n < xrefSearchSize; ++n) {
+        if ((c = str->getChar()) == EOF) {
+          break;
+        }
+        buf[n] = c;
+      }
+      buf[n] = '\0';
+
+      // find startxref
+      for (i = n - 9; i >= 0; --i) {
+        if (!strncmp(&buf[i], "startxref", 9)) {
+          break;
+        }
+      }
+      if (i < 0) {
+        startXRefPos = 0;
+      }
+      for (p = &buf[i+9]; isspace(*p); ++p) ;
+      startXRefPos =  strToUnsigned(p);
+    }
+
+  }
+
+  return startXRefPos;
+}
+
+
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 011f4c0..d093b59 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -239,6 +239,9 @@ private:
   GBool checkFooter();
   void checkHeader();
   GBool checkEncryption(GooString *ownerPassword, GooString *userPassword);
+  // Get the offset of the start xref table.
+  Guint getStartXRef();
+  Guint strToUnsigned(char *s);
 
   GooString *fileName;
   FILE *file;
@@ -258,6 +261,8 @@ private:
   //If there is an error opening the PDF file with fopen() in the constructor, 
   //then the POSIX errno will be here.
   int fopenErrno;
+
+  Guint startXRefPos;		// offset of last xref table
 };
 
 #endif
diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index a0c77fc..b69bf9a 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -47,11 +47,6 @@
 #include "XRef.h"
 
 //------------------------------------------------------------------------
-
-#define xrefSearchSize 1024	// read this many bytes at end of file
-				//   to look for 'startxref'
-
-//------------------------------------------------------------------------
 // Permission bits
 // Note that the PDF spec uses 1 base (eg bit 3 is 1<<2)
 //------------------------------------------------------------------------
@@ -241,7 +236,7 @@ XRef::XRef() {
   init();
 }
 
-XRef::XRef(BaseStream *strA) {
+XRef::XRef(BaseStream *strA, Guint pos) {
   Object obj;
 
   init();
@@ -367,37 +362,6 @@ int XRef::resize(int newSize)
   return size;
 }
 
-// Read the 'startxref' position.
-Guint XRef::getStartXref() {
-  char buf[xrefSearchSize+1];
-  char *p;
-  int c, n, i;
-
-  // read last xrefSearchSize bytes
-  str->setPos(xrefSearchSize, -1);
-  for (n = 0; n < xrefSearchSize; ++n) {
-    if ((c = str->getChar()) == EOF) {
-      break;
-    }
-    buf[n] = c;
-  }
-  buf[n] = '\0';
-
-  // find startxref
-  for (i = n - 9; i >= 0; --i) {
-    if (!strncmp(&buf[i], "startxref", 9)) {
-      break;
-    }
-  }
-  if (i < 0) {
-    return 0;
-  }
-  for (p = &buf[i+9]; isspace(*p); ++p) ;
-  lastXRefPos = strToUnsigned(p);
-
-  return lastXRefPos;
-}
-
 // Read one xref table section.  Also reads the associated trailer
 // dictionary, and returns the prev pointer (if any).
 GBool XRef::readXRef(Guint *pos) {
@@ -1079,18 +1043,6 @@ int XRef::getNumEntry(Guint offset)
   else return -1;
 }
 
-Guint XRef::strToUnsigned(char *s) {
-  Guint x;
-  char *p;
-  int i;
-
-  x = 0;
-  for (p = s, i = 0; *p && isdigit(*p) && i < 10; ++p, ++i) {
-    x = 10 * x + (*p - '0');
-  }
-  return x;
-}
-
 void XRef::add(int num, int gen, Guint offs, GBool used) {
   if (num >= size) {
     if (num >= capacity) {
diff --git a/poppler/XRef.h b/poppler/XRef.h
index a013e5a..7cd8ebe 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -63,7 +63,7 @@ public:
   // Constructor, create an empty XRef, used for PDF writing
   XRef();
   // Constructor.  Read xref table from stream.
-  XRef(BaseStream *strA);
+  XRef(BaseStream *strA, Guint pos);
 
   // Destructor.
   ~XRef();
@@ -106,9 +106,6 @@ public:
   // Return the number of objects in the xref table.
   int getNumObjects() { return size; }
 
-  // Return the offset of the last xref table.
-  Guint getLastXRefPos() { return lastXRefPos; }
-
   // Return the catalog object reference.
   int getRootNum() { return rootNum; }
   int getRootGen() { return rootGen; }
@@ -143,7 +140,6 @@ private:
   GBool ok;			// true if xref table is valid
   int errCode;			// error code (if <ok> is false)
   Object trailerDict;		// trailer dictionary
-  Guint lastXRefPos;		// offset of last xref table
   Guint *streamEnds;		// 'endstream' positions - only used in
 				//   damaged files
   int streamEndsLen;		// number of valid entries in streamEnds
-- 
1.6.4.2


From caa7dcaa19dc3db4297e0615e2045ba66738a21c Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 7 Apr 2010 12:35:05 +0200
Subject: [PATCH 11/12] Use XRef table at start of linearized document

---
 poppler/PDFDoc.cc |   27 ++++++++++++++++++++++++++-
 1 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 6c4159a..e590f3c 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -73,6 +73,10 @@
 #define headerSearchSize 1024	// read this many bytes at beginning of
 				//   file to look for '%PDF'
 
+#define linearizationSearchSize 1024	// read this many bytes at beginning of
+					// file to look for linearization
+					// dictionary
+
 #define xrefSearchSize 1024	// read this many bytes at end of file
 				//   to look for 'startxref'
 
@@ -950,7 +954,28 @@ Guint PDFDoc::getStartXRef()
 {
   if (startXRefPos == ~(Guint)0) {
 
-    {
+    if (isLinearized()) {
+      char buf[linearizationSearchSize+1];
+      int c, n, i;
+
+      str->setPos(0);
+      for (n = 0; n < linearizationSearchSize; ++n) {
+        if ((c = str->getChar()) == EOF) {
+          break;
+        }
+        buf[n] = c;
+      }
+      buf[n] = '\0';
+
+      // find end of first obj
+      startXRefPos = 0;
+      for (i = 0; i < n; i++) {
+        if (!strncmp("endobj", &buf[i], 6)) {
+           startXRefPos = i+6;
+           break;
+        }
+      }
+    } else {
       char buf[xrefSearchSize+1];
       char *p;
       int c, n, i;
-- 
1.6.4.2


From 05da8579632ed1bfecd1d026cd7f6dec96beac43 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Sun, 25 Apr 2010 17:34:49 +0200
Subject: [PATCH 12/12] Use linearization data to parse XRef entries

---
 poppler/PDFDoc.cc |   12 +++++++++++-
 poppler/PDFDoc.h  |    3 +++
 poppler/XRef.cc   |   45 +++++++++++++++++++++++++++++++++++++++++++--
 poppler/XRef.h    |    6 +++++-
 4 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index e590f3c..172f766 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -231,7 +231,7 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
   checkHeader();
 
   // read xref table
-  xref = new XRef(str, getStartXRef());
+  xref = new XRef(str, getStartXRef(), getMainXRefEntriesOffset());
   if (!xref->isOk()) {
     error(-1, "Couldn't read xref table");
     errCode = xref->getErrorCode();
@@ -1008,4 +1008,14 @@ Guint PDFDoc::getStartXRef()
   return startXRefPos;
 }
 
+Guint PDFDoc::getMainXRefEntriesOffset()
+{
+  Guint mainXRefEntriesOffset = 0;
+
+  if (isLinearized()) {
+    mainXRefEntriesOffset = getLinearization()->getMainXRefEntriesOffset();
+  }
+
+  return mainXRefEntriesOffset;
+}
 
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index d093b59..f6f8c8f 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -241,6 +241,9 @@ private:
   GBool checkEncryption(GooString *ownerPassword, GooString *userPassword);
   // Get the offset of the start xref table.
   Guint getStartXRef();
+  // Get the offset of the entries in the main XRef table of a
+  // linearized document (0 for non linearized documents).
+  Guint getMainXRefEntriesOffset();
   Guint strToUnsigned(char *s);
 
   GooString *fileName;
diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index b69bf9a..ab92cd8 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -230,16 +230,19 @@ void XRef::init() {
   streamEnds = NULL;
   streamEndsLen = 0;
   objStr = NULL;
+  mainXRefEntriesOffset = 0;
+  xRefStream = gFalse;
 }
 
 XRef::XRef() {
   init();
 }
 
-XRef::XRef(BaseStream *strA, Guint pos) {
+XRef::XRef(BaseStream *strA, Guint pos, Guint mainXRefEntriesOffsetA) {
   Object obj;
 
   init();
+  mainXRefEntriesOffset = mainXRefEntriesOffsetA;
 
   encrypted = gFalse;
   permFlags = defPermFlags;
@@ -396,6 +399,9 @@ GBool XRef::readXRef(Guint *pos) {
     if (!parser->getObj(&obj)->isStream()) {
       goto err1;
     }
+    if (trailerDict.isNone()) {
+      xRefStream = gTrue;
+    }
     more = readXRefStream(obj.getStream(), pos);
     obj.free();
 
@@ -1157,11 +1163,46 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
   }
 }
 
+GBool XRef::parseEntry(Guint offset, XRefEntry *entry)
+{
+  GBool r;
+
+  Object obj;
+  obj.initNull();
+  Parser parser = Parser(NULL, new Lexer(NULL,
+     str->makeSubStream(offset, gFalse, 20, &obj)), gTrue);
+
+  Object obj1, obj2, obj3;
+  if ((parser.getObj(&obj1)->isInt()) &&
+      (parser.getObj(&obj2)->isInt()) &&
+      (parser.getObj(&obj3)->isCmd("n") || obj3.isCmd("f"))) {
+    entry->offset = (Guint) obj1.getInt();
+    entry->gen = obj2.getInt();
+    entry->type = obj3.isCmd("n") ? xrefEntryUncompressed : xrefEntryFree;
+    entry->obj.initNull ();
+    entry->updated = false;
+    r = gTrue;
+  } else {
+    r = gFalse;
+  }
+  obj1.free();
+  obj2.free();
+  obj3.free();
+
+  return r;
+}
+
 XRefEntry *XRef::getEntry(int i)
 {
   if (entries[i].type == xrefEntryNone) {
 
-    while (readXRef(&prevXRefOffset) && (entries[i].type == xrefEntryNone)) ;
+    if ((!xRefStream) && mainXRefEntriesOffset) {
+      if (!parseEntry(mainXRefEntriesOffset + 20*i, &entries[i])) {
+        error(-1, "Failed to parse XRef entry [%d].", i);
+      }
+    } else {
+      while (readXRef(&prevXRefOffset) && (entries[i].type == xrefEntryNone)) ;
+    }
 
     if (entries[i].type == xrefEntryNone) {
        error(-1, "Invalid XRef entry");
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 7cd8ebe..a4548e6 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -63,7 +63,7 @@ public:
   // Constructor, create an empty XRef, used for PDF writing
   XRef();
   // Constructor.  Read xref table from stream.
-  XRef(BaseStream *strA, Guint pos);
+  XRef(BaseStream *strA, Guint pos, Guint mainXRefEntriesOffsetA = 0);
 
   // Destructor.
   ~XRef();
@@ -153,6 +153,8 @@ private:
   Guchar fileKey[16];		// file decryption key
   GBool ownerPasswordOk;	// true if owner password is correct
   Guint prevXRefOffset;		// position of prev XRef section (= next to read)
+  Guint mainXRefEntriesOffset;	// offset of entries in main XRef table
+  GBool xRefStream;		// true if last XRef section is a stream
 
   void init();
   int reserve(int newSize);
@@ -164,6 +166,8 @@ private:
   GBool readXRefStream(Stream *xrefStr, Guint *pos);
   GBool constructXRef();
   Guint strToUnsigned(char *s);
+  GBool parseEntry(Guint offset, XRefEntry *entry);
+
 };
 
 #endif
-- 
1.6.4.2
-------------- next part --------------
From f5c6549ec7d9ad88c5349a5b70f2f0a1f1b91289 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 19:03:54 +0200
Subject: [PATCH 01/15] add PDFDoc::getPage()

---
 poppler/PDFDoc.cc |    8 ++++++++
 poppler/PDFDoc.h  |    3 +++
 2 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 172f766..b52b280 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -1019,3 +1019,11 @@ Guint PDFDoc::getMainXRefEntriesOffset()
   return mainXRefEntriesOffset;
 }
 
+Page *PDFDoc::getPage(int page)
+{
+  if ((page < 1) || page > getNumPages()) return NULL;
+
+  {
+    return catalog->getPage(page);
+  }
+}
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index f6f8c8f..011e6e1 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -127,6 +127,9 @@ public:
   // Return the structure tree root object.
   Object *getStructTreeRoot() { return catalog->getStructTreeRoot(); }
 
+  // Get page.
+  Page *getPage(int page);
+
   // Display a page.
   void displayPage(OutputDev *out, int page,
 		   double hDPI, double vDPI, int rotate,
-- 
1.6.4.2


From b9a4939ea3b4a6ca56e963a11ab2c3d51b97b232 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 19:36:08 +0200
Subject: [PATCH 02/15] Use PDFDoc::getPage() in PDFDoc

---
 poppler/PDFDoc.cc |   28 +++++++++++++++++++++-------
 poppler/PDFDoc.h  |   10 +++++-----
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index b52b280..89dba6f 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -395,8 +395,11 @@ void PDFDoc::displayPage(OutputDev *out, int page,
   if (globalParams->getPrintCommands()) {
     printf("***** page %d *****\n", page);
   }
-  catalog->getPage(page)->display(out, hDPI, vDPI,
-				  rotate, useMediaBox, crop, printing, catalog,
+
+  Page *p = getPage(page);
+  if (!p) return;
+
+  p->display(out, hDPI, vDPI, rotate, useMediaBox, crop, printing, catalog,
 				  abortCheckCbk, abortCheckCbkData,
 				  annotDisplayDecideCbk, annotDisplayDecideCbkData);
 }
@@ -425,8 +428,11 @@ void PDFDoc::displayPageSlice(OutputDev *out, int page,
 			      void *abortCheckCbkData,
                               GBool (*annotDisplayDecideCbk)(Annot *annot, void *user_data),
                               void *annotDisplayDecideCbkData) {
-  catalog->getPage(page)->displaySlice(out, hDPI, vDPI,
-				       rotate, useMediaBox, crop,
+
+  Page *p = getPage(page);
+  if (!p) return;
+
+  p->displaySlice(out, hDPI, vDPI, rotate, useMediaBox, crop,
 				       sliceX, sliceY, sliceW, sliceH,
 				       printing, catalog,
 				       abortCheckCbk, abortCheckCbkData,
@@ -434,11 +440,19 @@ void PDFDoc::displayPageSlice(OutputDev *out, int page,
 }
 
 Links *PDFDoc::getLinks(int page) {
-  return catalog->getPage(page)->getLinks(catalog);
+  Page *p = getPage(page);
+  if (!p) {
+    Object obj;
+    obj.initNull();
+    return new Links (&obj, NULL);
+  }
+  return p->getLinks(catalog);
 }
-  
+
 void PDFDoc::processLinks(OutputDev *out, int page) {
-  catalog->getPage(page)->processLinks(out, catalog);
+  Page *p = getPage(page);
+  if (!p) return;
+  p->processLinks(out, catalog);
 }
 
 Linearization *PDFDoc::getLinearization()
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 011e6e1..8de139f 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -107,15 +107,15 @@ public:
 
   // Get page parameters.
   double getPageMediaWidth(int page)
-    { return catalog->getPage(page)->getMediaWidth(); }
+    { return getPage(page) ? getPage(page)->getMediaWidth() : 0.0 ; }
   double getPageMediaHeight(int page)
-    { return catalog->getPage(page)->getMediaHeight(); }
+    { return getPage(page) ? getPage(page)->getMediaHeight() : 0.0 ; }
   double getPageCropWidth(int page)
-    { return catalog->getPage(page)->getCropWidth(); }
+    { return getPage(page) ? getPage(page)->getCropWidth() : 0.0 ; }
   double getPageCropHeight(int page)
-    { return catalog->getPage(page)->getCropHeight(); }
+    { return getPage(page) ? getPage(page)->getCropHeight() : 0.0 ; }
   int getPageRotate(int page)
-    { return catalog->getPage(page)->getRotate(); }
+    { return getPage(page) ? getPage(page)->getRotate() : 0 ; }
 
   // Get number of pages.
   int getNumPages() { return catalog->getNumPages(); }
-- 
1.6.4.2


From 3f91306f47ba90719a14edd01eaf214cc4f82249 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 20:48:30 +0200
Subject: [PATCH 03/15] Use PDFDoc::getPage() in FontInfo

---
 poppler/FontInfo.cc |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/poppler/FontInfo.cc b/poppler/FontInfo.cc
index 0037e07..c348d14 100644
--- a/poppler/FontInfo.cc
+++ b/poppler/FontInfo.cc
@@ -70,7 +70,9 @@ GooList *FontInfoScanner::scan(int nPages) {
   }
 
   for (int pg = currentPage; pg < lastPage; ++pg) {
-    page = doc->getCatalog()->getPage(pg);
+    page = doc->getPage(pg);
+    if (!page) continue;
+
     if ((resDict = page->getResourceDict())) {
       scanFonts(resDict, result);
     }
-- 
1.6.4.2


From 09a50f0dd352f4fe6141ef5ef2492e1005f9e891 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 11:11:11 +0200
Subject: [PATCH 04/15] Use PDFDoc::getPage() in pdfinfo

---
 utils/pdfinfo.cc |   22 +++++++++++++++-------
 1 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 2abe8b4..a94e4e8 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -257,7 +257,11 @@ int main(int argc, char *argv[]) {
   if (printBoxes) {
     if (multiPage) {
       for (pg = firstPage; pg <= lastPage; ++pg) {
-	page = doc->getCatalog()->getPage(pg);
+	page = doc->getPage(pg);
+	if (!page) {
+          error(-1, "Failed to print boxes for page %d", pg);
+	  continue;
+	}
 	sprintf(buf, "Page %4d MediaBox: ", pg);
 	printBox(buf, page->getMediaBox());
 	sprintf(buf, "Page %4d CropBox:  ", pg);
@@ -270,12 +274,16 @@ int main(int argc, char *argv[]) {
 	printBox(buf, page->getArtBox());
       }
     } else {
-      page = doc->getCatalog()->getPage(firstPage);
-      printBox("MediaBox:       ", page->getMediaBox());
-      printBox("CropBox:        ", page->getCropBox());
-      printBox("BleedBox:       ", page->getBleedBox());
-      printBox("TrimBox:        ", page->getTrimBox());
-      printBox("ArtBox:         ", page->getArtBox());
+      page = doc->getPage(firstPage);
+      if (!page) {
+        error(-1, "Failed to print boxes for page %d", firstPage);
+      } else {
+        printBox("MediaBox:       ", page->getMediaBox());
+        printBox("CropBox:        ", page->getCropBox());
+        printBox("BleedBox:       ", page->getBleedBox());
+        printBox("TrimBox:        ", page->getTrimBox());
+        printBox("ArtBox:         ", page->getArtBox());
+      }
     }
   }
 
-- 
1.6.4.2


From ffcc4834e076b7830bd57eeddf0b6327f1eabb4c Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 11:19:53 +0200
Subject: [PATCH 05/15] Use PDFDoc::getPage() in pdffonts

---
 utils/pdffonts.cc |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/utils/pdffonts.cc b/utils/pdffonts.cc
index 81b20e4..30e25dc 100644
--- a/utils/pdffonts.cc
+++ b/utils/pdffonts.cc
@@ -166,7 +166,11 @@ int main(int argc, char *argv[]) {
   fonts = NULL;
   fontsLen = fontsSize = 0;
   for (pg = firstPage; pg <= lastPage; ++pg) {
-    page = doc->getCatalog()->getPage(pg);
+    page = doc->getPage(pg);
+    if (!page) {
+      error(-1, "Failed to read fonts from page %d", pg);
+      continue;
+    }
     if ((resDict = page->getResourceDict())) {
       scanFonts(resDict, doc);
     }
-- 
1.6.4.2


From d0799186bfed0f1cbd2f916b12028e89d8ed3d62 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 15:52:20 +0200
Subject: [PATCH 06/15] Use PDFDoc::getPage() in glib

---
 glib/poppler-action.cc   |    4 ++--
 glib/poppler-document.cc |   17 ++++++++++-------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/glib/poppler-action.cc b/glib/poppler-action.cc
index ffc1842..a0f8576 100644
--- a/glib/poppler-action.cc
+++ b/glib/poppler-action.cc
@@ -422,13 +422,13 @@ find_annot_movie_for_action (PopplerDocument *document,
 
     xref->fetch (ref->num, ref->gen, &annotObj);
   } else if (link->hasAnnotTitle ()) {
-    Catalog *catalog = document->doc->getCatalog ();
     Object annots;
     GooString *title = link->getAnnotTitle ();
     int i;
 
     for (i = 1; i <= document->doc->getNumPages (); ++i) {
-      Page *p = catalog->getPage (i);
+      Page *p = document->doc->getPage (i);
+      if (!p) continue;
 
       if (p->getAnnots (&annots)->isArray ()) {
         int j;
diff --git a/glib/poppler-document.cc b/glib/poppler-document.cc
index cd6794a..ccb0f1c 100644
--- a/glib/poppler-document.cc
+++ b/glib/poppler-document.cc
@@ -379,15 +379,14 @@ PopplerPage *
 poppler_document_get_page (PopplerDocument  *document,
 			   int               index)
 {
-  Catalog *catalog;
   Page *page;
 
   g_return_val_if_fail (0 <= index &&
 			index < poppler_document_get_n_pages (document),
 			NULL);
 
-  catalog = document->doc->getCatalog();
-  page = catalog->getPage (index + 1);
+  page = document->doc->getPage (index + 1);
+  if (!page) return NULL;
 
   return _poppler_page_new (document, page, index);
 }
@@ -1909,18 +1908,22 @@ PopplerFormField *
 poppler_document_get_form_field (PopplerDocument *document,
 				 gint             id)
 {
-  Catalog *catalog = document->doc->getCatalog();
+  Page *page;
   unsigned pageNum;
   unsigned fieldNum;
   FormPageWidgets *widgets;
   FormWidget *field;
 
   FormWidget::decodeID (id, &pageNum, &fieldNum);
-  
-  widgets = catalog->getPage (pageNum)->getPageWidgets ();
+
+  page = document->doc->getPage (pageNum);
+  if (!page)
+    return NULL;
+
+  widgets = page->getPageWidgets ();
   if (!widgets)
     return NULL;
-  
+
   field = widgets->getWidget (fieldNum);
   if (field)
     return _poppler_form_field_new (document, field);
-- 
1.6.4.2


From 2f5c14573200e8318fdd6b52d9de01f0d733301b Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 17:59:01 +0200
Subject: [PATCH 07/15] Use PDFDoc::getPage() in qt4

Note API change: With this patch, Document::Page(int index) can now return NULL
when poppler fails to create a page. Any application using these bindings
should check the return value.
---
 qt4/src/poppler-document.cc |    8 +++++++-
 qt4/src/poppler-link.cc     |    6 ++++--
 qt4/src/poppler-page.cc     |    3 ++-
 qt4/src/poppler-qt4.h       |    3 +++
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/qt4/src/poppler-document.cc b/qt4/src/poppler-document.cc
index 41d35b6..dc0ce97 100644
--- a/qt4/src/poppler-document.cc
+++ b/qt4/src/poppler-document.cc
@@ -98,7 +98,13 @@ namespace Poppler {
 
     Page *Document::page(int index) const
     {
-	return new Page(m_doc, index);
+	Page *page = new Page(m_doc, index);
+	if (!page->isOk()) {
+	  delete page;
+	  return NULL;
+	}
+
+	return page;
     }
 
     bool Document::isLocked() const
diff --git a/qt4/src/poppler-link.cc b/qt4/src/poppler-link.cc
index de06242..4f54201 100644
--- a/qt4/src/poppler-link.cc
+++ b/qt4/src/poppler-link.cc
@@ -232,9 +232,11 @@ class LinkMoviePrivate : public LinkPrivate
 		
 		int leftAux = 0, topAux = 0, rightAux = 0, bottomAux = 0;
 		
-		if (d->pageNum > 0 && d->pageNum <= data.doc->doc->getNumPages())
+		::Page *page;
+		if (d->pageNum > 0 &&
+		    d->pageNum <= data.doc->doc->getNumPages() &&
+		    (page = data.doc->doc->getPage( d->pageNum )))
 		{
-			::Page *page = data.doc->doc->getCatalog()->getPage( d->pageNum );
 			cvtUserToDev( page, left, top, &leftAux, &topAux );
 			cvtUserToDev( page, right, bottom, &rightAux, &bottomAux );
 			
diff --git a/qt4/src/poppler-page.cc b/qt4/src/poppler-page.cc
index 6dbf50f..335f2ce 100644
--- a/qt4/src/poppler-page.cc
+++ b/qt4/src/poppler-page.cc
@@ -186,8 +186,9 @@ Page::Page(DocumentData *doc, int index) {
   m_page = new PageData();
   m_page->index = index;
   m_page->parentDoc = doc;
-  m_page->page = doc->doc->getCatalog()->getPage(m_page->index + 1);
+  m_page->page = doc->doc->getPage(m_page->index + 1);
   m_page->transition = 0;
+  ok = m_page->page ? true : false;
 }
 
 Page::~Page()
diff --git a/qt4/src/poppler-qt4.h b/qt4/src/poppler-qt4.h
index 117dc43..2e77f48 100644
--- a/qt4/src/poppler-qt4.h
+++ b/qt4/src/poppler-qt4.h
@@ -587,11 +587,14 @@ delete it;
 	**/
 	QString label() const;
 	
+	bool isOk() { return ok; };
+
     private:
 	Q_DISABLE_COPY(Page)
 
 	Page(DocumentData *doc, int index);
 	PageData *m_page;
+        bool ok;
     };
 
 /**
-- 
1.6.4.2


From 0232472448894471e21f994cadee505be9fe625d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Fri, 23 Apr 2010 09:21:23 +0200
Subject: [PATCH 08/15] Use PDFDoc::getPage() in qt

Note API change: With this patch, Document::getPage(int index) can now
return NULL when poppler fails to create a page. Any application using
these bindings should check the return value.
---
 qt/poppler-document.cc |   11 +++++++++++
 qt/poppler-page.cc     |   11 +++++++----
 qt/poppler-qt.h        |    6 +++++-
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/qt/poppler-document.cc b/qt/poppler-document.cc
index bade1d1..1a5892b 100644
--- a/qt/poppler-document.cc
+++ b/qt/poppler-document.cc
@@ -113,6 +113,17 @@ int Document::getNumPages() const
   return data->doc.getNumPages();
 }
 
+Page *Document::getPage(int index) const
+{
+  Page *p = new Page(this, index);
+  if (!p->isOk()) {
+    delete p;
+    return NULL;
+  }
+
+  return p;
+}
+
 QValueList<FontInfo> Document::fonts() const
 {
   QValueList<FontInfo> ourList;
diff --git a/qt/poppler-page.cc b/qt/poppler-page.cc
index a42aa15..ef077a7 100644
--- a/qt/poppler-page.cc
+++ b/qt/poppler-page.cc
@@ -47,6 +47,7 @@ class PageData {
   const Document *doc;
   int index;
   PageTransition *transition;
+  ::Page *page;
 };
 
 Page::Page(const Document *doc, int index) {
@@ -54,6 +55,8 @@ Page::Page(const Document *doc, int index) {
   data->index = index;
   data->doc = doc;
   data->transition = 0;
+  data->page = doc->data->doc.getPage(data->index + 1);
+  ok = data->page ? true : false;
 }
 
 Page::~Page()
@@ -132,7 +135,7 @@ QString Page::getText(const Rectangle &r) const
   output_dev = new TextOutputDev(0, gFalse, gFalse, gFalse);
   data->doc->data->doc.displayPageSlice(output_dev, data->index + 1, 72, 72,
       0, false, false, false, -1, -1, -1, -1);
-  p = data->doc->data->doc.getCatalog()->getPage(data->index + 1);
+  p = data->page;
   if (r.isNull())
   {
     rect = p->getCropBox();
@@ -197,7 +200,7 @@ PageTransition *Page::getTransition() const
   {
     Object o;
     PageTransitionParams params;
-    params.dictObj = data->doc->data->doc.getCatalog()->getPage(data->index + 1)->getTrans(&o);
+    params.dictObj = data->page->getTrans(&o);
     data->transition = new PageTransition(params);
     o.free();
   }
@@ -208,7 +211,7 @@ QSize Page::pageSize() const
 {
   ::Page *p;
 
-  p = data->doc->data->doc.getCatalog()->getPage(data->index + 1);
+  p = data->page;
   if ( ( Page::Landscape == orientation() ) || (Page::Seascape == orientation() ) ) {
     return QSize( (int)p->getCropHeight(), (int)p->getCropWidth() );
   } else {
@@ -218,7 +221,7 @@ QSize Page::pageSize() const
 
 Page::Orientation Page::orientation() const
 {
-  ::Page *p = data->doc->data->doc.getCatalog()->getPage(data->index + 1);
+  ::Page *p = data->page;
 
   int rotation = p->getRotate();
   switch (rotation) {
diff --git a/qt/poppler-qt.h b/qt/poppler-qt.h
index a6b1e6e..549ffd2 100644
--- a/qt/poppler-qt.h
+++ b/qt/poppler-qt.h
@@ -31,6 +31,7 @@
 #include <qdom.h>
 #include <qpixmap.h>
 
+
 namespace Poppler {
 
 class Document;
@@ -198,9 +199,12 @@ class Page {
     */
     QValueList<Link*> links() const;
 
+    bool isOk() { return ok; };
+
   private:
     Page(const Document *doc, int index);
     PageData *data;
+    bool ok;
 };
 
 class DocumentData;
@@ -219,7 +223,7 @@ public:
   
   static Document *load(const QString & filePath);
   
-  Page *getPage(int index) const{ return new Page(this, index); }
+  Page *getPage(int index) const;
   
   int getNumPages() const;
   
-- 
1.6.4.2


From f856758f24e1124181020d82ba4851230c855608 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Fri, 23 Apr 2010 12:07:39 +0200
Subject: [PATCH 09/15] Use PDFDoc::getPage() in PSOutputDev

---
 glib/poppler-page.cc            |    1 +
 poppler/PSOutputDev.cc          |   37 ++++++++++++++++++++++---------------
 poppler/PSOutputDev.h           |   13 ++++++++-----
 qt/poppler-document.cc          |    2 +-
 qt4/src/poppler-ps-converter.cc |    1 +
 utils/pdftohtml.cc              |    2 +-
 utils/pdftops.cc                |    2 +-
 7 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 39645bd..106b636 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1161,6 +1161,7 @@ poppler_page_render_to_ps (PopplerPage   *page,
 
   if (!ps_file->out)
     ps_file->out = new PSOutputDev (ps_file->filename,
+                                    ps_file->document->doc,
                                     ps_file->document->doc->getXRef(),
                                     ps_file->document->doc->getCatalog(),
                                     NULL,
diff --git a/poppler/PSOutputDev.cc b/poppler/PSOutputDev.cc
index 179a494..5e5d3d0 100644
--- a/poppler/PSOutputDev.cc
+++ b/poppler/PSOutputDev.cc
@@ -70,6 +70,7 @@
 #  include "SplashOutputDev.h"
 #endif
 #include "PSOutputDev.h"
+#include "PDFDoc.h"
 
 #ifdef MACOS
 // needed for setting type/creator of MacOS files
@@ -972,7 +973,7 @@ static void outputToFile(void *stream, char *data, int len) {
   fwrite(data, 1, len, (FILE *)stream);
 }
 
-PSOutputDev::PSOutputDev(const char *fileName, XRef *xrefA, Catalog *catalog,
+PSOutputDev::PSOutputDev(const char *fileName, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 			 char *psTitle,
 			 int firstPage, int lastPage, PSOutMode modeA,
 			 int paperWidthA, int paperHeightA, GBool duplexA,
@@ -1033,13 +1034,14 @@ PSOutputDev::PSOutputDev(const char *fileName, XRef *xrefA, Catalog *catalog,
   }
 
   init(outputToFile, f, fileTypeA, psTitle,
-       xrefA, catalog, firstPage, lastPage, modeA,
+       doc, xrefA, catalog, firstPage, lastPage, modeA,
        imgLLXA, imgLLYA, imgURXA, imgURYA, manualCtrlA,
        paperWidthA, paperHeightA, duplexA);
 }
 
 PSOutputDev::PSOutputDev(PSOutputFunc outputFuncA, void *outputStreamA,
 			 char *psTitle,
+			 PDFDoc *doc,
 			 XRef *xrefA, Catalog *catalog,
 			 int firstPage, int lastPage, PSOutMode modeA,
 			 int paperWidthA, int paperHeightA, GBool duplexA,
@@ -1068,18 +1070,17 @@ PSOutputDev::PSOutputDev(PSOutputFunc outputFuncA, void *outputStreamA,
   forceRasterize = forceRasterizeA;
 
   init(outputFuncA, outputStreamA, psGeneric, psTitle,
-       xrefA, catalog, firstPage, lastPage, modeA,
+       doc, xrefA, catalog, firstPage, lastPage, modeA,
        imgLLXA, imgLLYA, imgURXA, imgURYA, manualCtrlA,
        paperWidthA, paperHeightA, duplexA);
 }
 
 void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
-		       PSFileType fileTypeA, char *pstitle, XRef *xrefA, Catalog *catalog,
+		       PSFileType fileTypeA, char *pstitle, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 		       int firstPage, int lastPage, PSOutMode modeA,
 		       int imgLLXA, int imgLLYA, int imgURXA, int imgURYA,
 		       GBool manualCtrlA, int paperWidthA, int paperHeightA,
 		       GBool duplexA) {
-  Page *page;
   PDFRectangle *box;
 
   // initialize
@@ -1099,12 +1100,12 @@ void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
   imgURX = imgURXA;
   imgURY = imgURYA;
   if (paperWidth < 0 || paperHeight < 0) {
-    // this check is needed in case the document has zero pages
-    if (firstPage > 0 && firstPage <= catalog->getNumPages()) {
-      page = catalog->getPage(firstPage);
+    Page *page;
+    if ((page = doc->getPage(firstPage))) {
       paperWidth = (int)ceil(page->getMediaWidth());
       paperHeight = (int)ceil(page->getMediaHeight());
     } else {
+      error(-1, "Invalid page %d", firstPage);
       paperWidth = 1;
       paperHeight = 1;
     }
@@ -1170,14 +1171,16 @@ void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
   embFontList = new GooString();
 
   if (!manualCtrl) {
+    Page *page;
     // this check is needed in case the document has zero pages
-    if (firstPage > 0 && firstPage <= catalog->getNumPages()) {
+    if ((page = doc->getPage(firstPage))) {
       writeHeader(firstPage, lastPage,
-		  catalog->getPage(firstPage)->getMediaBox(),
-		  catalog->getPage(firstPage)->getCropBox(),
-		  catalog->getPage(firstPage)->getRotate(),
+		  page->getMediaBox(),
+		  page->getCropBox(),
+		  page->getRotate(),
 		  pstitle);
     } else {
+      error(-1, "Invalid page %d", firstPage);
       box = new PDFRectangle(0, 0, 1, 1);
       writeHeader(firstPage, lastPage, box, box, 0, pstitle);
       delete box;
@@ -1190,7 +1193,7 @@ void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
       writePS("%%EndProlog\n");
       writePS("%%BeginSetup\n");
     }
-    writeDocSetup(catalog, firstPage, lastPage, duplexA);
+    writeDocSetup(doc, catalog, firstPage, lastPage, duplexA);
     if (mode != psModeForm) {
       writePS("%%EndSetup\n");
     }
@@ -1400,7 +1403,7 @@ void PSOutputDev::writeXpdfProcset() {
   }
 }
 
-void PSOutputDev::writeDocSetup(Catalog *catalog,
+void PSOutputDev::writeDocSetup(PDFDoc *doc, Catalog *catalog,
 				int firstPage, int lastPage,
                                 GBool duplexA) {
   Page *page;
@@ -1416,7 +1419,11 @@ void PSOutputDev::writeDocSetup(Catalog *catalog,
     writePS("xpdf begin\n");
   }
   for (pg = firstPage; pg <= lastPage; ++pg) {
-    page = catalog->getPage(pg);
+    page = doc->getPage(pg);
+    if (!page) {
+      error(-1, "Failed writing resources for page %d", pg);
+      continue;
+    }
     if ((resDict = page->getResourceDict())) {
       setupResources(resDict);
     }
diff --git a/poppler/PSOutputDev.h b/poppler/PSOutputDev.h
index 38c838c..a84a638 100644
--- a/poppler/PSOutputDev.h
+++ b/poppler/PSOutputDev.h
@@ -50,6 +50,7 @@ struct PSFont8Info;
 struct PSFont16Enc;
 class PSOutCustomColor;
 class Function;
+class PDFDoc;
 
 //------------------------------------------------------------------------
 // PSOutputDev
@@ -75,7 +76,7 @@ class PSOutputDev: public OutputDev {
 public:
 
   // Open a PostScript output file, and write the prolog.
-  PSOutputDev(const char *fileName, XRef *xrefA, Catalog *catalog,
+  PSOutputDev(const char *fileName, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 	      char *psTitle,
 	      int firstPage, int lastPage, PSOutMode modeA,
 	      int paperWidthA = -1, int paperHeightA = -1,
@@ -88,6 +89,7 @@ public:
   // Open a PSOutputDev that will write to a generic stream.
   PSOutputDev(PSOutputFunc outputFuncA, void *outputStreamA,
 	      char *psTitle,
+	      PDFDoc *doc,
 	      XRef *xrefA, Catalog *catalog,
 	      int firstPage, int lastPage, PSOutMode modeA,
 	      int paperWidthA = -1, int paperHeightA = -1,
@@ -145,9 +147,6 @@ public:
   // Write the Xpdf procset.
   void writeXpdfProcset();
 
-  // Write the document-level setup.
-  void writeDocSetup(Catalog *catalog, int firstPage, int lastPage, GBool duplexA);
-
   // Write the trailer for the current page.
   void writePageTrailer();
 
@@ -287,7 +286,7 @@ public:
 private:
 
   void init(PSOutputFunc outputFuncA, void *outputStreamA,
-	    PSFileType fileTypeA, char *pstitle, XRef *xrefA, Catalog *catalog,
+	    PSFileType fileTypeA, char *pstitle, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 	    int firstPage, int lastPage, PSOutMode modeA,
 	    int imgLLXA, int imgLLYA, int imgURXA, int imgURYA,
 	    GBool manualCtrlA, int paperWidthA, int paperHeightA,
@@ -341,6 +340,10 @@ private:
 		    double *x1, double *y1);
 #endif
   void cvtFunction(Function *func);
+
+  // Write the document-level setup.
+  void writeDocSetup(PDFDoc *doc, Catalog *catalog, int firstPage, int lastPage, GBool duplexA);
+
   void writePSChar(char c);
   void writePS(char *s);
   void writePSFmt(const char *fmt, ...);
diff --git a/qt/poppler-document.cc b/qt/poppler-document.cc
index 1a5892b..03d01fa 100644
--- a/qt/poppler-document.cc
+++ b/qt/poppler-document.cc
@@ -325,7 +325,7 @@ bool Document::print(const QString &fileName, QValueList<int> pageList, double h
 
 bool Document::print(const QString &file, QValueList<int> pageList, double hDPI, double vDPI, int rotate, int paperWidth, int paperHeight)
 {
-  PSOutputDev *psOut = new PSOutputDev(file.latin1(), data->doc.getXRef(), data->doc.getCatalog(), NULL, 1, data->doc.getNumPages(), psModePS, paperWidth, paperHeight);
+  PSOutputDev *psOut = new PSOutputDev(file.latin1(), &(data->doc), data->doc.getXRef(), data->doc.getCatalog(), NULL, 1, data->doc.getNumPages(), psModePS, paperWidth, paperHeight);
   
   if (psOut->isOk()) {
     QValueList<int>::iterator it;
diff --git a/qt4/src/poppler-ps-converter.cc b/qt4/src/poppler-ps-converter.cc
index 7a1957b..9dc82ec 100644
--- a/qt4/src/poppler-ps-converter.cc
+++ b/qt4/src/poppler-ps-converter.cc
@@ -195,6 +195,7 @@ bool PSConverter::convert()
 	
 	PSOutputDev *psOut = new PSOutputDev(outputToQIODevice, dev,
 	                                     pstitlechar,
+	                                     d->document->doc,
 	                                     d->document->doc->getXRef(),
 	                                     d->document->doc->getCatalog(),
 	                                     1,
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index 3c74c6e..0558e5c 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -350,7 +350,7 @@ int main(int argc, char *argv[]) {
     psFileName = new GooString(htmlFileName->getCString());
     psFileName->append(".ps");
 
-    psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
+    psOut = new PSOutputDev(psFileName->getCString(), doc, doc->getXRef(),
 			    doc->getCatalog(), NULL, firstPage, lastPage, psModePS, w, h);
     psOut->setDisplayText(gFalse);
     doc->displayPages(psOut, firstPage, lastPage, 72, 72, 0,
diff --git a/utils/pdftops.cc b/utils/pdftops.cc
index 0bc43a1..8231458 100644
--- a/utils/pdftops.cc
+++ b/utils/pdftops.cc
@@ -359,7 +359,7 @@ int main(int argc, char *argv[]) {
   }
 
   // write PostScript file
-  psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
+  psOut = new PSOutputDev(psFileName->getCString(), doc, doc->getXRef(),
 			  doc->getCatalog(), NULL, firstPage, lastPage, mode,
 			  paperWidth,
 			  paperHeight,
-- 
1.6.4.2


From e046bf1778582a613205c231e18efe37e37733dc Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Sat, 24 Apr 2010 10:17:56 +0200
Subject: [PATCH 10/15] Use PDFDoc::getPage() in HtmlOutputDev

---
 utils/HtmlOutputDev.cc |    2 +-
 utils/HtmlOutputDev.h  |    2 ++
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 81f8b88..4f7dff6 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -1093,7 +1093,7 @@ void HtmlOutputDev::startPage(int pageNum, GfxState *state) {
 
 
 void HtmlOutputDev::endPage() {
-  Links *linksList = catalog->getPage(pageNum)->getLinks(catalog);
+  Links *linksList = docPage->getLinks(catalog);
   for (int i = 0; i < linksList->getNumLinks(); ++i)
   {
       doProcessLink(linksList->getLink(i));
diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h
index 24ccfd1..48b04c6 100644
--- a/utils/HtmlOutputDev.h
+++ b/utils/HtmlOutputDev.h
@@ -256,6 +256,7 @@ public:
                                GBool (* abortCheckCbk)(void *data) = NULL,
                                void * abortCheckCbkData = NULL)
   {
+   docPage = page;
    catalog = catalogA;
    return gTrue;
   }
@@ -323,6 +324,7 @@ private:
   GooString *docTitle;
   GooList *glMetaVars;
   Catalog *catalog;
+  Page *docPage;
   friend class HtmlPage;
 };
 
-- 
1.6.4.2


From 0ae15644df43ca7d5ef4c1a52f8731e6f4a14fbf Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 31 Mar 2010 14:39:57 +0200
Subject: [PATCH 11/15] Parse page tree on demand

---
 poppler/Catalog.cc |  266 ++++++++++++++++++++++++++++++++++-----------------
 poppler/Catalog.h  |   12 ++-
 2 files changed, 185 insertions(+), 93 deletions(-)

diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc
index 900cdd7..416fb66 100644
--- a/poppler/Catalog.cc
+++ b/poppler/Catalog.cc
@@ -59,9 +59,6 @@ Catalog::Catalog(XRef *xrefA) {
   Object catDict, pagesDict, pagesDictRef;
   Object obj, obj2;
   Object optContentProps;
-  char *alreadyRead;
-  int numPages0;
-  int i;
 
   ok = gTrue;
   xref = xrefA;
@@ -78,6 +75,12 @@ Catalog::Catalog(XRef *xrefA) {
   embeddedFileNameTree = NULL;
   jsNameTree = NULL;
 
+  pagesList = NULL;
+  pagesRefList = NULL;
+  attrsList = NULL;
+  kidsIdxList = NULL;
+  lastCachedPage = 0;
+
   xref->getCatalog(&catDict);
   if (!catDict.isDict()) {
     error(-1, "Catalog object is wrong type (%s)", catDict.getTypeName());
@@ -100,31 +103,11 @@ Catalog::Catalog(XRef *xrefA) {
   if (!obj.isNum()) {
     error(-1, "Page count in top-level pages object is wrong type (%s)",
 	  obj.getTypeName());
-    pagesSize = numPages0 = 0;
+    numPages = 0;
   } else {
-    pagesSize = numPages0 = (int)obj.getNum();
+    numPages = (int)obj.getNum();
   }
   obj.free();
-  pages = (Page **)gmallocn(pagesSize, sizeof(Page *));
-  pageRefs = (Ref *)gmallocn(pagesSize, sizeof(Ref));
-  for (i = 0; i < pagesSize; ++i) {
-    pages[i] = NULL;
-    pageRefs[i].num = -1;
-    pageRefs[i].gen = -1;
-  }
-  alreadyRead = (char *)gmalloc(xref->getNumObjects());
-  memset(alreadyRead, 0, xref->getNumObjects());
-  if (catDict.dictLookupNF("Pages", &pagesDictRef)->isRef() &&
-      pagesDictRef.getRefNum() >= 0 &&
-      pagesDictRef.getRefNum() < xref->getNumObjects()) {
-    alreadyRead[pagesDictRef.getRefNum()] = 1;
-  }
-  pagesDictRef.free();
-  numPages = readPageTree(pagesDict.getDict(), NULL, 0, alreadyRead);
-  gfree(alreadyRead);
-  if (numPages != numPages0) {
-    error(-1, "Page count in top-level pages object is incorrect");
-  }
   pagesDict.free();
 
   // read base URI
@@ -163,6 +146,10 @@ Catalog::Catalog(XRef *xrefA) {
 Catalog::~Catalog() {
   int i;
 
+  delete kidsIdxList;
+  delete attrsList;
+  delete pagesRefList;
+  delete pagesList;
   if (pages) {
     for (i = 0; i < pagesSize; ++i) {
       if (pages[i]) {
@@ -225,91 +212,192 @@ GooString *Catalog::readMetadata() {
   return s;
 }
 
-int Catalog::readPageTree(Dict *pagesDict, PageAttrs *attrs, int start,
-			  char *alreadyRead) {
-  Object kids;
-  Object kid;
-  Object kidRef;
-  PageAttrs *attrs1, *attrs2;
-  Page *page;
-  int i, j;
-
-  attrs1 = new PageAttrs(attrs, pagesDict);
-  pagesDict->lookup("Kids", &kids);
-  if (!kids.isArray()) {
-    error(-1, "Kids object (page %d) is wrong type (%s)",
-	  start+1, kids.getTypeName());
-    return start;
-  }
-  for (i = 0; i < kids.arrayGetLength(); ++i) {
-    kids.arrayGetNF(i, &kidRef);
-    if (kidRef.isRef() &&
-	kidRef.getRefNum() >= 0 &&
-	kidRef.getRefNum() < xref->getNumObjects()) {
-      if (alreadyRead[kidRef.getRefNum()]) {
-	error(-1, "Loop in Pages tree");
-	kidRef.free();
-	continue;
+Page *Catalog::getPage(int i)
+{
+  if (i < 1) return NULL;
+
+  if (i > lastCachedPage) {
+     if (cachePageTree(i) == gFalse) return NULL;
+  }
+  return pages[i-1];
+}
+
+Ref *Catalog::getPageRef(int i)
+{
+  if (i < 1) return NULL;
+
+  if (i > lastCachedPage) {
+     if (cachePageTree(i) == gFalse) return NULL;
+  }
+  return &pageRefs[i-1];
+}
+
+GBool Catalog::cachePageTree(int page)
+{
+  Dict *pagesDict;
+
+  if (pagesList == NULL) {
+
+    Object catDict;
+    Ref pagesRef;
+
+    xref->getCatalog(&catDict);
+
+    Object pagesDictRef;
+    if (catDict.dictLookupNF("Pages", &pagesDictRef)->isRef() &&
+        pagesDictRef.getRefNum() >= 0 &&
+        pagesDictRef.getRefNum() < xref->getNumObjects()) {
+      pagesRef = pagesDictRef.getRef();
+      pagesDictRef.free();
+    } else {
+       error(-1, "Catalog dictionary does not contain a valid \"Pages\" entry");
+       pagesDictRef.free();
+       return gFalse;
+    }
+
+    Object obj;
+    catDict.dictLookup("Pages", &obj);
+    // This should really be isDict("Pages"), but I've seen at least one
+    // PDF file where the /Type entry is missing.
+    if (obj.isDict()) {
+      obj.getDict()->incRef();
+      pagesDict = obj.getDict();
+      obj.free();
+    }
+    else {
+      error(-1, "Top-level pages object is wrong type (%s)", obj.getTypeName());
+      obj.free();
+      return gFalse;
+    }
+
+    pagesSize = numPages;
+    pages = (Page **)gmallocn(pagesSize, sizeof(Page *));
+    pageRefs = (Ref *)gmallocn(pagesSize, sizeof(Ref));
+    for (int i = 0; i < pagesSize; ++i) {
+      pages[i] = NULL;
+      pageRefs[i].num = -1;
+      pageRefs[i].gen = -1;
+    }
+
+    pagesList = new GooVector<Dict *>();
+    pagesList->push_back(pagesDict);
+    pagesRefList = new GooVector<Ref>();
+    pagesRefList->push_back(pagesRef);
+    attrsList = new GooVector<PageAttrs *>();
+    attrsList->push_back(new PageAttrs(NULL, pagesDict));
+    kidsIdxList = new GooVector<int>();
+    kidsIdxList->push_back(0);
+    lastCachedPage = 0;
+
+  }
+
+  while(1) {
+
+    if (page <= lastCachedPage) return gTrue;
+
+    if (pagesList->empty()) return gFalse;
+
+    pagesDict = pagesList->back();
+    Object kids;
+    pagesDict->lookup("Kids", &kids);
+    if (!kids.isArray()) {
+      error(-1, "Kids object (page %d) is wrong type (%s)",
+            lastCachedPage+1, kids.getTypeName());
+      kids.free();
+      return gFalse;
+    }
+
+    int kidsIdx = kidsIdxList->back();
+    if (kidsIdx >= kids.arrayGetLength()) {
+       delete pagesList->back();
+       pagesList->pop_back();
+       pagesRefList->pop_back();
+       delete attrsList->back();
+       attrsList->pop_back();
+       kidsIdxList->pop_back();
+       if (!kidsIdxList->empty()) kidsIdxList->back()++;
+       kids.free();
+       continue;
+    }
+
+    Object kidRef;
+    kids.arrayGetNF(kidsIdx, &kidRef);
+    if (!kidRef.isRef()) {
+      error(-1, "Kid object (page %d) is not an indirect reference (%s)",
+            lastCachedPage+1, kidRef.getTypeName());
+      kidRef.free();
+      kids.free();
+      return gFalse;
+    }
+
+    for (size_t i = 0; i < pagesRefList->size(); i++) {
+      if (((*pagesRefList)[i]).num == kidRef.getRefNum()) {
+         error(-1, "Loop in Pages tree");
+         kidRef.free();
+         kids.free();
+         kidsIdxList->back()++;
+         continue;
       }
-      alreadyRead[kidRef.getRefNum()] = 1;
     }
-    kids.arrayGet(i, &kid);
+
+    Object kid;
+    kids.arrayGet(kidsIdx, &kid);
+    kids.free();
     if (kid.isDict("Page")) {
-      attrs2 = new PageAttrs(attrs1, kid.getDict());
-      page = new Page(xref, start+1, kid.getDict(), kidRef.getRef(), attrs2, getForm());
-      if (!page->isOk()) {
-	++start;
-	goto err3;
-      }
-      if (start >= pagesSize) {
-	pagesSize += 32;
-	pages = (Page **)greallocn(pages, pagesSize, sizeof(Page *));
-	pageRefs = (Ref *)greallocn(pageRefs, pagesSize, sizeof(Ref));
-	for (j = pagesSize - 32; j < pagesSize; ++j) {
-	  pages[j] = NULL;
-	  pageRefs[j].num = -1;
-	  pageRefs[j].gen = -1;
-	}
+      PageAttrs *attrs = new PageAttrs(attrsList->back(), kid.getDict());
+      Page *p = new Page(xref, lastCachedPage+1, kid.getDict(),
+                     kidRef.getRef(), attrs, form);
+      if (!p->isOk()) {
+        error(-1, "Failed to create page (page %d)", lastCachedPage+1);
+        delete p;
+        kidRef.free();
+        kid.free();
+        return gFalse;
       }
-      pages[start] = page;
-      if (kidRef.isRef()) {
-	pageRefs[start].num = kidRef.getRefNum();
-	pageRefs[start].gen = kidRef.getRefGen();
+
+      if (lastCachedPage >= numPages) {
+        error(-1, "Page count in top-level pages object is incorrect");
+        kidRef.free();
+        kid.free();
+        return gFalse;
       }
-      ++start;
+
+      pages[lastCachedPage] = p;
+      pageRefs[lastCachedPage].num = kidRef.getRefNum();
+      pageRefs[lastCachedPage].gen = kidRef.getRefGen();
+
+      lastCachedPage++;
+      kidsIdxList->back()++;
+
     // This should really be isDict("Pages"), but I've seen at least one
     // PDF file where the /Type entry is missing.
     } else if (kid.isDict()) {
-      if ((start = readPageTree(kid.getDict(), attrs1, start, alreadyRead))
-	  < 0)
-	goto err2;
+      attrsList->push_back(new PageAttrs(attrsList->back(), kid.getDict()));
+      pagesRefList->push_back(kidRef.getRef());
+      kid.getDict()->incRef();
+      pagesList->push_back(kid.getDict());
+      kidsIdxList->push_back(0);
     } else {
       error(-1, "Kid object (page %d) is wrong type (%s)",
-	    start+1, kid.getTypeName());
+            lastCachedPage+1, kid.getTypeName());
+      kidRef.free();
+      kid.free();
+      return gFalse;
     }
-    kid.free();
     kidRef.free();
+    kid.free();
+
   }
-  delete attrs1;
-  kids.free();
-  return start;
 
- err3:
-  delete page;
- err2:
-  kid.free();
-  kidRef.free();
-  kids.free();
-  delete attrs1;
-  ok = gFalse;
-  return -1;
+  return gFalse;
 }
 
 int Catalog::findPage(int num, int gen) {
   int i;
 
   for (i = 0; i < numPages; ++i) {
-    if (pageRefs[i].num == num && pageRefs[i].gen == gen)
+    Ref *ref = getPageRef(i+1);
+    if (ref->num == num && ref->gen == gen)
       return i + 1;
   }
   return 0;
diff --git a/poppler/Catalog.h b/poppler/Catalog.h
index 2cab80a..5a25109 100644
--- a/poppler/Catalog.h
+++ b/poppler/Catalog.h
@@ -151,10 +151,10 @@ public:
   int getNumPages() { return numPages; }
 
   // Get a page.
-  Page *getPage(int i) { return pages[i-1]; }
+  Page *getPage(int i);
 
   // Get the reference for a page object.
-  Ref *getPageRef(int i) { return &pageRefs[i-1]; }
+  Ref *getPageRef(int i);
 
   // Return base URI, or NULL if none.
   GooString *getBaseURI() { return baseURI; }
@@ -232,6 +232,11 @@ private:
   XRef *xref;			// the xref table for this PDF file
   Page **pages;			// array of pages
   Ref *pageRefs;		// object ID for each page
+  int lastCachedPage;
+  GooVector<Dict *> *pagesList;
+  GooVector<Ref> *pagesRefList;
+  GooVector<PageAttrs *> *attrsList;
+  GooVector<int> *kidsIdxList;
   Form *form;
   int numPages;			// number of pages
   int pagesSize;		// size of pages array
@@ -251,8 +256,7 @@ private:
   PageMode pageMode;		// page mode
   PageLayout pageLayout;	// page layout
 
-  int readPageTree(Dict *pages, PageAttrs *attrs, int start,
-		   char *alreadyRead);
+  GBool cachePageTree(int page); // Cache first <page> pages.
   Object *findDestInTree(Object *tree, GooString *name, Object *obj);
 
   Object *getNames();
-- 
1.6.4.2


From e9ee24e6fddfa520407161f49a732c9b16f42251 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 22:01:41 +0100
Subject: [PATCH 12/15] Parse number of pages on demand

---
 poppler/Catalog.cc |   70 +++++++++++++++++++++++++++++++--------------------
 poppler/Catalog.h  |    2 +-
 2 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc
index 416fb66..536e474 100644
--- a/poppler/Catalog.cc
+++ b/poppler/Catalog.cc
@@ -64,7 +64,8 @@ Catalog::Catalog(XRef *xrefA) {
   xref = xrefA;
   pages = NULL;
   pageRefs = NULL;
-  numPages = pagesSize = 0;
+  numPages = -1;
+  pagesSize = 0;
   baseURI = NULL;
   pageLabelInfo = NULL;
   form = NULL;
@@ -89,27 +90,6 @@ Catalog::Catalog(XRef *xrefA) {
   // get the AcroForm dictionary
   catDict.dictLookup("AcroForm", &acroForm);
 
-  // read page tree
-  catDict.dictLookup("Pages", &pagesDict);
-  // This should really be isDict("Pages"), but I've seen at least one
-  // PDF file where the /Type entry is missing.
-  if (!pagesDict.isDict()) {
-    error(-1, "Top-level pages object is wrong type (%s)",
-	  pagesDict.getTypeName());
-    goto err2;
-  }
-  pagesDict.dictLookup("Count", &obj);
-  // some PDF files actually use real numbers here ("/Count 9.0")
-  if (!obj.isNum()) {
-    error(-1, "Page count in top-level pages object is wrong type (%s)",
-	  obj.getTypeName());
-    numPages = 0;
-  } else {
-    numPages = (int)obj.getNum();
-  }
-  obj.free();
-  pagesDict.free();
-
   // read base URI
   if (catDict.dictLookup("URI", &obj)->isDict()) {
     if (obj.dictLookup("Base", &obj2)->isString()) {
@@ -136,8 +116,6 @@ Catalog::Catalog(XRef *xrefA) {
   catDict.free();
   return;
 
- err2:
-  pagesDict.free();
  err1:
   catDict.free();
   ok = gFalse;
@@ -270,7 +248,7 @@ GBool Catalog::cachePageTree(int page)
       return gFalse;
     }
 
-    pagesSize = numPages;
+    pagesSize = getNumPages();
     pages = (Page **)gmallocn(pagesSize, sizeof(Page *));
     pageRefs = (Ref *)gmallocn(pagesSize, sizeof(Ref));
     for (int i = 0; i < pagesSize; ++i) {
@@ -395,7 +373,7 @@ GBool Catalog::cachePageTree(int page)
 int Catalog::findPage(int num, int gen) {
   int i;
 
-  for (i = 0; i < numPages; ++i) {
+  for (i = 0; i < getNumPages(); ++i) {
     Ref *ref = getPageRef(i+1);
     if (ref->num == num && ref->gen == gen)
       return i + 1;
@@ -719,7 +697,7 @@ GBool Catalog::labelToIndex(GooString *label, int *index)
       return gFalse;
   }
 
-  if (*index < 0 || *index >= numPages)
+  if (*index < 0 || *index >= getNumPages())
     return gFalse;
 
   return gTrue;
@@ -729,7 +707,7 @@ GBool Catalog::indexToLabel(int index, GooString *label)
 {
   char buffer[32];
 
-  if (index < 0 || index >= numPages)
+  if (index < 0 || index >= getNumPages())
     return gFalse;
 
   PageLabelInfo *pli = getPageLabelInfo();
@@ -845,6 +823,42 @@ EmbFile::EmbFile(Object *efDict, GooString *description)
     m_mimetype = new GooString();
 }
 
+int Catalog::getNumPages()
+{
+  if (numPages == -1)
+  {
+    Object catDict, pagesDict, obj;
+
+    xref->getCatalog(&catDict);
+    catDict.dictLookup("Pages", &pagesDict);
+    catDict.free();
+
+    // This should really be isDict("Pages"), but I've seen at least one
+    // PDF file where the /Type entry is missing.
+    if (!pagesDict.isDict()) {
+      error(-1, "Top-level pages object is wrong type (%s)",
+          pagesDict.getTypeName());
+      pagesDict.free();
+      return 0;
+    }
+
+    pagesDict.dictLookup("Count", &obj);
+    // some PDF files actually use real numbers here ("/Count 9.0")
+    if (!obj.isNum()) {
+      error(-1, "Page count in top-level pages object is wrong type (%s)",
+         obj.getTypeName());
+      numPages = 0;
+    } else {
+      numPages = (int)obj.getNum();
+    }
+
+    obj.free();
+    pagesDict.free();
+  }
+
+  return numPages;
+}
+
 PageLabelInfo *Catalog::getPageLabelInfo()
 {
   if (!pageLabelInfo) {
diff --git a/poppler/Catalog.h b/poppler/Catalog.h
index 5a25109..8bca80b 100644
--- a/poppler/Catalog.h
+++ b/poppler/Catalog.h
@@ -148,7 +148,7 @@ public:
   GBool isOk() { return ok; }
 
   // Get number of pages.
-  int getNumPages() { return numPages; }
+  int getNumPages();
 
   // Get a page.
   Page *getPage(int i);
-- 
1.6.4.2


From 27842f9134cf63cc9d75f90fca0c8b7dd4f3ba60 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 25 Mar 2010 18:53:54 +0100
Subject: [PATCH 13/15] Get number of pages from linearization table

---
 poppler/PDFDoc.cc |    9 +++++++++
 poppler/PDFDoc.h  |    2 +-
 2 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 89dba6f..8f105fd 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -1033,6 +1033,15 @@ Guint PDFDoc::getMainXRefEntriesOffset()
   return mainXRefEntriesOffset;
 }
 
+int PDFDoc::getNumPages()
+{
+  if (isLinearized()) {
+    return getLinearization()->getNumPages();
+  } else {
+    return catalog->getNumPages();
+  }
+}
+
 Page *PDFDoc::getPage(int page)
 {
   if ((page < 1) || page > getNumPages()) return NULL;
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 8de139f..9069698 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -118,7 +118,7 @@ public:
     { return getPage(page) ? getPage(page)->getRotate() : 0 ; }
 
   // Get number of pages.
-  int getNumPages() { return catalog->getNumPages(); }
+  int getNumPages();
 
   // Return the contents of the metadata stream, or NULL if there is
   // no metadata.
-- 
1.6.4.2


From cf2c35c975702b3e3a0744d62ba6a15d1ce80b6d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 22:03:27 +0100
Subject: [PATCH 14/15] Add hint tables support

---
 CMakeLists.txt      |    2 +
 poppler/Hints.cc    |  405 +++++++++++++++++++++++++++++++++++++++++++++++++++
 poppler/Hints.h     |   92 ++++++++++++
 poppler/Makefile.am |    2 +
 poppler/PDFDoc.cc   |   14 ++
 poppler/PDFDoc.h    |    5 +
 6 files changed, 520 insertions(+), 0 deletions(-)
 create mode 100644 poppler/Hints.cc
 create mode 100644 poppler/Hints.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a119a6d..6d43826 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -245,6 +245,7 @@ set(poppler_SRCS
   poppler/GfxFont.cc
   poppler/GfxState.cc
   poppler/GlobalParams.cc
+  poppler/Hints.cc
   poppler/JArithmeticDecoder.cc
   poppler/JBIG2Stream.cc
   poppler/Lexer.cc
@@ -391,6 +392,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/GfxState.h
     poppler/GfxState_helpers.h
     poppler/GlobalParams.h
+    poppler/Hints.h
     poppler/JArithmeticDecoder.h
     poppler/JBIG2Stream.h
     poppler/Lexer.h
diff --git a/poppler/Hints.cc b/poppler/Hints.cc
new file mode 100644
index 0000000..97af853
--- /dev/null
+++ b/poppler/Hints.cc
@@ -0,0 +1,405 @@
+//========================================================================
+//
+// Hints.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#include <config.h>
+
+#include "Hints.h"
+
+#include "Linearization.h"
+#include "Object.h"
+#include "Stream.h"
+#include "XRef.h"
+#include "Parser.h"
+#include "Lexer.h"
+
+#include <limits.h>
+
+//------------------------------------------------------------------------
+// Hints
+//------------------------------------------------------------------------
+
+Hints::Hints(BaseStream *str, Linearization *linearization)
+{
+  mainXRefEntriesOffset = linearization->getMainXRefEntriesOffset();
+  nPages = linearization->getNumPages();
+  pageFirst = linearization->getPageFirst();
+  pageEndFirst = linearization->getEndFirst();
+
+  if (nPages >= INT_MAX / (int)sizeof(Guint)) {
+     error(-1, "Invalid number of pages (%d) for hints table", nPages);
+     nPages = 0;
+  }
+  nObjects = (Guint *) gmallocn(nPages, sizeof(Guint));
+  xRefOffset = (Guint *) gmallocn(nPages, sizeof(Guint));
+  pageLength = (Guint *) gmallocn(nPages, sizeof(Guint));
+  pageOffset = (Guint *) gmallocn(nPages, sizeof(Guint));
+  numSharedObject = (Guint *) gmallocn(nPages, sizeof(Guint));
+  sharedObjectId = (Guint **) gmallocn(nPages, sizeof(Guint));
+  if (!nObjects || !xRefOffset || !pageLength || !pageOffset ||
+      !numSharedObject || !sharedObjectId) {
+    error(-1, "Failed to allocate memory for hints tabel");
+    nPages = 0;
+  }
+
+  memset(numSharedObject, 0, nPages);
+
+  nSharedGroups = 0;
+  groupLength = NULL;
+  groupOffset = NULL;
+  groupHasSignature = NULL;
+  groupNumObjects = NULL;
+  groupXRefOffset = NULL;
+
+  readTables(str, linearization);
+}
+
+Hints::~Hints()
+{
+  gfree(nObjects);
+  gfree(xRefOffset);
+  gfree(pageLength);
+  gfree(pageOffset);
+  gfree(numSharedObject);
+  for (int i=0; i< nPages; i++) gfree(sharedObjectId[i]);
+  gfree(sharedObjectId);
+
+  gfree(groupLength);
+  gfree(groupOffset);
+  gfree(groupHasSignature);
+  gfree(groupNumObjects);
+  gfree(groupXRefOffset);
+}
+
+void Hints::readTables(BaseStream *str, Linearization *linearization)
+{
+  hintsOffset = linearization->getHintsOffset();
+  hintsLength = linearization->getHintsLength();
+  hintsOffset2 = linearization->getHintsOffset2();
+  hintsLength2 = linearization->getHintsLength2();
+
+  Parser *parser;
+  Object obj;
+
+  int bufLength = hintsLength + hintsLength2;
+
+  char buf[bufLength];
+  char *p = buf;
+
+  obj.initNull();
+  Stream *s = str->makeSubStream(hintsOffset, gFalse, hintsLength, &obj);
+  s->reset();
+  for (Guint i=0; i < hintsLength; i++) { *p++ = s->getChar(); }
+  delete s;
+
+  if (hintsOffset2 && hintsLength2) {
+    obj.initNull();
+    s = str->makeSubStream(hintsOffset2, gFalse, hintsLength2, &obj);
+    s->reset();
+    for (Guint i=0; i < hintsLength2; i++) { *p++ = s->getChar(); }
+    delete s;
+  }
+
+  obj.initNull();
+  MemStream *memStream = new MemStream (buf, 0, bufLength, &obj);
+
+  obj.initNull();
+  parser = new Parser(NULL, new Lexer(NULL, memStream), gTrue);
+  if (parser->getObj(&obj)->isInt() &&
+     (obj.free(), parser->getObj(&obj)->isInt()) &&
+     (obj.free(), parser->getObj(&obj)->isCmd("obj")) &&
+     (obj.free(), parser->getObj(&obj)->isStream())){
+    Stream *hintsStream = obj.getStream();
+    Dict *hintsDict = obj.streamGetDict();
+
+    int sharedStreamOffset = 0;
+    if (hintsDict->lookupInt("S", NULL, &sharedStreamOffset) &&
+        sharedStreamOffset > 0) {
+
+        hintsStream->reset();
+        readPageOffsetTable(hintsStream);
+
+        hintsStream->reset();
+        for (int i=0; i<sharedStreamOffset; i++) hintsStream->getChar();
+        readSharedObjectsTable(hintsStream);
+    } else {
+      error(-1, "Invalid shared object hint table offset");
+    }
+  } else {
+    error(-1, "Failed parsing hints table object");
+  }
+  obj.free();
+
+  delete parser;
+}
+
+void Hints::readPageOffsetTable(Stream *str)
+{
+  if (nPages < 1) {
+    error(-1, "Invalid number of pages reading page offset hints table");
+    return;
+  }
+
+  inputBits = 0; // reset on byte boundary.
+
+  nObjectLeast = readBits(32, str);
+
+  objectOffsetFirst = readBits(32, str);
+  if (objectOffsetFirst >= hintsOffset) objectOffsetFirst += hintsLength;
+
+  nBitsDiffObjects = readBits(16, str);
+
+  pageLengthLeast = readBits(32, str);
+
+  nBitsDiffPageLength = readBits(16, str);
+
+  OffsetStreamLeast = readBits(32, str);
+
+  nBitsOffsetStream = readBits(16, str);
+
+  lengthStreamLeast = readBits(32, str);
+
+  nBitsLengthStream = readBits(16, str);
+
+  nBitsNumShared = readBits(16, str);
+
+  nBitsShared = readBits(16, str);
+
+  nBitsNumerator = readBits(16, str);
+
+  denominator = readBits(16, str);
+
+  for (int i=0; i<nPages; i++) {
+    nObjects[i] = nObjectLeast + readBits(nBitsDiffObjects, str);
+  }
+
+  nObjects[0] = 0;
+  xRefOffset[0] = mainXRefEntriesOffset + 20;
+  for (int i=1; i<nPages; i++) {
+    xRefOffset[i] = xRefOffset[i-1] + 20*nObjects[i-1];
+  }
+
+  for (int i=0; i<nPages; i++) {
+    pageLength[i] = pageLengthLeast + readBits(nBitsDiffPageLength, str);
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  numSharedObject[0] = readBits(nBitsNumShared, str);
+  numSharedObject[0] = 0; // Do not trust the read value to be 0.
+  sharedObjectId[0] = NULL;
+  for (int i=1; i<nPages; i++) {
+    numSharedObject[i] = readBits(nBitsNumShared, str);
+    if (numSharedObject[i] >= INT_MAX / (int)sizeof(Guint)) {
+       error(-1, "Invalid number of shared objects");
+       numSharedObject[i] = 0;
+       return;
+    }
+    sharedObjectId[i] = (Guint *) gmallocn(numSharedObject[i], sizeof(Guint));
+    if (numSharedObject[i] && !sharedObjectId[i]) {
+       error(-1, "Failed to allocate memory for shared object IDs");
+       numSharedObject[i] = 0;
+       return;
+    }
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (int i=1; i<nPages; i++) {
+    for (Guint j=0; j < numSharedObject[i]; j++) {
+      sharedObjectId[i][j] = readBits(nBitsShared, str);
+    }
+  }
+
+  pageOffset[0] = objectOffsetFirst;
+  pageLength[0] = pageEndFirst - objectOffsetFirst;
+  // set fake pageOffset[0] to correct for hint table.
+  if (pageOffset[0] < hintsOffset) {
+    pageOffset[0] += hintsLength;
+  }
+  // find pageOffsets.
+  for (int i=1; i<nPages; i++) {
+    pageOffset[i] = pageOffset[i-1] + pageLength[i-1];
+  }
+  // restore correct pageOffset[0].
+  pageOffset[0] = objectOffsetFirst;
+
+}
+
+void Hints::readSharedObjectsTable(Stream *str)
+{
+  inputBits = 0; // reset on byte boundary.
+
+  Guint firstSharedObjectNumber = readBits(32, str);
+
+  Guint firstSharedObjectOffset = readBits(32, str);
+  firstSharedObjectOffset += hintsLength;
+
+  Guint nSharedGroupsFirst = readBits(32, str);
+
+  Guint nSharedGroups = readBits(32, str);
+
+  Guint nBitsNumObjects = readBits(16, str);
+
+  Guint groupLengthLeast = readBits(32, str);
+
+  Guint nBitsDiffGroupLength = readBits(16, str);
+
+  if (nSharedGroups >= INT_MAX / (int)sizeof(Guint)) {
+     error(-1, "Invalid number of shared object groups");
+     nSharedGroups = 0;
+     return;
+  }
+  groupLength = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  groupOffset = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  groupHasSignature = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  groupNumObjects = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  groupXRefOffset = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  if (!groupLength || !groupOffset || !groupHasSignature ||
+      !groupNumObjects || !groupXRefOffset) {
+     error(-1, "Failed to allocate memory for shared object groups");
+     nSharedGroups = 0;
+     return;
+  }
+
+  if (nSharedGroupsFirst > nSharedGroups) {
+     error(-1, "Invalid number of first page shared object groups");
+     nSharedGroupsFirst = nSharedGroups;
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (Guint i=0; i<nSharedGroups; i++) {
+    groupLength[i] = groupLengthLeast + readBits(nBitsDiffGroupLength, str);
+  }
+
+  groupOffset[0] = objectOffsetFirst;
+  for (Guint i=1; i<nSharedGroupsFirst; i++) {
+    groupOffset[i] = groupOffset[i-1] + groupLength[i-1];
+  }
+  groupOffset[nSharedGroupsFirst] = firstSharedObjectOffset;
+  for (Guint i=nSharedGroupsFirst+1; i<nSharedGroups; i++) {
+    groupOffset[i] = groupOffset[i-1] + groupLength[i-1];
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (Guint i=0; i<nSharedGroups; i++) {
+    groupHasSignature[i] = readBits(1, str);
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (Guint i=0; i<nSharedGroups; i++) {
+    if (groupHasSignature[i]) {
+       readBits(128, str);
+    }
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (Guint i=0; i<nSharedGroups; i++) {
+    groupNumObjects[i] =
+       nBitsNumObjects ? 1 + readBits(nBitsNumObjects, str) : 1;
+  }
+
+  for (Guint i=0; i<nSharedGroupsFirst; i++) {
+    groupNumObjects[i] = 0;
+    groupXRefOffset[i] = 0;
+  }
+  groupXRefOffset[nSharedGroupsFirst] =
+      mainXRefEntriesOffset + 20*firstSharedObjectNumber;
+  for (Guint i=nSharedGroupsFirst+1; i<nSharedGroups; i++) {
+    groupXRefOffset[i] = groupXRefOffset[i-1] + 20*groupNumObjects[i-1];
+  }
+
+}
+
+Guint Hints::getPageOffset(int page)
+{
+  if ((page < 1) || (page > nPages)) return 0;
+
+  if (page-1 > pageFirst)
+    return pageOffset[page-1];
+  else if (page-1 < pageFirst)
+    return pageOffset[page];
+  else
+    return pageOffset[0];
+}
+
+GooVector<ByteRange>* Hints::getPageRanges(int page)
+{
+  if ((page < 1) || (page > nPages)) return NULL;
+
+  int idx;
+  if (page-1 > pageFirst)
+     idx = page-1;
+  else if (page-1 < pageFirst)
+     idx = page;
+  else
+     idx = 0;
+
+  ByteRange pageRange;
+  GooVector<ByteRange> *v = new GooVector<ByteRange>;
+
+  pageRange.offset = pageOffset[idx];
+  pageRange.length = pageLength[idx];
+  v->push_back(pageRange);
+
+  pageRange.offset = xRefOffset[idx];
+  pageRange.length = 20*nObjects[idx];
+  v->push_back(pageRange);
+
+  for (Guint j=0; j<numSharedObject[idx]; j++) {
+     Guint k = sharedObjectId[idx][j];
+
+     pageRange.offset = groupOffset[k];
+     pageRange.length = groupLength[k];
+     v->push_back(pageRange);
+
+     pageRange.offset = groupXRefOffset[k];
+     pageRange.length = 20*groupNumObjects[k];
+     v->push_back(pageRange);
+  }
+
+  return v;
+}
+
+Guint Hints::readBit(Stream *str)
+{
+  Guint bit;
+  int c;
+
+  if (inputBits == 0) {
+    if ((c = str->getChar()) == EOF) {
+      return (Guint) -1;
+    }
+    bitsBuffer = c;
+    inputBits = 8;
+  }
+  bit = (bitsBuffer >> (inputBits - 1)) & 1;
+  --inputBits;
+  return bit;
+}
+
+Guint Hints::readBits(int n, Stream *str)
+{
+  Guint bit, bits;
+
+  if (n < 1) return -1;
+
+  if (n == 1)
+    return readBit(str);
+
+  bit = (readBit(str) << (n-1));
+  if (bit == (Guint) -1)
+    return -1;
+
+  bits = readBits(n-1, str);
+  if (bits == (Guint) -1)
+    return -1;
+
+  return bit | bits;
+}
+
+
diff --git a/poppler/Hints.h b/poppler/Hints.h
new file mode 100644
index 0000000..35a2e55
--- /dev/null
+++ b/poppler/Hints.h
@@ -0,0 +1,92 @@
+//========================================================================
+//
+// Hints.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#ifndef HINTS_H
+#define HINTS_H
+
+#include <string.h>
+#include "goo/gtypes.h"
+#include "goo/GooVector.h"
+//#include <vector>
+#include "PDFDoc.h"
+
+class Stream;
+class BaseStream;
+class Linearization;
+class XRef;
+
+//------------------------------------------------------------------------
+// Hints
+//------------------------------------------------------------------------
+
+class Hints {
+public:
+
+  Hints(BaseStream *str, Linearization *linearization);
+  ~Hints();
+
+  Guint getPageOffset(int page);
+  GooVector<ByteRange>* getPageRanges(int page);
+
+private:
+
+  void readTables(BaseStream *str, Linearization *linearization);
+  void readPageOffsetTable(Stream *str);
+  void readSharedObjectsTable(Stream *str);
+
+  Guint readBit(Stream *str);
+  Guint readBits(int n, Stream *str);
+
+  Guint hintsOffset;
+  Guint hintsLength;
+  Guint hintsOffset2;
+  Guint hintsLength2;
+  Guint mainXRefEntriesOffset;
+
+  int nPages;
+  int pageFirst;
+  Guint pageOffsetFirst;
+  Guint pageEndFirst;
+  int objectNumberFirst;
+
+  Guint nObjectLeast;
+  Guint objectOffsetFirst;
+  Guint nBitsDiffObjects;
+  Guint pageLengthLeast;
+  Guint nBitsDiffPageLength;
+  Guint OffsetStreamLeast;
+  Guint nBitsOffsetStream;
+  Guint lengthStreamLeast;
+  Guint nBitsLengthStream;
+  Guint nBitsNumShared;
+  Guint nBitsShared;
+  Guint nBitsNumerator;
+  Guint denominator;
+
+  Guint *nObjects;
+  Guint *xRefOffset;
+  Guint *pageLength;
+  Guint *pageOffset;
+  Guint *numSharedObject;
+  Guint **sharedObjectId;
+
+  Guint nSharedGroups;
+  Guint *groupLength;
+  Guint *groupOffset;
+  Guint *groupHasSignature;
+  Guint *groupNumObjects;
+  Guint *groupXRefOffset;
+
+  int inputBits;
+  char bitsBuffer;
+
+};
+
+#endif
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 8c1e019..a6b7990 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -206,6 +206,7 @@ poppler_include_HEADERS =	\
 	GfxState.h		\
 	GfxState_helpers.h	\
 	GlobalParams.h		\
+	Hints.h			\
 	JArithmeticDecoder.h	\
 	JBIG2Stream.h		\
 	Lexer.h			\
@@ -285,6 +286,7 @@ libpoppler_la_SOURCES =		\
 	GfxFont.cc 		\
 	GfxState.cc		\
 	GlobalParams.cc		\
+	Hints.cc		\
 	JArithmeticDecoder.cc	\
 	JBIG2Stream.cc		\
 	Lexer.cc 		\
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 8f105fd..1d6c2f4 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -67,6 +67,7 @@
 #include "Outline.h"
 #endif
 #include "PDFDoc.h"
+#include "Hints.h"
 
 //------------------------------------------------------------------------
 
@@ -94,6 +95,7 @@ void PDFDoc::init()
   xref = NULL;
   linearization = NULL;
   catalog = NULL;
+  hints = NULL;
 #ifndef DISABLE_OUTLINE
   outline = NULL;
 #endif
@@ -268,6 +270,9 @@ PDFDoc::~PDFDoc() {
   if (xref) {
     delete xref;
   }
+  if (hints) {
+    delete hints;
+  }
   if (linearization) {
     delete linearization;
   }
@@ -471,6 +476,15 @@ GBool PDFDoc::isLinearized() {
     return gFalse;
 }
 
+Hints *PDFDoc::getHints()
+{
+  if (!hints && isLinearized()) {
+    hints = new Hints(str, getLinearization());
+  }
+
+  return hints;
+}
+
 int PDFDoc::saveAs(GooString *name, PDFWriteMode mode) {
   FILE *f;
   OutStream *outStr;
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 9069698..b2f40c9 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -49,6 +49,7 @@ class LinkAction;
 class LinkDest;
 class Outline;
 class Linearization;
+class Hints;
 
 enum PDFWriteMode {
   writeStandard,
@@ -236,6 +237,9 @@ private:
   void saveIncrementalUpdate (OutStream* outStr);
   void saveCompleteRewrite (OutStream* outStr);
 
+  // Get hints.
+  Hints *getHints();
+
   PDFDoc();
   void init();
   GBool setup(GooString *ownerPassword, GooString *userPassword);
@@ -258,6 +262,7 @@ private:
   Linearization *linearization;
   XRef *xref;
   Catalog *catalog;
+  Hints *hints;
 #ifndef DISABLE_OUTLINE
   Outline *outline;
 #endif
-- 
1.6.4.2


From 3ce8a01affc68d82bd788c47359f41ea2da5ffa9 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 19:06:02 +0200
Subject: [PATCH 15/15] Use hint tables for PDFDoc::getPage()

---
 poppler/PDFDoc.cc |   76 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 poppler/PDFDoc.h  |    4 +++
 2 files changed, 79 insertions(+), 1 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 1d6c2f4..1239ee7 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -100,6 +100,7 @@ void PDFDoc::init()
   outline = NULL;
 #endif
   startXRefPos = ~(Guint)0;
+  pageCache = NULL;
 }
 
 PDFDoc::PDFDoc()
@@ -259,6 +260,14 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
 }
 
 PDFDoc::~PDFDoc() {
+  if (pageCache) {
+    for (int i = 0; i < getNumPages(); i++) {
+      if (pageCache[i]) {
+        delete pageCache[i];
+      }
+    }
+    gfree(pageCache);
+  }
 #ifndef DISABLE_OUTLINE
   if (outline) {
     delete outline;
@@ -1056,11 +1065,76 @@ int PDFDoc::getNumPages()
   }
 }
 
+Guint PDFDoc::getPageOffset(int page)
+{
+  if (isLinearized() && (page-1 == getLinearization()->getPageFirst())) {
+    return xref->getEntry(linearization->getObjectNumberFirst())->offset;
+  }
+
+  Guint offset;
+  if (getHints() && (offset = getHints()->getPageOffset(page))) {
+    return offset;
+  } else {
+    error(-1, "Failed getting page offset from hint table");
+    return 0;
+  }
+}
+
+Page *PDFDoc::parsePage(Guint offset, int page)
+{
+  Page *p = NULL;
+  Object obj;
+
+  obj.initNull();
+  Stream *stream = str->makeSubStream(offset, gFalse, 0, &obj);
+  Parser parser = Parser(xref, new Lexer(xref, stream), gTrue);
+
+  Object obj1, obj2, obj3, obj4;
+  if (parser.getObj(&obj1)->isInt() &&
+      parser.getObj(&obj2)->isInt() &&
+      parser.getObj(&obj3)->isCmd("obj") &&
+      parser.getObj(&obj4)->isDict("Page")) {
+    Ref pageRef;
+    Dict *pageDict;
+    pageRef.num = obj1.getInt();
+    pageRef.gen = obj2.getInt();
+    pageDict = obj4.getDict();
+    p = new Page(xref, page, pageDict, pageRef,
+                 new PageAttrs(NULL, pageDict),
+                 catalog->getForm());
+    if (!p->isOk()) {
+      delete p;
+      p = NULL;
+    }
+  }
+  obj4.free();
+  obj3.free();
+  obj2.free();
+  obj1.free();
+
+  return p;
+}
+
 Page *PDFDoc::getPage(int page)
 {
   if ((page < 1) || page > getNumPages()) return NULL;
 
-  {
+  if (isLinearized()) {
+    if (!pageCache) {
+      pageCache = (Page **) gmallocn(getNumPages(), sizeof(Page *));
+      for (int i = 0; i < getNumPages(); i++) {
+        pageCache[i] = NULL;
+      }
+    }
+    if (!pageCache[page-1]) {
+      pageCache[page-1] = parsePage(getPageOffset(page), page);
+      if (!pageCache[page-1]) {
+         error(-1, "Failed parsing page %d at offset %d",
+               page, getPageOffset(page));
+      }
+    }
+    return pageCache[page-1];
+  } else {
     return catalog->getPage(page);
   }
 }
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index b2f40c9..99c005e 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -237,6 +237,9 @@ private:
   void saveIncrementalUpdate (OutStream* outStr);
   void saveCompleteRewrite (OutStream* outStr);
 
+  Guint getPageOffset(int page);
+  Page *parsePage(Guint offset, int page);
+
   // Get hints.
   Hints *getHints();
 
@@ -266,6 +269,7 @@ private:
 #ifndef DISABLE_OUTLINE
   Outline *outline;
 #endif
+  Page **pageCache;
 
   GBool ok;
   int errCode;
-- 
1.6.4.2


More information about the poppler mailing list