[poppler] Linearization support

Hib Eris hib at hiberis.nl
Sun May 2 12:34:10 PDT 2010


Hi again,

On Thu, Apr 29, 2010 at 1:52 PM, Hib Eris <hib at hiberis.nl> wrote:
> Hi all,
>
> I have two series of patches that allow poppler to handle linearized
> documents more efficiently.

I have updated my patches to improve the handling of malformed
documents. Here are the new patches.

Hib
-------------- next part --------------
From a9f2d19d18dd509d3f495c4c2fbb830516fa0527 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 6 Apr 2010 19:24:42 +0200
Subject: [PATCH 01/12] Cleanup XRef constructors

---
 poppler/XRef.cc |   14 ++++++--------
 poppler/XRef.h  |    1 +
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 3ab23d9..8ae30b2 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -226,7 +226,7 @@ Object *ObjectStream::getObject(int objIdx, int objNum, Object *obj) {
 // XRef
 //------------------------------------------------------------------------
 
-XRef::XRef() {
+void XRef::init() {
   ok = gTrue;
   errCode = errNone;
   entries = NULL;
@@ -236,17 +236,15 @@ XRef::XRef() {
   objStr = NULL;
 }
 
+XRef::XRef() {
+  init();
+}
+
 XRef::XRef(BaseStream *strA) {
   Guint pos;
   Object obj;
 
-  ok = gTrue;
-  errCode = errNone;
-  size = 0;
-  entries = NULL;
-  streamEnds = NULL;
-  streamEndsLen = 0;
-  objStr = NULL;
+  init();
 
   encrypted = gFalse;
   permFlags = defPermFlags;
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 2dbd469..98db234 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -155,6 +155,7 @@ private:
   Guchar fileKey[16];		// file decryption key
   GBool ownerPasswordOk;	// true if owner password is correct
 
+  void init();
   Guint getStartXref();
   GBool readXRef(Guint *pos);
   GBool readXRefTable(Parser *parser, Guint *pos);
-- 
1.6.4.2


From f1bf4283fce1793d5d0a07810c7de4bfd0389562 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 6 Apr 2010 19:16:45 +0200
Subject: [PATCH 02/12] Create no more XRef entries than specified

---
 poppler/XRef.cc |  126 +++++++++++++++++++++++++++---------------------------
 poppler/XRef.h  |    5 ++-
 2 files changed, 67 insertions(+), 64 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 8ae30b2..49ff809 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -230,6 +230,7 @@ void XRef::init() {
   ok = gTrue;
   errCode = errNone;
   entries = NULL;
+  capacity = 0;
   size = 0;
   streamEnds = NULL;
   streamEndsLen = 0;
@@ -311,6 +312,50 @@ XRef::~XRef() {
   }
 }
 
+int XRef::reserve(int newSize)
+{
+  if (newSize > capacity) {
+
+    int realNewSize;
+    for (realNewSize = capacity ? 2 * capacity : 1024;
+          newSize > realNewSize && realNewSize > 0;
+          realNewSize <<= 1) ;
+    if ((realNewSize < 0) ||
+        (realNewSize >= INT_MAX / (int)sizeof(XRefEntry))) {
+      return 0;
+    }
+
+    entries = (XRefEntry *)greallocn(entries, realNewSize, sizeof(XRefEntry));
+    capacity = realNewSize;
+  }
+
+  return capacity;
+}
+
+int XRef::resize(int newSize)
+{
+  if (newSize > size) {
+
+    if (reserve(newSize) < newSize) return size;
+
+    for (int i = size; i < newSize; ++i) {
+      entries[i].offset = 0xffffffff;
+      entries[i].type = xrefEntryFree;
+      entries[i].obj.initNull ();
+      entries[i].updated = false;
+      entries[i].gen = 0;
+    }
+  } else {
+    for (int i = newSize; i < size; i++) {
+      entries[i].obj.free ();
+    }
+  }
+
+  size = newSize;
+
+  return size;
+}
+
 // Read the 'startxref' position.
 Guint XRef::getStartXref() {
   char buf[xrefSearchSize+1];
@@ -398,7 +443,7 @@ GBool XRef::readXRefTable(Parser *parser, Guint *pos) {
   GBool more;
   Object obj, obj2;
   Guint pos2;
-  int first, n, newSize, i;
+  int first, n, i;
 
   while (1) {
     parser->getObj(&obj);
@@ -417,29 +462,13 @@ GBool XRef::readXRefTable(Parser *parser, Guint *pos) {
     n = obj.getInt();
     obj.free();
     if (first < 0 || n < 0 || first + n < 0) {
-      goto err1;
+      goto err0;
     }
     if (first + n > size) {
-      for (newSize = size ? 2 * size : 1024;
-	   first + n > newSize && newSize > 0;
-	   newSize <<= 1) ;
-      if (newSize < 0) {
-	goto err1;
-      }
-      if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
+      if (resize(first + n) != first + n) {
         error(-1, "Invalid 'obj' parameters'");
-        goto err1;
+        goto err0;
       }
- 
-      entries = (XRefEntry *)greallocn(entries, newSize, sizeof(XRefEntry));
-      for (i = size; i < newSize; ++i) {
-	entries[i].offset = 0xffffffff;
-	entries[i].type = xrefEntryFree;
-	entries[i].obj.initNull ();
-	entries[i].updated = false;
-	entries[i].gen = 0;
-      }
-      size = newSize;
     }
     for (i = first; i < first + n; ++i) {
       if (!parser->getObj(&obj)->isInt()) {
@@ -520,6 +549,7 @@ GBool XRef::readXRefTable(Parser *parser, Guint *pos) {
 
  err1:
   obj.free();
+ err0:
   ok = gFalse;
   return gFalse;
 }
@@ -542,19 +572,10 @@ GBool XRef::readXRefStream(Stream *xrefStr, Guint *pos) {
     goto err1;
   }
   if (newSize > size) {
-    if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
-      error(-1, "Invalid 'size' parameter.");
-      return gFalse;
-    }
-    entries = (XRefEntry *)greallocn(entries, newSize, sizeof(XRefEntry));
-    for (i = size; i < newSize; ++i) {
-      entries[i].offset = 0xffffffff;
-      entries[i].type = xrefEntryFree;
-      entries[i].obj.initNull ();
-      entries[i].updated = false;
-      entries[i].gen = 0;
+    if (resize(newSize) != newSize) {
+      error(-1, "Invalid 'size' parameter");
+      goto err0;
     }
-    size = newSize;
   }
 
   if (!dict->lookupNF("W", &obj)->isArray() ||
@@ -627,31 +648,16 @@ GBool XRef::readXRefStream(Stream *xrefStr, Guint *pos) {
 
 GBool XRef::readXRefStreamSection(Stream *xrefStr, int *w, int first, int n) {
   Guint offset;
-  int type, gen, c, newSize, i, j;
+  int type, gen, c, i, j;
 
   if (first + n < 0) {
     return gFalse;
   }
   if (first + n > size) {
-    for (newSize = size ? 2 * size : 1024;
-	 first + n > newSize && newSize > 0;
-	 newSize <<= 1) ;
-    if (newSize < 0) {
-      return gFalse;
-    }
-    if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
-      error(-1, "Invalid 'size' inside xref table.");
+    if (resize(first + n) != size) {
+      error(-1, "Invalid 'size' inside xref table");
       return gFalse;
     }
-    entries = (XRefEntry *)greallocn(entries, newSize, sizeof(XRefEntry));
-    for (i = size; i < newSize; ++i) {
-      entries[i].offset = 0xffffffff;
-      entries[i].type = xrefEntryFree;
-      entries[i].obj.initNull ();
-      entries[i].updated = false;
-      entries[i].gen = 0;
-    }
-    size = newSize;
   }
   for (i = first; i < first + n; ++i) {
     if (w[0] == 0) {
@@ -712,13 +718,13 @@ GBool XRef::constructXRef() {
   int newSize;
   int streamEndsSize;
   char *p;
-  int i;
   GBool gotRoot;
   char* token = NULL;
   bool oneCycle = true;
   int offset = 0;
 
   gfree(entries);
+  capacity = 0;
   size = 0;
   entries = NULL;
 
@@ -800,19 +806,10 @@ GBool XRef::constructXRef() {
 		      error(-1, "Bad object number");
 		      return gFalse;
 		    }
-		    if (newSize >= INT_MAX / (int)sizeof(XRefEntry)) {
-		      error(-1, "Invalid 'obj' parameters.");
+		    if (resize(newSize) != newSize) {
+		      error(-1, "Invalid 'obj' parameters");
 		      return gFalse;
 		    }
-		    entries = (XRefEntry *)
-		        greallocn(entries, newSize, sizeof(XRefEntry));
-		    for (i = size; i < newSize; ++i) {
-		      entries[i].offset = 0xffffffff;
-		      entries[i].type = xrefEntryFree;
-		      entries[i].obj.initNull ();
-		      entries[i].updated = false;
-		    }
-		    size = newSize;
 		  }
 		  if (entries[num].type == xrefEntryFree ||
 		      gen >= entries[num].gen) {
@@ -1085,7 +1082,10 @@ Guint XRef::strToUnsigned(char *s) {
 
 void XRef::add(int num, int gen, Guint offs, GBool used) {
   if (num >= size) {
-    entries = (XRefEntry *)greallocn(entries, num + 1, sizeof(XRefEntry));
+    if (num >= capacity) {
+      entries = (XRefEntry *)greallocn(entries, num + 1, sizeof(XRefEntry));
+      capacity = num + 1;
+    }
     for (int i = size; i < num + 1; ++i) {
       entries[i].offset = 0xffffffff;
       entries[i].type = xrefEntryFree;
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 98db234..f86e5ee 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -136,7 +136,8 @@ private:
   Guint start;			// offset in file (to allow for garbage
 				//   at beginning of file)
   XRefEntry *entries;		// xref entries
-  int size;			// size of <entries> array
+  int capacity;			// size of <entries> array
+  int size;			// number of entries
   int rootNum, rootGen;		// catalog dict
   GBool ok;			// true if xref table is valid
   int errCode;			// error code (if <ok> is false)
@@ -156,6 +157,8 @@ private:
   GBool ownerPasswordOk;	// true if owner password is correct
 
   void init();
+  int reserve(int newSize);
+  int resize(int newSize);
   Guint getStartXref();
   GBool readXRef(Guint *pos);
   GBool readXRefTable(Parser *parser, Guint *pos);
-- 
1.6.4.2


From fa5a0beb6f86a6cea7ba98ef5ef3c04a53b7319d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 28 Apr 2010 12:45:42 +0200
Subject: [PATCH 03/12] Use XRef::add() in XRef::addIndirectObject()

---
 poppler/XRef.cc |    4 +---
 1 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 49ff809..012f91c 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -1127,10 +1127,8 @@ Ref XRef::addIndirectObject (Object* o) {
   XRefEntry *e;
   if (entryIndexToUse == -1) {
     entryIndexToUse = size;
-    size++;
-    entries = (XRefEntry *)greallocn(entries, size, sizeof(XRefEntry));
+    add(entryIndexToUse, 0, 0, gFalse);
     e = &entries[entryIndexToUse];
-    e->gen = 0;
   } else {
     //reuse a free entry
     e = &entries[entryIndexToUse];
-- 
1.6.4.2


From 7ce2bb1aac50315145ebc21ae34b5c37f00e0c35 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 14 Apr 2010 12:20:49 +0200
Subject: [PATCH 04/12] Use XRef::getEntry() to access entries

---
 poppler/XRef.cc |   49 +++++++++++++++++++++++++------------------------
 poppler/XRef.h  |    2 +-
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index 012f91c..d615ec0 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -927,7 +927,7 @@ Object *XRef::fetch(int num, int gen, Object *obj) {
     goto err;
   }
 
-  e = &entries[num];
+  e = getEntry(num);
   if(!e->obj.isNull ()) { //check for updated object
     obj = e->obj.copy(obj);
     return obj;
@@ -1047,20 +1047,20 @@ GBool XRef::getStreamEnd(Guint streamStart, Guint *streamEnd) {
   return gTrue;
 }
 
-int XRef::getNumEntry(Guint offset) const
+int XRef::getNumEntry(Guint offset)
 {
   if (size > 0)
   {
     int res = 0;
-    Guint resOffset = entries[0].offset;
-    XRefEntry e;
+    Guint resOffset = getEntry(0)->offset;
+    XRefEntry *e;
     for (int i = 1; i < size; ++i)
     {
-      e = entries[i];
-      if (e.offset < offset && e.offset >= resOffset)
+      e = getEntry(i);
+      if (e->offset < offset && e->offset >= resOffset)
       {
         res = i;
-        resOffset = e.offset;
+        resOffset = e->offset;
       }
     }
     return res;
@@ -1095,7 +1095,7 @@ void XRef::add(int num, int gen, Guint offs, GBool used) {
     }
     size = num + 1;
   }
-  XRefEntry *e = &entries[num];
+  XRefEntry *e = getEntry(num);
   e->gen = gen;
   e->obj.initNull ();
   e->updated = false;
@@ -1113,25 +1113,26 @@ void XRef::setModifiedObject (Object* o, Ref r) {
     error(-1,"XRef::setModifiedObject on unknown ref: %i, %i\n", r.num, r.gen);
     return;
   }
-  entries[r.num].obj.free();
-  o->copy(&entries[r.num].obj);
-  entries[r.num].updated = true;
+  XRefEntry *e = getEntry(r.num);
+  e->obj.free();
+  o->copy(&(e->obj));
+  e->updated = true;
 }
 
 Ref XRef::addIndirectObject (Object* o) {
   int entryIndexToUse = -1;
   for (int i = 1; entryIndexToUse == -1 && i < size; ++i) {
-    if (entries[i].type == xrefEntryFree) entryIndexToUse = i;
+    if (getEntry(i)->type == xrefEntryFree) entryIndexToUse = i;
   }
 
   XRefEntry *e;
   if (entryIndexToUse == -1) {
     entryIndexToUse = size;
     add(entryIndexToUse, 0, 0, gFalse);
-    e = &entries[entryIndexToUse];
+    e = getEntry(entryIndexToUse);
   } else {
     //reuse a free entry
-    e = &entries[entryIndexToUse];
+    e = getEntry(entryIndexToUse);
     //we don't touch gen number, because it should have been 
     //incremented when the object was deleted
   }
@@ -1147,13 +1148,13 @@ Ref XRef::addIndirectObject (Object* o) {
 
 void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
   //create free entries linked-list
-  if (entries[0].gen != 65535) {
+  if (getEntry(0)->gen != 65535) {
     error(-1, "XRef::writeToFile, entry 0 of the XRef is invalid (gen != 65535)\n");
   }
   int lastFreeEntry = 0;
   for (int i=0; i<size; i++) {
-    if (entries[i].type == xrefEntryFree) {
-      entries[lastFreeEntry].offset = i;
+    if (getEntry(i)->type == xrefEntryFree) {
+      getEntry(lastFreeEntry)->offset = i;
       lastFreeEntry = i;
     }
   }
@@ -1163,10 +1164,10 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
     outStr->printf("xref\r\n");
     outStr->printf("%i %i\r\n", 0, size);
     for (int i=0; i<size; i++) {
-      XRefEntry &e = entries[i];
+      XRefEntry *e = getEntry(i);
 
-      if(e.gen > 65535) e.gen = 65535; //cap generation number to 65535 (required by PDFReference)
-      outStr->printf("%010i %05i %c\r\n", e.offset, e.gen, (e.type==xrefEntryFree)?'f':'n');
+      if(e->gen > 65535) e->gen = 65535; //cap generation number to 65535 (required by PDFReference)
+      outStr->printf("%010i %05i %c\r\n", e->offset, e->gen, (e->type==xrefEntryFree)?'f':'n');
     }
   } else {
     //write the new xref
@@ -1175,16 +1176,16 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
     while (i < size) {
       int j;
       for(j=i; j<size; j++) { //look for consecutive entries
-        if ((entries[j].type == xrefEntryFree) && (entries[j].gen == 0))
+        if ((getEntry(j)->type == xrefEntryFree) && (getEntry(j)->gen == 0))
           break;
       }
       if (j-i != 0)
       {
         outStr->printf("%i %i\r\n", i, j-i);
         for (int k=i; k<j; k++) {
-          XRefEntry &e = entries[k];
-          if(e.gen > 65535) e.gen = 65535; //cap generation number to 65535 (required by PDFReference)
-          outStr->printf("%010i %05i %c\r\n", e.offset, e.gen, (e.type==xrefEntryFree)?'f':'n');
+          XRefEntry *e = getEntry(k);
+          if(e->gen > 65535) e->gen = 65535; //cap generation number to 65535 (required by PDFReference)
+          outStr->printf("%010i %05i %c\r\n", e->offset, e->gen, (e->type==xrefEntryFree)?'f':'n');
         }
         i = j;
       }
diff --git a/poppler/XRef.h b/poppler/XRef.h
index f86e5ee..344b764 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -117,7 +117,7 @@ public:
   GBool getStreamEnd(Guint streamStart, Guint *streamEnd);
 
   // Retuns the entry that belongs to the offset
-  int getNumEntry(Guint offset) const;
+  int getNumEntry(Guint offset);
 
   // Direct access.
   int getSize() { return size; }
-- 
1.6.4.2


From d1849ee55fb37f20db86e7a5cf2f44e63478cd66 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 15 Apr 2010 17:34:13 +0200
Subject: [PATCH 05/12] Read XRef table sections on demand

---
 poppler/XRef.cc |   37 ++++++++++++++++++++++++++++++++-----
 poppler/XRef.h  |    6 ++++--
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index d615ec0..a0c77fc 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -242,7 +242,6 @@ XRef::XRef() {
 }
 
 XRef::XRef(BaseStream *strA) {
-  Guint pos;
   Object obj;
 
   init();
@@ -254,11 +253,11 @@ XRef::XRef(BaseStream *strA) {
   // read the trailer
   str = strA;
   start = str->getStart();
-  pos = getStartXref();
+  prevXRefOffset = pos;
 
   // if there was a problem with the 'startxref' position, try to
   // reconstruct the xref table
-  if (pos == 0) {
+  if (prevXRefOffset == 0) {
     if (!(ok = constructXRef())) {
       errCode = errDamaged;
       return;
@@ -266,7 +265,7 @@ XRef::XRef(BaseStream *strA) {
 
   // read the xref table
   } else {
-    while (readXRef(&pos)) ;
+     readXRef(&prevXRefOffset);
 
     // if there was a problem with the xref table,
     // try to reconstruct it
@@ -278,6 +277,18 @@ XRef::XRef(BaseStream *strA) {
     }
   }
 
+  // set size according to trailer dict
+  trailerDict.dictLookupNF("Size", &obj);
+  if (obj.isInt() && (resize(obj.getInt()) == obj.getInt())) {
+    obj.free();
+  } else {
+    obj.free();
+    if (!(ok = constructXRef())) {
+      errCode = errDamaged;
+      return;
+    }
+  }
+
   // get the root dictionary (catalog) object
   trailerDict.dictLookupNF("Root", &obj);
   if (obj.isRef()) {
@@ -340,7 +351,7 @@ int XRef::resize(int newSize)
 
     for (int i = size; i < newSize; ++i) {
       entries[i].offset = 0xffffffff;
-      entries[i].type = xrefEntryFree;
+      entries[i].type = xrefEntryNone;
       entries[i].obj.initNull ();
       entries[i].updated = false;
       entries[i].gen = 0;
@@ -1194,3 +1205,19 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
   }
 }
 
+XRefEntry *XRef::getEntry(int i)
+{
+  if (entries[i].type == xrefEntryNone) {
+
+    while (readXRef(&prevXRefOffset) && (entries[i].type == xrefEntryNone)) ;
+
+    if (entries[i].type == xrefEntryNone) {
+       error(-1, "Invalid XRef entry");
+       entries[i].type = xrefEntryFree;
+    }
+  }
+
+  return &entries[i];
+}
+
+
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 344b764..a013e5a 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -45,7 +45,8 @@ class ObjectStream;
 enum XRefEntryType {
   xrefEntryFree,
   xrefEntryUncompressed,
-  xrefEntryCompressed
+  xrefEntryCompressed,
+  xrefEntryNone
 };
 
 struct XRefEntry {
@@ -121,7 +122,7 @@ public:
 
   // Direct access.
   int getSize() { return size; }
-  XRefEntry *getEntry(int i) { return &entries[i]; }
+  XRefEntry *getEntry(int i);
   Object *getTrailerDict() { return &trailerDict; }
 
   // Write access
@@ -155,6 +156,7 @@ private:
   int permFlags;		// permission bits
   Guchar fileKey[16];		// file decryption key
   GBool ownerPasswordOk;	// true if owner password is correct
+  Guint prevXRefOffset;		// position of prev XRef section (= next to read)
 
   void init();
   int reserve(int newSize);
-- 
1.6.4.2


From 6932300b90334cc2c43a0dd3fe7f7e5da13f389c Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 18:26:17 +0100
Subject: [PATCH 06/12] Add Linearization dictionary support

---
 CMakeLists.txt           |    2 +
 poppler/Linearization.cc |  225 ++++++++++++++++++++++++++++++++++++++++++++++
 poppler/Linearization.h  |   45 +++++++++
 poppler/Makefile.am      |    2 +
 poppler/PDFDoc.cc        |   13 +++
 poppler/PDFDoc.h         |    5 +
 6 files changed, 292 insertions(+), 0 deletions(-)
 create mode 100644 poppler/Linearization.cc
 create mode 100644 poppler/Linearization.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1eba1fe..a119a6d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -249,6 +249,7 @@ set(poppler_SRCS
   poppler/JBIG2Stream.cc
   poppler/Lexer.cc
   poppler/Link.cc
+  poppler/Linearization.cc
   poppler/LocalPDFDocBuilder.cc
   poppler/NameToCharCode.cc
   poppler/Object.cc
@@ -394,6 +395,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/JBIG2Stream.h
     poppler/Lexer.h
     poppler/Link.h
+    poppler/Linearization.h
     poppler/LocalPDFDocBuilder.h
     poppler/Movie.h
     poppler/NameToCharCode.h
diff --git a/poppler/Linearization.cc b/poppler/Linearization.cc
new file mode 100644
index 0000000..d63e5ad
--- /dev/null
+++ b/poppler/Linearization.cc
@@ -0,0 +1,225 @@
+//========================================================================
+//
+// Linearization.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#include "Linearization.h"
+#include "Parser.h"
+#include "Lexer.h"
+
+//------------------------------------------------------------------------
+// Linearization
+//------------------------------------------------------------------------
+
+Linearization::Linearization (BaseStream *str)
+{
+  Parser *parser;
+  Object obj1, obj2, obj3, obj4, obj5;
+
+  linDict.initNull();
+
+  str->reset();
+  obj1.initNull();
+  parser = new Parser(NULL,
+      new Lexer(NULL, str->makeSubStream(str->getStart(), gFalse, 0, &obj1)),
+      gTrue);
+  parser->getObj(&obj1);
+  parser->getObj(&obj2);
+  parser->getObj(&obj3);
+  parser->getObj(&linDict);
+  parser->getObj(&obj4);
+  if (obj1.isInt() && obj2.isInt() && obj3.isCmd("obj") && linDict.isDict()) {
+    linDict.dictLookup("Linearized", &obj5);
+    if (!(obj5.isNum() && obj5.getNum() > 0)) {
+       linDict.free();
+       linDict.initNull();
+    }
+    obj5.free();
+  }
+  obj4.free();
+  obj4.free();
+  obj3.free();
+  obj2.free();
+  obj1.free();
+  delete parser;
+}
+
+Linearization:: ~Linearization()
+{
+  linDict.free();
+}
+
+Guint Linearization::getLength()
+{
+  if (!linDict.isDict()) return 0;
+
+  int length;
+  if (linDict.getDict()->lookupInt("L", NULL, &length) &&
+      length > 0) {
+    return length;
+  } else {
+    error(-1, "Length in linearization table is invalid");
+    return 0;
+  }
+}
+
+Guint Linearization::getHintsOffset()
+{
+  int hintsOffset;
+
+  Object obj1, obj2;
+  if (linDict.isDict() &&
+      linDict.dictLookup("H", &obj1)->isArray() &&
+      obj1.arrayGetLength()>=2 &&
+      obj1.arrayGet(0, &obj2)->isInt() &&
+      obj2.getInt() > 0) {
+    hintsOffset = obj2.getInt();
+  } else {
+    error(-1, "Hints table offset in linearization table is invalid");
+    hintsOffset = 0;
+  }
+  obj2.free();
+  obj1.free();
+
+  return hintsOffset;
+}
+
+Guint Linearization::getHintsLength()
+{
+  int hintsLength;
+
+  Object obj1, obj2;
+  if (linDict.isDict() &&
+      linDict.dictLookup("H", &obj1)->isArray() &&
+      obj1.arrayGetLength()>=2 &&
+      obj1.arrayGet(1, &obj2)->isInt() &&
+      obj2.getInt() > 0) {
+    hintsLength = obj2.getInt();
+  } else {
+    error(-1, "Hints table length in linearization table is invalid");
+    hintsLength = 0;
+  }
+  obj2.free();
+  obj1.free();
+
+  return hintsLength;
+}
+
+Guint Linearization::getHintsOffset2()
+{
+  int hintsOffset2 = 0; // default to 0
+
+  Object obj1, obj2;
+  if (linDict.isDict() &&
+      linDict.dictLookup("H", &obj1)->isArray() &&
+      obj1.arrayGetLength()>=4) {
+    if (obj1.arrayGet(2, &obj2)->isInt() &&
+        obj2.getInt() > 0) {
+      hintsOffset2 = obj2.getInt();
+    } else {
+      error(-1, "Second hints table offset in linearization table is invalid");
+      hintsOffset2 = 0;
+    }
+  }
+  obj2.free();
+  obj1.free();
+
+  return hintsOffset2;
+}
+
+Guint Linearization::getHintsLength2()
+{
+  int hintsLength2 = 0; // default to 0
+
+  Object obj1, obj2;
+  if (linDict.isDict() &&
+      linDict.dictLookup("H", &obj1)->isArray() &&
+      obj1.arrayGetLength()>=4) {
+    if (obj1.arrayGet(3, &obj2)->isInt() &&
+        obj2.getInt() > 0) {
+      hintsLength2 = obj2.getInt();
+    } else {
+      error(-1, "Second hints table length in linearization table is invalid");
+      hintsLength2 = 0;
+    }
+  }
+  obj2.free();
+  obj1.free();
+
+  return hintsLength2;
+}
+
+int Linearization::getObjectNumberFirst()
+{
+  int objectNumberFirst = 0;
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("O", NULL, &objectNumberFirst) &&
+      objectNumberFirst > 0) {
+    return objectNumberFirst;
+  } else {
+    error(-1, "Object number of first page in linearization table is invalid");
+    return 0;
+  }
+}
+
+Guint Linearization::getEndFirst()
+{
+  int pageEndFirst = 0;
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("E", NULL, &pageEndFirst) &&
+      pageEndFirst > 0) {
+    return pageEndFirst;
+  } else {
+    error(-1, "First page end offset in linearization table is invalid");
+    return 0;
+  }
+}
+
+int Linearization::getNumPages()
+{
+  int numPages = 0;
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("N", NULL, &numPages) &&
+      numPages > 0) {
+    return numPages;
+  } else {
+    error(-1, "Page count in linearization table is invalid");
+    return 0;
+  }
+}
+
+Guint Linearization::getMainXRefEntriesOffset()
+{
+  int mainXRefEntriesOffset = 0;
+  if (linDict.isDict() &&
+      linDict.getDict()->lookupInt("T", NULL, &mainXRefEntriesOffset) &&
+      mainXRefEntriesOffset > 0) {
+    return mainXRefEntriesOffset;
+  } else {
+    error(-1, "Main Xref offset in linearization table is invalid");
+    return 0;
+  }
+}
+
+int Linearization::getPageFirst()
+{
+  int pageFirst = 0; // Optional, defaults to 0.
+
+  if (linDict.isDict()) {
+    linDict.getDict()->lookupInt("P", NULL, &pageFirst);
+  }
+
+  if (pageFirst < 0) {
+    error(-1, "First page in linearization table is invalid");
+    return 0;
+  }
+
+  return pageFirst;
+}
+
+
diff --git a/poppler/Linearization.h b/poppler/Linearization.h
new file mode 100644
index 0000000..6728a75
--- /dev/null
+++ b/poppler/Linearization.h
@@ -0,0 +1,45 @@
+//========================================================================
+//
+// Linearization.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#ifndef LINEARIZATION_H
+#define LINEARIZATION_H
+
+#include "goo/gtypes.h"
+#include "Object.h"
+class BaseStream;
+
+//------------------------------------------------------------------------
+// Linearization
+//------------------------------------------------------------------------
+
+class Linearization {
+public:
+
+  Linearization(BaseStream *str);
+  ~Linearization();
+
+  Guint getLength();
+  Guint getHintsOffset();
+  Guint getHintsLength();
+  Guint getHintsOffset2();
+  Guint getHintsLength2();
+  int getObjectNumberFirst();
+  Guint getEndFirst();
+  int getNumPages();
+  Guint getMainXRefEntriesOffset();
+  int getPageFirst();
+
+private:
+
+  Object linDict;
+
+};
+
+#endif
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 5dd8082..8c1e019 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -209,6 +209,7 @@ poppler_include_HEADERS =	\
 	JArithmeticDecoder.h	\
 	JBIG2Stream.h		\
 	Lexer.h			\
+	Linearization.h 	\
 	Link.h			\
 	LocalPDFDocBuilder.h	\
 	Movie.h                 \
@@ -287,6 +288,7 @@ libpoppler_la_SOURCES =		\
 	JArithmeticDecoder.cc	\
 	JBIG2Stream.cc		\
 	Lexer.cc 		\
+	Linearization.cc 	\
 	Link.cc 		\
 	LocalPDFDocBuilder.cc	\
 	Movie.cc                \
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 2d1477d..fe568a0 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -52,6 +52,7 @@
 #include "Catalog.h"
 #include "Stream.h"
 #include "XRef.h"
+#include "Linearization.h"
 #include "Link.h"
 #include "OutputDev.h"
 #include "Error.h"
@@ -82,6 +83,7 @@ void PDFDoc::init()
   file = NULL;
   str = NULL;
   xref = NULL;
+  linearization = NULL;
   catalog = NULL;
 #ifndef DISABLE_OUTLINE
   outline = NULL;
@@ -242,6 +244,9 @@ PDFDoc::~PDFDoc() {
   if (xref) {
     delete xref;
   }
+  if (linearization) {
+    delete linearization;
+  }
   if (str) {
     delete str;
   }
@@ -412,6 +417,14 @@ void PDFDoc::processLinks(OutputDev *out, int page) {
   catalog->getPage(page)->processLinks(out, catalog);
 }
 
+Linearization *PDFDoc::getLinearization()
+{
+  if (!linearization) {
+    linearization = new Linearization(str);
+  }
+  return linearization;
+}
+
 GBool PDFDoc::isLinearized() {
   Parser *parser;
   Object obj1, obj2, obj3, obj4, obj5;
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 6d7dea2..011f4c0 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -48,6 +48,7 @@ class Links;
 class LinkAction;
 class LinkDest;
 class Outline;
+class Linearization;
 
 enum PDFWriteMode {
   writeStandard,
@@ -89,6 +90,9 @@ public:
   // Get file name.
   GooString *getFileName() { return fileName; }
 
+  // Get the linearization table.
+  Linearization *getLinearization();
+
   // Get the xref table.
   XRef *getXRef() { return xref; }
 
@@ -242,6 +246,7 @@ private:
   void *guiData;
   int pdfMajorVersion;
   int pdfMinorVersion;
+  Linearization *linearization;
   XRef *xref;
   Catalog *catalog;
 #ifndef DISABLE_OUTLINE
-- 
1.6.4.2


From b5c01257baded49e3b9ef7b57a66221ed4df36c8 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 13 Apr 2010 18:51:40 +0200
Subject: [PATCH 07/12] Add getLength() to BaseStream

---
 poppler/Stream.cc |   11 ++++++-----
 poppler/Stream.h  |   11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/poppler/Stream.cc b/poppler/Stream.cc
index 0771e25..f4f9351 100644
--- a/poppler/Stream.cc
+++ b/poppler/Stream.cc
@@ -363,8 +363,9 @@ void FileOutStream::printf(const char *format, ...)
 // BaseStream
 //------------------------------------------------------------------------
 
-BaseStream::BaseStream(Object *dictA) {
+BaseStream::BaseStream(Object *dictA, Guint lengthA) {
   dict = *dictA;
+  length = lengthA;
 }
 
 BaseStream::~BaseStream() {
@@ -677,7 +678,7 @@ GBool StreamPredictor::getNextLine() {
 
 FileStream::FileStream(FILE *fA, Guint startA, GBool limitedA,
 		       Guint lengthA, Object *dictA):
-    BaseStream(dictA) {
+    BaseStream(dictA, lengthA) {
   f = fA;
   start = startA;
   limited = limitedA;
@@ -802,7 +803,7 @@ void FileStream::moveStart(int delta) {
 
 CachedFileStream::CachedFileStream(CachedFile *ccA, Guint startA,
         GBool limitedA, Guint lengthA, Object *dictA)
-  : BaseStream(dictA)
+  : BaseStream(dictA, lengthA)
 {
   cc = ccA;
   start = startA;
@@ -900,7 +901,7 @@ void CachedFileStream::moveStart(int delta)
 //------------------------------------------------------------------------
 
 MemStream::MemStream(char *bufA, Guint startA, Guint lengthA, Object *dictA):
-    BaseStream(dictA) {
+    BaseStream(dictA, lengthA) {
   buf = bufA;
   start = startA;
   length = lengthA;
@@ -964,7 +965,7 @@ void MemStream::moveStart(int delta) {
 
 EmbedStream::EmbedStream(Stream *strA, Object *dictA,
 			 GBool limitedA, Guint lengthA):
-    BaseStream(dictA) {
+    BaseStream(dictA, lengthA) {
   str = strA;
   limited = limitedA;
   length = lengthA;
diff --git a/poppler/Stream.h b/poppler/Stream.h
index 49ae8fb..6896d20 100644
--- a/poppler/Stream.h
+++ b/poppler/Stream.h
@@ -240,7 +240,7 @@ private:
 class BaseStream: public Stream {
 public:
 
-  BaseStream(Object *dictA);
+  BaseStream(Object *dictA, Guint lengthA);
   virtual ~BaseStream();
   virtual Stream *makeSubStream(Guint start, GBool limited,
 				Guint length, Object *dict) = 0;
@@ -250,11 +250,16 @@ public:
   virtual Stream *getUndecodedStream() { return this; }
   virtual Dict *getDict() { return dict.getDict(); }
   virtual GooString *getFileName() { return NULL; }
+  virtual Guint getLength() { return length; }
 
   // Get/set position of first byte of stream within the file.
   virtual Guint getStart() = 0;
   virtual void moveStart(int delta) = 0;
 
+protected:
+
+  Guint length;
+
 private:
 
   Object dict;
@@ -401,7 +406,6 @@ private:
   FILE *f;
   Guint start;
   GBool limited;
-  Guint length;
   char buf[fileStreamBufSize];
   char *bufPtr;
   char *bufEnd;
@@ -446,7 +450,6 @@ private:
   CachedFile *cc;
   Guint start;
   GBool limited;
-  Guint length;
   char buf[cachedStreamBufSize];
   char *bufPtr;
   char *bufEnd;
@@ -490,7 +493,6 @@ private:
 
   char *buf;
   Guint start;
-  Guint length;
   char *bufEnd;
   char *bufPtr;
   GBool needFree;
@@ -530,7 +532,6 @@ private:
 
   Stream *str;
   GBool limited;
-  Guint length;
 };
 
 //------------------------------------------------------------------------
-- 
1.6.4.2


From 3c4f384d782db743f9f1d88834939ac717ad6a32 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 19:16:14 +0100
Subject: [PATCH 08/12] Pass size of file when creating FileStream

---
 poppler/PDFDoc.cc |   19 +++++++++++++++++--
 1 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index fe568a0..0a018fd 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -44,6 +44,7 @@
 #ifdef _WIN32
 #  include <windows.h>
 #endif
+#include <sys/stat.h>
 #include "goo/gstrtod.h"
 #include "goo/GooString.h"
 #include "poppler-config.h"
@@ -98,12 +99,18 @@ PDFDoc::PDFDoc()
 PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword,
 	       GooString *userPassword, void *guiDataA) {
   Object obj;
+  int size = 0;
 
   init();
 
   fileName = fileNameA;
   guiData = guiDataA;
 
+  struct stat buf;
+  if (stat(fileName->getCString(), &buf) == 0) {
+     size = buf.st_size;
+  }
+
   // try to open file
 #ifdef VMS
   file = fopen(fileName->getCString(), "rb", "ctx=stm");
@@ -123,7 +130,7 @@ PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword,
 
   // create stream
   obj.initNull();
-  str = new FileStream(file, 0, gFalse, 0, &obj);
+  str = new FileStream(file, 0, gFalse, size, &obj);
 
   ok = setup(ownerPassword, userPassword);
 }
@@ -154,11 +161,19 @@ PDFDoc::PDFDoc(wchar_t *fileNameA, int fileNameLen, GooString *ownerPassword,
 
   // try to open file
   // NB: _wfopen is only available in NT
+  struct stat buf;
+  int size;
   version.dwOSVersionInfoSize = sizeof(version);
   GetVersionEx(&version);
   if (version.dwPlatformId == VER_PLATFORM_WIN32_NT) {
+    if (_wstat(fileName2, &buf) == 0) {
+      size = buf.st_size;
+    }
     file = _wfopen(fileName2, L"rb");
   } else {
+    if (_wstat(fileName->getCString(), &buf) == 0) {
+      size = buf.st_size;
+    }
     file = fopen(fileName->getCString(), "rb");
   }
   if (!file) {
@@ -169,7 +184,7 @@ PDFDoc::PDFDoc(wchar_t *fileNameA, int fileNameLen, GooString *ownerPassword,
 
   // create stream
   obj.initNull();
-  str = new FileStream(file, 0, gFalse, 0, &obj);
+  str = new FileStream(file, 0, gFalse, size, &obj);
 
   ok = setup(ownerPassword, userPassword);
 }
-- 
1.6.4.2


From 0a1e5029769e3096d9f05cb586dac56e13d4a6f5 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 19:32:59 +0100
Subject: [PATCH 09/12] Improve linearization check

---
 poppler/PDFDoc.cc |   33 +++++----------------------------
 1 files changed, 5 insertions(+), 28 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 0a018fd..35f5cc9 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -441,34 +441,11 @@ Linearization *PDFDoc::getLinearization()
 }
 
 GBool PDFDoc::isLinearized() {
-  Parser *parser;
-  Object obj1, obj2, obj3, obj4, obj5;
-  GBool lin;
-
-  lin = gFalse;
-  obj1.initNull();
-  parser = new Parser(xref,
-	     new Lexer(xref,
-	       str->makeSubStream(str->getStart(), gFalse, 0, &obj1)),
-	     gTrue);
-  parser->getObj(&obj1);
-  parser->getObj(&obj2);
-  parser->getObj(&obj3);
-  parser->getObj(&obj4);
-  if (obj1.isInt() && obj2.isInt() && obj3.isCmd("obj") &&
-      obj4.isDict()) {
-    obj4.dictLookup("Linearized", &obj5);
-    if (obj5.isNum() && obj5.getNum() > 0) {
-      lin = gTrue;
-    }
-    obj5.free();
-  }
-  obj4.free();
-  obj3.free();
-  obj2.free();
-  obj1.free();
-  delete parser;
-  return lin;
+  if ((str->getLength()) &&
+      (getLinearization()->getLength() == str->getLength()))
+    return gTrue;
+  else
+    return gFalse;
 }
 
 int PDFDoc::saveAs(GooString *name, PDFWriteMode mode) {
-- 
1.6.4.2


From 3827314e6450b74eff2acd1e01db48eb144aa76d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 7 Apr 2010 12:05:56 +0200
Subject: [PATCH 10/12] Move getStartXref from XRef to PDFDoc

---
 poppler/PDFDoc.cc |   61 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 poppler/PDFDoc.h  |    5 ++++
 poppler/XRef.cc   |   50 +------------------------------------------
 poppler/XRef.h    |    6 +----
 4 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 35f5cc9..6c4159a 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -34,6 +34,7 @@
 #pragma implementation
 #endif
 
+#include <ctype.h>
 #include <locale.h>
 #include <stdio.h>
 #include <errno.h>
@@ -72,6 +73,9 @@
 #define headerSearchSize 1024	// read this many bytes at beginning of
 				//   file to look for '%PDF'
 
+#define xrefSearchSize 1024	// read this many bytes at end of file
+				//   to look for 'startxref'
+
 //------------------------------------------------------------------------
 // PDFDoc
 //------------------------------------------------------------------------
@@ -89,6 +93,7 @@ void PDFDoc::init()
 #ifndef DISABLE_OUTLINE
   outline = NULL;
 #endif
+  startXRefPos = ~(Guint)0;
 }
 
 PDFDoc::PDFDoc()
@@ -222,7 +227,7 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
   checkHeader();
 
   // read xref table
-  xref = new XRef(str);
+  xref = new XRef(str, getStartXRef());
   if (!xref->isOk()) {
     error(-1, "Couldn't read xref table");
     errCode = xref->getErrorCode();
@@ -889,7 +894,7 @@ void PDFDoc::writeTrailer (Guint uxrefOffset, int uxrefSize, OutStream* outStr,
   trailerDict->set("Root", &obj1);
 
   if (incrUpdate) { 
-    obj1.initInt(xref->getLastXRefPos());
+    obj1.initInt(getStartXRef());
     trailerDict->set("Prev", &obj1);
   }
   
@@ -927,3 +932,55 @@ PDFDoc *PDFDoc::ErrorPDFDoc(int errorCode, GooString *fileNameA)
 
   return doc;
 }
+
+Guint PDFDoc::strToUnsigned(char *s) {
+  Guint x;
+  char *p;
+  int i;
+
+  x = 0;
+  for (p = s, i = 0; *p && isdigit(*p) && i < 10; ++p, ++i) {
+    x = 10 * x + (*p - '0');
+  }
+  return x;
+}
+
+// Read the 'startxref' position.
+Guint PDFDoc::getStartXRef()
+{
+  if (startXRefPos == ~(Guint)0) {
+
+    {
+      char buf[xrefSearchSize+1];
+      char *p;
+      int c, n, i;
+
+      // read last xrefSearchSize bytes
+      str->setPos(xrefSearchSize, -1);
+      for (n = 0; n < xrefSearchSize; ++n) {
+        if ((c = str->getChar()) == EOF) {
+          break;
+        }
+        buf[n] = c;
+      }
+      buf[n] = '\0';
+
+      // find startxref
+      for (i = n - 9; i >= 0; --i) {
+        if (!strncmp(&buf[i], "startxref", 9)) {
+          break;
+        }
+      }
+      if (i < 0) {
+        startXRefPos = 0;
+      }
+      for (p = &buf[i+9]; isspace(*p); ++p) ;
+      startXRefPos =  strToUnsigned(p);
+    }
+
+  }
+
+  return startXRefPos;
+}
+
+
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 011f4c0..d093b59 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -239,6 +239,9 @@ private:
   GBool checkFooter();
   void checkHeader();
   GBool checkEncryption(GooString *ownerPassword, GooString *userPassword);
+  // Get the offset of the start xref table.
+  Guint getStartXRef();
+  Guint strToUnsigned(char *s);
 
   GooString *fileName;
   FILE *file;
@@ -258,6 +261,8 @@ private:
   //If there is an error opening the PDF file with fopen() in the constructor, 
   //then the POSIX errno will be here.
   int fopenErrno;
+
+  Guint startXRefPos;		// offset of last xref table
 };
 
 #endif
diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index a0c77fc..b69bf9a 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -47,11 +47,6 @@
 #include "XRef.h"
 
 //------------------------------------------------------------------------
-
-#define xrefSearchSize 1024	// read this many bytes at end of file
-				//   to look for 'startxref'
-
-//------------------------------------------------------------------------
 // Permission bits
 // Note that the PDF spec uses 1 base (eg bit 3 is 1<<2)
 //------------------------------------------------------------------------
@@ -241,7 +236,7 @@ XRef::XRef() {
   init();
 }
 
-XRef::XRef(BaseStream *strA) {
+XRef::XRef(BaseStream *strA, Guint pos) {
   Object obj;
 
   init();
@@ -367,37 +362,6 @@ int XRef::resize(int newSize)
   return size;
 }
 
-// Read the 'startxref' position.
-Guint XRef::getStartXref() {
-  char buf[xrefSearchSize+1];
-  char *p;
-  int c, n, i;
-
-  // read last xrefSearchSize bytes
-  str->setPos(xrefSearchSize, -1);
-  for (n = 0; n < xrefSearchSize; ++n) {
-    if ((c = str->getChar()) == EOF) {
-      break;
-    }
-    buf[n] = c;
-  }
-  buf[n] = '\0';
-
-  // find startxref
-  for (i = n - 9; i >= 0; --i) {
-    if (!strncmp(&buf[i], "startxref", 9)) {
-      break;
-    }
-  }
-  if (i < 0) {
-    return 0;
-  }
-  for (p = &buf[i+9]; isspace(*p); ++p) ;
-  lastXRefPos = strToUnsigned(p);
-
-  return lastXRefPos;
-}
-
 // Read one xref table section.  Also reads the associated trailer
 // dictionary, and returns the prev pointer (if any).
 GBool XRef::readXRef(Guint *pos) {
@@ -1079,18 +1043,6 @@ int XRef::getNumEntry(Guint offset)
   else return -1;
 }
 
-Guint XRef::strToUnsigned(char *s) {
-  Guint x;
-  char *p;
-  int i;
-
-  x = 0;
-  for (p = s, i = 0; *p && isdigit(*p) && i < 10; ++p, ++i) {
-    x = 10 * x + (*p - '0');
-  }
-  return x;
-}
-
 void XRef::add(int num, int gen, Guint offs, GBool used) {
   if (num >= size) {
     if (num >= capacity) {
diff --git a/poppler/XRef.h b/poppler/XRef.h
index a013e5a..7cd8ebe 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -63,7 +63,7 @@ public:
   // Constructor, create an empty XRef, used for PDF writing
   XRef();
   // Constructor.  Read xref table from stream.
-  XRef(BaseStream *strA);
+  XRef(BaseStream *strA, Guint pos);
 
   // Destructor.
   ~XRef();
@@ -106,9 +106,6 @@ public:
   // Return the number of objects in the xref table.
   int getNumObjects() { return size; }
 
-  // Return the offset of the last xref table.
-  Guint getLastXRefPos() { return lastXRefPos; }
-
   // Return the catalog object reference.
   int getRootNum() { return rootNum; }
   int getRootGen() { return rootGen; }
@@ -143,7 +140,6 @@ private:
   GBool ok;			// true if xref table is valid
   int errCode;			// error code (if <ok> is false)
   Object trailerDict;		// trailer dictionary
-  Guint lastXRefPos;		// offset of last xref table
   Guint *streamEnds;		// 'endstream' positions - only used in
 				//   damaged files
   int streamEndsLen;		// number of valid entries in streamEnds
-- 
1.6.4.2


From caa7dcaa19dc3db4297e0615e2045ba66738a21c Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 7 Apr 2010 12:35:05 +0200
Subject: [PATCH 11/12] Use XRef table at start of linearized document

---
 poppler/PDFDoc.cc |   27 ++++++++++++++++++++++++++-
 1 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 6c4159a..e590f3c 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -73,6 +73,10 @@
 #define headerSearchSize 1024	// read this many bytes at beginning of
 				//   file to look for '%PDF'
 
+#define linearizationSearchSize 1024	// read this many bytes at beginning of
+					// file to look for linearization
+					// dictionary
+
 #define xrefSearchSize 1024	// read this many bytes at end of file
 				//   to look for 'startxref'
 
@@ -950,7 +954,28 @@ Guint PDFDoc::getStartXRef()
 {
   if (startXRefPos == ~(Guint)0) {
 
-    {
+    if (isLinearized()) {
+      char buf[linearizationSearchSize+1];
+      int c, n, i;
+
+      str->setPos(0);
+      for (n = 0; n < linearizationSearchSize; ++n) {
+        if ((c = str->getChar()) == EOF) {
+          break;
+        }
+        buf[n] = c;
+      }
+      buf[n] = '\0';
+
+      // find end of first obj
+      startXRefPos = 0;
+      for (i = 0; i < n; i++) {
+        if (!strncmp("endobj", &buf[i], 6)) {
+           startXRefPos = i+6;
+           break;
+        }
+      }
+    } else {
       char buf[xrefSearchSize+1];
       char *p;
       int c, n, i;
-- 
1.6.4.2


From 05da8579632ed1bfecd1d026cd7f6dec96beac43 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Sun, 25 Apr 2010 17:34:49 +0200
Subject: [PATCH 12/12] Use linearization data to parse XRef entries

---
 poppler/PDFDoc.cc |   12 +++++++++++-
 poppler/PDFDoc.h  |    3 +++
 poppler/XRef.cc   |   45 +++++++++++++++++++++++++++++++++++++++++++--
 poppler/XRef.h    |    6 +++++-
 4 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index e590f3c..172f766 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -231,7 +231,7 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
   checkHeader();
 
   // read xref table
-  xref = new XRef(str, getStartXRef());
+  xref = new XRef(str, getStartXRef(), getMainXRefEntriesOffset());
   if (!xref->isOk()) {
     error(-1, "Couldn't read xref table");
     errCode = xref->getErrorCode();
@@ -1008,4 +1008,14 @@ Guint PDFDoc::getStartXRef()
   return startXRefPos;
 }
 
+Guint PDFDoc::getMainXRefEntriesOffset()
+{
+  Guint mainXRefEntriesOffset = 0;
+
+  if (isLinearized()) {
+    mainXRefEntriesOffset = getLinearization()->getMainXRefEntriesOffset();
+  }
+
+  return mainXRefEntriesOffset;
+}
 
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index d093b59..f6f8c8f 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -241,6 +241,9 @@ private:
   GBool checkEncryption(GooString *ownerPassword, GooString *userPassword);
   // Get the offset of the start xref table.
   Guint getStartXRef();
+  // Get the offset of the entries in the main XRef table of a
+  // linearized document (0 for non linearized documents).
+  Guint getMainXRefEntriesOffset();
   Guint strToUnsigned(char *s);
 
   GooString *fileName;
diff --git a/poppler/XRef.cc b/poppler/XRef.cc
index b69bf9a..ab92cd8 100644
--- a/poppler/XRef.cc
+++ b/poppler/XRef.cc
@@ -230,16 +230,19 @@ void XRef::init() {
   streamEnds = NULL;
   streamEndsLen = 0;
   objStr = NULL;
+  mainXRefEntriesOffset = 0;
+  xRefStream = gFalse;
 }
 
 XRef::XRef() {
   init();
 }
 
-XRef::XRef(BaseStream *strA, Guint pos) {
+XRef::XRef(BaseStream *strA, Guint pos, Guint mainXRefEntriesOffsetA) {
   Object obj;
 
   init();
+  mainXRefEntriesOffset = mainXRefEntriesOffsetA;
 
   encrypted = gFalse;
   permFlags = defPermFlags;
@@ -396,6 +399,9 @@ GBool XRef::readXRef(Guint *pos) {
     if (!parser->getObj(&obj)->isStream()) {
       goto err1;
     }
+    if (trailerDict.isNone()) {
+      xRefStream = gTrue;
+    }
     more = readXRefStream(obj.getStream(), pos);
     obj.free();
 
@@ -1157,11 +1163,46 @@ void XRef::writeToFile(OutStream* outStr, GBool writeAllEntries) {
   }
 }
 
+GBool XRef::parseEntry(Guint offset, XRefEntry *entry)
+{
+  GBool r;
+
+  Object obj;
+  obj.initNull();
+  Parser parser = Parser(NULL, new Lexer(NULL,
+     str->makeSubStream(offset, gFalse, 20, &obj)), gTrue);
+
+  Object obj1, obj2, obj3;
+  if ((parser.getObj(&obj1)->isInt()) &&
+      (parser.getObj(&obj2)->isInt()) &&
+      (parser.getObj(&obj3)->isCmd("n") || obj3.isCmd("f"))) {
+    entry->offset = (Guint) obj1.getInt();
+    entry->gen = obj2.getInt();
+    entry->type = obj3.isCmd("n") ? xrefEntryUncompressed : xrefEntryFree;
+    entry->obj.initNull ();
+    entry->updated = false;
+    r = gTrue;
+  } else {
+    r = gFalse;
+  }
+  obj1.free();
+  obj2.free();
+  obj3.free();
+
+  return r;
+}
+
 XRefEntry *XRef::getEntry(int i)
 {
   if (entries[i].type == xrefEntryNone) {
 
-    while (readXRef(&prevXRefOffset) && (entries[i].type == xrefEntryNone)) ;
+    if ((!xRefStream) && mainXRefEntriesOffset) {
+      if (!parseEntry(mainXRefEntriesOffset + 20*i, &entries[i])) {
+        error(-1, "Failed to parse XRef entry [%d].", i);
+      }
+    } else {
+      while (readXRef(&prevXRefOffset) && (entries[i].type == xrefEntryNone)) ;
+    }
 
     if (entries[i].type == xrefEntryNone) {
        error(-1, "Invalid XRef entry");
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 7cd8ebe..a4548e6 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -63,7 +63,7 @@ public:
   // Constructor, create an empty XRef, used for PDF writing
   XRef();
   // Constructor.  Read xref table from stream.
-  XRef(BaseStream *strA, Guint pos);
+  XRef(BaseStream *strA, Guint pos, Guint mainXRefEntriesOffsetA = 0);
 
   // Destructor.
   ~XRef();
@@ -153,6 +153,8 @@ private:
   Guchar fileKey[16];		// file decryption key
   GBool ownerPasswordOk;	// true if owner password is correct
   Guint prevXRefOffset;		// position of prev XRef section (= next to read)
+  Guint mainXRefEntriesOffset;	// offset of entries in main XRef table
+  GBool xRefStream;		// true if last XRef section is a stream
 
   void init();
   int reserve(int newSize);
@@ -164,6 +166,8 @@ private:
   GBool readXRefStream(Stream *xrefStr, Guint *pos);
   GBool constructXRef();
   Guint strToUnsigned(char *s);
+  GBool parseEntry(Guint offset, XRefEntry *entry);
+
 };
 
 #endif
-- 
1.6.4.2
-------------- next part --------------
From f5c6549ec7d9ad88c5349a5b70f2f0a1f1b91289 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 19:03:54 +0200
Subject: [PATCH 01/15] add PDFDoc::getPage()

---
 poppler/PDFDoc.cc |    8 ++++++++
 poppler/PDFDoc.h  |    3 +++
 2 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 172f766..b52b280 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -1019,3 +1019,11 @@ Guint PDFDoc::getMainXRefEntriesOffset()
   return mainXRefEntriesOffset;
 }
 
+Page *PDFDoc::getPage(int page)
+{
+  if ((page < 1) || page > getNumPages()) return NULL;
+
+  {
+    return catalog->getPage(page);
+  }
+}
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index f6f8c8f..011e6e1 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -127,6 +127,9 @@ public:
   // Return the structure tree root object.
   Object *getStructTreeRoot() { return catalog->getStructTreeRoot(); }
 
+  // Get page.
+  Page *getPage(int page);
+
   // Display a page.
   void displayPage(OutputDev *out, int page,
 		   double hDPI, double vDPI, int rotate,
-- 
1.6.4.2


From b9a4939ea3b4a6ca56e963a11ab2c3d51b97b232 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 19:36:08 +0200
Subject: [PATCH 02/15] Use PDFDoc::getPage() in PDFDoc

---
 poppler/PDFDoc.cc |   28 +++++++++++++++++++++-------
 poppler/PDFDoc.h  |   10 +++++-----
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index b52b280..89dba6f 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -395,8 +395,11 @@ void PDFDoc::displayPage(OutputDev *out, int page,
   if (globalParams->getPrintCommands()) {
     printf("***** page %d *****\n", page);
   }
-  catalog->getPage(page)->display(out, hDPI, vDPI,
-				  rotate, useMediaBox, crop, printing, catalog,
+
+  Page *p = getPage(page);
+  if (!p) return;
+
+  p->display(out, hDPI, vDPI, rotate, useMediaBox, crop, printing, catalog,
 				  abortCheckCbk, abortCheckCbkData,
 				  annotDisplayDecideCbk, annotDisplayDecideCbkData);
 }
@@ -425,8 +428,11 @@ void PDFDoc::displayPageSlice(OutputDev *out, int page,
 			      void *abortCheckCbkData,
                               GBool (*annotDisplayDecideCbk)(Annot *annot, void *user_data),
                               void *annotDisplayDecideCbkData) {
-  catalog->getPage(page)->displaySlice(out, hDPI, vDPI,
-				       rotate, useMediaBox, crop,
+
+  Page *p = getPage(page);
+  if (!p) return;
+
+  p->displaySlice(out, hDPI, vDPI, rotate, useMediaBox, crop,
 				       sliceX, sliceY, sliceW, sliceH,
 				       printing, catalog,
 				       abortCheckCbk, abortCheckCbkData,
@@ -434,11 +440,19 @@ void PDFDoc::displayPageSlice(OutputDev *out, int page,
 }
 
 Links *PDFDoc::getLinks(int page) {
-  return catalog->getPage(page)->getLinks(catalog);
+  Page *p = getPage(page);
+  if (!p) {
+    Object obj;
+    obj.initNull();
+    return new Links (&obj, NULL);
+  }
+  return p->getLinks(catalog);
 }
-  
+
 void PDFDoc::processLinks(OutputDev *out, int page) {
-  catalog->getPage(page)->processLinks(out, catalog);
+  Page *p = getPage(page);
+  if (!p) return;
+  p->processLinks(out, catalog);
 }
 
 Linearization *PDFDoc::getLinearization()
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 011e6e1..8de139f 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -107,15 +107,15 @@ public:
 
   // Get page parameters.
   double getPageMediaWidth(int page)
-    { return catalog->getPage(page)->getMediaWidth(); }
+    { return getPage(page) ? getPage(page)->getMediaWidth() : 0.0 ; }
   double getPageMediaHeight(int page)
-    { return catalog->getPage(page)->getMediaHeight(); }
+    { return getPage(page) ? getPage(page)->getMediaHeight() : 0.0 ; }
   double getPageCropWidth(int page)
-    { return catalog->getPage(page)->getCropWidth(); }
+    { return getPage(page) ? getPage(page)->getCropWidth() : 0.0 ; }
   double getPageCropHeight(int page)
-    { return catalog->getPage(page)->getCropHeight(); }
+    { return getPage(page) ? getPage(page)->getCropHeight() : 0.0 ; }
   int getPageRotate(int page)
-    { return catalog->getPage(page)->getRotate(); }
+    { return getPage(page) ? getPage(page)->getRotate() : 0 ; }
 
   // Get number of pages.
   int getNumPages() { return catalog->getNumPages(); }
-- 
1.6.4.2


From 3f91306f47ba90719a14edd01eaf214cc4f82249 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 20:48:30 +0200
Subject: [PATCH 03/15] Use PDFDoc::getPage() in FontInfo

---
 poppler/FontInfo.cc |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/poppler/FontInfo.cc b/poppler/FontInfo.cc
index 0037e07..c348d14 100644
--- a/poppler/FontInfo.cc
+++ b/poppler/FontInfo.cc
@@ -70,7 +70,9 @@ GooList *FontInfoScanner::scan(int nPages) {
   }
 
   for (int pg = currentPage; pg < lastPage; ++pg) {
-    page = doc->getCatalog()->getPage(pg);
+    page = doc->getPage(pg);
+    if (!page) continue;
+
     if ((resDict = page->getResourceDict())) {
       scanFonts(resDict, result);
     }
-- 
1.6.4.2


From 09a50f0dd352f4fe6141ef5ef2492e1005f9e891 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 11:11:11 +0200
Subject: [PATCH 04/15] Use PDFDoc::getPage() in pdfinfo

---
 utils/pdfinfo.cc |   22 +++++++++++++++-------
 1 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 2abe8b4..a94e4e8 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -257,7 +257,11 @@ int main(int argc, char *argv[]) {
   if (printBoxes) {
     if (multiPage) {
       for (pg = firstPage; pg <= lastPage; ++pg) {
-	page = doc->getCatalog()->getPage(pg);
+	page = doc->getPage(pg);
+	if (!page) {
+          error(-1, "Failed to print boxes for page %d", pg);
+	  continue;
+	}
 	sprintf(buf, "Page %4d MediaBox: ", pg);
 	printBox(buf, page->getMediaBox());
 	sprintf(buf, "Page %4d CropBox:  ", pg);
@@ -270,12 +274,16 @@ int main(int argc, char *argv[]) {
 	printBox(buf, page->getArtBox());
       }
     } else {
-      page = doc->getCatalog()->getPage(firstPage);
-      printBox("MediaBox:       ", page->getMediaBox());
-      printBox("CropBox:        ", page->getCropBox());
-      printBox("BleedBox:       ", page->getBleedBox());
-      printBox("TrimBox:        ", page->getTrimBox());
-      printBox("ArtBox:         ", page->getArtBox());
+      page = doc->getPage(firstPage);
+      if (!page) {
+        error(-1, "Failed to print boxes for page %d", firstPage);
+      } else {
+        printBox("MediaBox:       ", page->getMediaBox());
+        printBox("CropBox:        ", page->getCropBox());
+        printBox("BleedBox:       ", page->getBleedBox());
+        printBox("TrimBox:        ", page->getTrimBox());
+        printBox("ArtBox:         ", page->getArtBox());
+      }
     }
   }
 
-- 
1.6.4.2


From ffcc4834e076b7830bd57eeddf0b6327f1eabb4c Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 11:19:53 +0200
Subject: [PATCH 05/15] Use PDFDoc::getPage() in pdffonts

---
 utils/pdffonts.cc |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/utils/pdffonts.cc b/utils/pdffonts.cc
index 81b20e4..30e25dc 100644
--- a/utils/pdffonts.cc
+++ b/utils/pdffonts.cc
@@ -166,7 +166,11 @@ int main(int argc, char *argv[]) {
   fonts = NULL;
   fontsLen = fontsSize = 0;
   for (pg = firstPage; pg <= lastPage; ++pg) {
-    page = doc->getCatalog()->getPage(pg);
+    page = doc->getPage(pg);
+    if (!page) {
+      error(-1, "Failed to read fonts from page %d", pg);
+      continue;
+    }
     if ((resDict = page->getResourceDict())) {
       scanFonts(resDict, doc);
     }
-- 
1.6.4.2


From d0799186bfed0f1cbd2f916b12028e89d8ed3d62 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 15:52:20 +0200
Subject: [PATCH 06/15] Use PDFDoc::getPage() in glib

---
 glib/poppler-action.cc   |    4 ++--
 glib/poppler-document.cc |   17 ++++++++++-------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/glib/poppler-action.cc b/glib/poppler-action.cc
index ffc1842..a0f8576 100644
--- a/glib/poppler-action.cc
+++ b/glib/poppler-action.cc
@@ -422,13 +422,13 @@ find_annot_movie_for_action (PopplerDocument *document,
 
     xref->fetch (ref->num, ref->gen, &annotObj);
   } else if (link->hasAnnotTitle ()) {
-    Catalog *catalog = document->doc->getCatalog ();
     Object annots;
     GooString *title = link->getAnnotTitle ();
     int i;
 
     for (i = 1; i <= document->doc->getNumPages (); ++i) {
-      Page *p = catalog->getPage (i);
+      Page *p = document->doc->getPage (i);
+      if (!p) continue;
 
       if (p->getAnnots (&annots)->isArray ()) {
         int j;
diff --git a/glib/poppler-document.cc b/glib/poppler-document.cc
index cd6794a..ccb0f1c 100644
--- a/glib/poppler-document.cc
+++ b/glib/poppler-document.cc
@@ -379,15 +379,14 @@ PopplerPage *
 poppler_document_get_page (PopplerDocument  *document,
 			   int               index)
 {
-  Catalog *catalog;
   Page *page;
 
   g_return_val_if_fail (0 <= index &&
 			index < poppler_document_get_n_pages (document),
 			NULL);
 
-  catalog = document->doc->getCatalog();
-  page = catalog->getPage (index + 1);
+  page = document->doc->getPage (index + 1);
+  if (!page) return NULL;
 
   return _poppler_page_new (document, page, index);
 }
@@ -1909,18 +1908,22 @@ PopplerFormField *
 poppler_document_get_form_field (PopplerDocument *document,
 				 gint             id)
 {
-  Catalog *catalog = document->doc->getCatalog();
+  Page *page;
   unsigned pageNum;
   unsigned fieldNum;
   FormPageWidgets *widgets;
   FormWidget *field;
 
   FormWidget::decodeID (id, &pageNum, &fieldNum);
-  
-  widgets = catalog->getPage (pageNum)->getPageWidgets ();
+
+  page = document->doc->getPage (pageNum);
+  if (!page)
+    return NULL;
+
+  widgets = page->getPageWidgets ();
   if (!widgets)
     return NULL;
-  
+
   field = widgets->getWidget (fieldNum);
   if (field)
     return _poppler_form_field_new (document, field);
-- 
1.6.4.2


From 2f5c14573200e8318fdd6b52d9de01f0d733301b Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 22 Apr 2010 17:59:01 +0200
Subject: [PATCH 07/15] Use PDFDoc::getPage() in qt4

Note API change: With this patch, Document::Page(int index) can now return NULL
when poppler fails to create a page. Any application using these bindings
should check the return value.
---
 qt4/src/poppler-document.cc |    8 +++++++-
 qt4/src/poppler-link.cc     |    6 ++++--
 qt4/src/poppler-page.cc     |    3 ++-
 qt4/src/poppler-qt4.h       |    3 +++
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/qt4/src/poppler-document.cc b/qt4/src/poppler-document.cc
index 41d35b6..dc0ce97 100644
--- a/qt4/src/poppler-document.cc
+++ b/qt4/src/poppler-document.cc
@@ -98,7 +98,13 @@ namespace Poppler {
 
     Page *Document::page(int index) const
     {
-	return new Page(m_doc, index);
+	Page *page = new Page(m_doc, index);
+	if (!page->isOk()) {
+	  delete page;
+	  return NULL;
+	}
+
+	return page;
     }
 
     bool Document::isLocked() const
diff --git a/qt4/src/poppler-link.cc b/qt4/src/poppler-link.cc
index de06242..4f54201 100644
--- a/qt4/src/poppler-link.cc
+++ b/qt4/src/poppler-link.cc
@@ -232,9 +232,11 @@ class LinkMoviePrivate : public LinkPrivate
 		
 		int leftAux = 0, topAux = 0, rightAux = 0, bottomAux = 0;
 		
-		if (d->pageNum > 0 && d->pageNum <= data.doc->doc->getNumPages())
+		::Page *page;
+		if (d->pageNum > 0 &&
+		    d->pageNum <= data.doc->doc->getNumPages() &&
+		    (page = data.doc->doc->getPage( d->pageNum )))
 		{
-			::Page *page = data.doc->doc->getCatalog()->getPage( d->pageNum );
 			cvtUserToDev( page, left, top, &leftAux, &topAux );
 			cvtUserToDev( page, right, bottom, &rightAux, &bottomAux );
 			
diff --git a/qt4/src/poppler-page.cc b/qt4/src/poppler-page.cc
index 6dbf50f..335f2ce 100644
--- a/qt4/src/poppler-page.cc
+++ b/qt4/src/poppler-page.cc
@@ -186,8 +186,9 @@ Page::Page(DocumentData *doc, int index) {
   m_page = new PageData();
   m_page->index = index;
   m_page->parentDoc = doc;
-  m_page->page = doc->doc->getCatalog()->getPage(m_page->index + 1);
+  m_page->page = doc->doc->getPage(m_page->index + 1);
   m_page->transition = 0;
+  ok = m_page->page ? true : false;
 }
 
 Page::~Page()
diff --git a/qt4/src/poppler-qt4.h b/qt4/src/poppler-qt4.h
index 117dc43..2e77f48 100644
--- a/qt4/src/poppler-qt4.h
+++ b/qt4/src/poppler-qt4.h
@@ -587,11 +587,14 @@ delete it;
 	**/
 	QString label() const;
 	
+	bool isOk() { return ok; };
+
     private:
 	Q_DISABLE_COPY(Page)
 
 	Page(DocumentData *doc, int index);
 	PageData *m_page;
+        bool ok;
     };
 
 /**
-- 
1.6.4.2


From 0232472448894471e21f994cadee505be9fe625d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Fri, 23 Apr 2010 09:21:23 +0200
Subject: [PATCH 08/15] Use PDFDoc::getPage() in qt

Note API change: With this patch, Document::getPage(int index) can now
return NULL when poppler fails to create a page. Any application using
these bindings should check the return value.
---
 qt/poppler-document.cc |   11 +++++++++++
 qt/poppler-page.cc     |   11 +++++++----
 qt/poppler-qt.h        |    6 +++++-
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/qt/poppler-document.cc b/qt/poppler-document.cc
index bade1d1..1a5892b 100644
--- a/qt/poppler-document.cc
+++ b/qt/poppler-document.cc
@@ -113,6 +113,17 @@ int Document::getNumPages() const
   return data->doc.getNumPages();
 }
 
+Page *Document::getPage(int index) const
+{
+  Page *p = new Page(this, index);
+  if (!p->isOk()) {
+    delete p;
+    return NULL;
+  }
+
+  return p;
+}
+
 QValueList<FontInfo> Document::fonts() const
 {
   QValueList<FontInfo> ourList;
diff --git a/qt/poppler-page.cc b/qt/poppler-page.cc
index a42aa15..ef077a7 100644
--- a/qt/poppler-page.cc
+++ b/qt/poppler-page.cc
@@ -47,6 +47,7 @@ class PageData {
   const Document *doc;
   int index;
   PageTransition *transition;
+  ::Page *page;
 };
 
 Page::Page(const Document *doc, int index) {
@@ -54,6 +55,8 @@ Page::Page(const Document *doc, int index) {
   data->index = index;
   data->doc = doc;
   data->transition = 0;
+  data->page = doc->data->doc.getPage(data->index + 1);
+  ok = data->page ? true : false;
 }
 
 Page::~Page()
@@ -132,7 +135,7 @@ QString Page::getText(const Rectangle &r) const
   output_dev = new TextOutputDev(0, gFalse, gFalse, gFalse);
   data->doc->data->doc.displayPageSlice(output_dev, data->index + 1, 72, 72,
       0, false, false, false, -1, -1, -1, -1);
-  p = data->doc->data->doc.getCatalog()->getPage(data->index + 1);
+  p = data->page;
   if (r.isNull())
   {
     rect = p->getCropBox();
@@ -197,7 +200,7 @@ PageTransition *Page::getTransition() const
   {
     Object o;
     PageTransitionParams params;
-    params.dictObj = data->doc->data->doc.getCatalog()->getPage(data->index + 1)->getTrans(&o);
+    params.dictObj = data->page->getTrans(&o);
     data->transition = new PageTransition(params);
     o.free();
   }
@@ -208,7 +211,7 @@ QSize Page::pageSize() const
 {
   ::Page *p;
 
-  p = data->doc->data->doc.getCatalog()->getPage(data->index + 1);
+  p = data->page;
   if ( ( Page::Landscape == orientation() ) || (Page::Seascape == orientation() ) ) {
     return QSize( (int)p->getCropHeight(), (int)p->getCropWidth() );
   } else {
@@ -218,7 +221,7 @@ QSize Page::pageSize() const
 
 Page::Orientation Page::orientation() const
 {
-  ::Page *p = data->doc->data->doc.getCatalog()->getPage(data->index + 1);
+  ::Page *p = data->page;
 
   int rotation = p->getRotate();
   switch (rotation) {
diff --git a/qt/poppler-qt.h b/qt/poppler-qt.h
index a6b1e6e..549ffd2 100644
--- a/qt/poppler-qt.h
+++ b/qt/poppler-qt.h
@@ -31,6 +31,7 @@
 #include <qdom.h>
 #include <qpixmap.h>
 
+
 namespace Poppler {
 
 class Document;
@@ -198,9 +199,12 @@ class Page {
     */
     QValueList<Link*> links() const;
 
+    bool isOk() { return ok; };
+
   private:
     Page(const Document *doc, int index);
     PageData *data;
+    bool ok;
 };
 
 class DocumentData;
@@ -219,7 +223,7 @@ public:
   
   static Document *load(const QString & filePath);
   
-  Page *getPage(int index) const{ return new Page(this, index); }
+  Page *getPage(int index) const;
   
   int getNumPages() const;
   
-- 
1.6.4.2


From f856758f24e1124181020d82ba4851230c855608 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Fri, 23 Apr 2010 12:07:39 +0200
Subject: [PATCH 09/15] Use PDFDoc::getPage() in PSOutputDev

---
 glib/poppler-page.cc            |    1 +
 poppler/PSOutputDev.cc          |   37 ++++++++++++++++++++++---------------
 poppler/PSOutputDev.h           |   13 ++++++++-----
 qt/poppler-document.cc          |    2 +-
 qt4/src/poppler-ps-converter.cc |    1 +
 utils/pdftohtml.cc              |    2 +-
 utils/pdftops.cc                |    2 +-
 7 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 39645bd..106b636 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1161,6 +1161,7 @@ poppler_page_render_to_ps (PopplerPage   *page,
 
   if (!ps_file->out)
     ps_file->out = new PSOutputDev (ps_file->filename,
+                                    ps_file->document->doc,
                                     ps_file->document->doc->getXRef(),
                                     ps_file->document->doc->getCatalog(),
                                     NULL,
diff --git a/poppler/PSOutputDev.cc b/poppler/PSOutputDev.cc
index 179a494..5e5d3d0 100644
--- a/poppler/PSOutputDev.cc
+++ b/poppler/PSOutputDev.cc
@@ -70,6 +70,7 @@
 #  include "SplashOutputDev.h"
 #endif
 #include "PSOutputDev.h"
+#include "PDFDoc.h"
 
 #ifdef MACOS
 // needed for setting type/creator of MacOS files
@@ -972,7 +973,7 @@ static void outputToFile(void *stream, char *data, int len) {
   fwrite(data, 1, len, (FILE *)stream);
 }
 
-PSOutputDev::PSOutputDev(const char *fileName, XRef *xrefA, Catalog *catalog,
+PSOutputDev::PSOutputDev(const char *fileName, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 			 char *psTitle,
 			 int firstPage, int lastPage, PSOutMode modeA,
 			 int paperWidthA, int paperHeightA, GBool duplexA,
@@ -1033,13 +1034,14 @@ PSOutputDev::PSOutputDev(const char *fileName, XRef *xrefA, Catalog *catalog,
   }
 
   init(outputToFile, f, fileTypeA, psTitle,
-       xrefA, catalog, firstPage, lastPage, modeA,
+       doc, xrefA, catalog, firstPage, lastPage, modeA,
        imgLLXA, imgLLYA, imgURXA, imgURYA, manualCtrlA,
        paperWidthA, paperHeightA, duplexA);
 }
 
 PSOutputDev::PSOutputDev(PSOutputFunc outputFuncA, void *outputStreamA,
 			 char *psTitle,
+			 PDFDoc *doc,
 			 XRef *xrefA, Catalog *catalog,
 			 int firstPage, int lastPage, PSOutMode modeA,
 			 int paperWidthA, int paperHeightA, GBool duplexA,
@@ -1068,18 +1070,17 @@ PSOutputDev::PSOutputDev(PSOutputFunc outputFuncA, void *outputStreamA,
   forceRasterize = forceRasterizeA;
 
   init(outputFuncA, outputStreamA, psGeneric, psTitle,
-       xrefA, catalog, firstPage, lastPage, modeA,
+       doc, xrefA, catalog, firstPage, lastPage, modeA,
        imgLLXA, imgLLYA, imgURXA, imgURYA, manualCtrlA,
        paperWidthA, paperHeightA, duplexA);
 }
 
 void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
-		       PSFileType fileTypeA, char *pstitle, XRef *xrefA, Catalog *catalog,
+		       PSFileType fileTypeA, char *pstitle, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 		       int firstPage, int lastPage, PSOutMode modeA,
 		       int imgLLXA, int imgLLYA, int imgURXA, int imgURYA,
 		       GBool manualCtrlA, int paperWidthA, int paperHeightA,
 		       GBool duplexA) {
-  Page *page;
   PDFRectangle *box;
 
   // initialize
@@ -1099,12 +1100,12 @@ void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
   imgURX = imgURXA;
   imgURY = imgURYA;
   if (paperWidth < 0 || paperHeight < 0) {
-    // this check is needed in case the document has zero pages
-    if (firstPage > 0 && firstPage <= catalog->getNumPages()) {
-      page = catalog->getPage(firstPage);
+    Page *page;
+    if ((page = doc->getPage(firstPage))) {
       paperWidth = (int)ceil(page->getMediaWidth());
       paperHeight = (int)ceil(page->getMediaHeight());
     } else {
+      error(-1, "Invalid page %d", firstPage);
       paperWidth = 1;
       paperHeight = 1;
     }
@@ -1170,14 +1171,16 @@ void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
   embFontList = new GooString();
 
   if (!manualCtrl) {
+    Page *page;
     // this check is needed in case the document has zero pages
-    if (firstPage > 0 && firstPage <= catalog->getNumPages()) {
+    if ((page = doc->getPage(firstPage))) {
       writeHeader(firstPage, lastPage,
-		  catalog->getPage(firstPage)->getMediaBox(),
-		  catalog->getPage(firstPage)->getCropBox(),
-		  catalog->getPage(firstPage)->getRotate(),
+		  page->getMediaBox(),
+		  page->getCropBox(),
+		  page->getRotate(),
 		  pstitle);
     } else {
+      error(-1, "Invalid page %d", firstPage);
       box = new PDFRectangle(0, 0, 1, 1);
       writeHeader(firstPage, lastPage, box, box, 0, pstitle);
       delete box;
@@ -1190,7 +1193,7 @@ void PSOutputDev::init(PSOutputFunc outputFuncA, void *outputStreamA,
       writePS("%%EndProlog\n");
       writePS("%%BeginSetup\n");
     }
-    writeDocSetup(catalog, firstPage, lastPage, duplexA);
+    writeDocSetup(doc, catalog, firstPage, lastPage, duplexA);
     if (mode != psModeForm) {
       writePS("%%EndSetup\n");
     }
@@ -1400,7 +1403,7 @@ void PSOutputDev::writeXpdfProcset() {
   }
 }
 
-void PSOutputDev::writeDocSetup(Catalog *catalog,
+void PSOutputDev::writeDocSetup(PDFDoc *doc, Catalog *catalog,
 				int firstPage, int lastPage,
                                 GBool duplexA) {
   Page *page;
@@ -1416,7 +1419,11 @@ void PSOutputDev::writeDocSetup(Catalog *catalog,
     writePS("xpdf begin\n");
   }
   for (pg = firstPage; pg <= lastPage; ++pg) {
-    page = catalog->getPage(pg);
+    page = doc->getPage(pg);
+    if (!page) {
+      error(-1, "Failed writing resources for page %d", pg);
+      continue;
+    }
     if ((resDict = page->getResourceDict())) {
       setupResources(resDict);
     }
diff --git a/poppler/PSOutputDev.h b/poppler/PSOutputDev.h
index 38c838c..a84a638 100644
--- a/poppler/PSOutputDev.h
+++ b/poppler/PSOutputDev.h
@@ -50,6 +50,7 @@ struct PSFont8Info;
 struct PSFont16Enc;
 class PSOutCustomColor;
 class Function;
+class PDFDoc;
 
 //------------------------------------------------------------------------
 // PSOutputDev
@@ -75,7 +76,7 @@ class PSOutputDev: public OutputDev {
 public:
 
   // Open a PostScript output file, and write the prolog.
-  PSOutputDev(const char *fileName, XRef *xrefA, Catalog *catalog,
+  PSOutputDev(const char *fileName, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 	      char *psTitle,
 	      int firstPage, int lastPage, PSOutMode modeA,
 	      int paperWidthA = -1, int paperHeightA = -1,
@@ -88,6 +89,7 @@ public:
   // Open a PSOutputDev that will write to a generic stream.
   PSOutputDev(PSOutputFunc outputFuncA, void *outputStreamA,
 	      char *psTitle,
+	      PDFDoc *doc,
 	      XRef *xrefA, Catalog *catalog,
 	      int firstPage, int lastPage, PSOutMode modeA,
 	      int paperWidthA = -1, int paperHeightA = -1,
@@ -145,9 +147,6 @@ public:
   // Write the Xpdf procset.
   void writeXpdfProcset();
 
-  // Write the document-level setup.
-  void writeDocSetup(Catalog *catalog, int firstPage, int lastPage, GBool duplexA);
-
   // Write the trailer for the current page.
   void writePageTrailer();
 
@@ -287,7 +286,7 @@ public:
 private:
 
   void init(PSOutputFunc outputFuncA, void *outputStreamA,
-	    PSFileType fileTypeA, char *pstitle, XRef *xrefA, Catalog *catalog,
+	    PSFileType fileTypeA, char *pstitle, PDFDoc *doc, XRef *xrefA, Catalog *catalog,
 	    int firstPage, int lastPage, PSOutMode modeA,
 	    int imgLLXA, int imgLLYA, int imgURXA, int imgURYA,
 	    GBool manualCtrlA, int paperWidthA, int paperHeightA,
@@ -341,6 +340,10 @@ private:
 		    double *x1, double *y1);
 #endif
   void cvtFunction(Function *func);
+
+  // Write the document-level setup.
+  void writeDocSetup(PDFDoc *doc, Catalog *catalog, int firstPage, int lastPage, GBool duplexA);
+
   void writePSChar(char c);
   void writePS(char *s);
   void writePSFmt(const char *fmt, ...);
diff --git a/qt/poppler-document.cc b/qt/poppler-document.cc
index 1a5892b..03d01fa 100644
--- a/qt/poppler-document.cc
+++ b/qt/poppler-document.cc
@@ -325,7 +325,7 @@ bool Document::print(const QString &fileName, QValueList<int> pageList, double h
 
 bool Document::print(const QString &file, QValueList<int> pageList, double hDPI, double vDPI, int rotate, int paperWidth, int paperHeight)
 {
-  PSOutputDev *psOut = new PSOutputDev(file.latin1(), data->doc.getXRef(), data->doc.getCatalog(), NULL, 1, data->doc.getNumPages(), psModePS, paperWidth, paperHeight);
+  PSOutputDev *psOut = new PSOutputDev(file.latin1(), &(data->doc), data->doc.getXRef(), data->doc.getCatalog(), NULL, 1, data->doc.getNumPages(), psModePS, paperWidth, paperHeight);
   
   if (psOut->isOk()) {
     QValueList<int>::iterator it;
diff --git a/qt4/src/poppler-ps-converter.cc b/qt4/src/poppler-ps-converter.cc
index 7a1957b..9dc82ec 100644
--- a/qt4/src/poppler-ps-converter.cc
+++ b/qt4/src/poppler-ps-converter.cc
@@ -195,6 +195,7 @@ bool PSConverter::convert()
 	
 	PSOutputDev *psOut = new PSOutputDev(outputToQIODevice, dev,
 	                                     pstitlechar,
+	                                     d->document->doc,
 	                                     d->document->doc->getXRef(),
 	                                     d->document->doc->getCatalog(),
 	                                     1,
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index 3c74c6e..0558e5c 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -350,7 +350,7 @@ int main(int argc, char *argv[]) {
     psFileName = new GooString(htmlFileName->getCString());
     psFileName->append(".ps");
 
-    psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
+    psOut = new PSOutputDev(psFileName->getCString(), doc, doc->getXRef(),
 			    doc->getCatalog(), NULL, firstPage, lastPage, psModePS, w, h);
     psOut->setDisplayText(gFalse);
     doc->displayPages(psOut, firstPage, lastPage, 72, 72, 0,
diff --git a/utils/pdftops.cc b/utils/pdftops.cc
index 0bc43a1..8231458 100644
--- a/utils/pdftops.cc
+++ b/utils/pdftops.cc
@@ -359,7 +359,7 @@ int main(int argc, char *argv[]) {
   }
 
   // write PostScript file
-  psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
+  psOut = new PSOutputDev(psFileName->getCString(), doc, doc->getXRef(),
 			  doc->getCatalog(), NULL, firstPage, lastPage, mode,
 			  paperWidth,
 			  paperHeight,
-- 
1.6.4.2


From e046bf1778582a613205c231e18efe37e37733dc Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Sat, 24 Apr 2010 10:17:56 +0200
Subject: [PATCH 10/15] Use PDFDoc::getPage() in HtmlOutputDev

---
 utils/HtmlOutputDev.cc |    2 +-
 utils/HtmlOutputDev.h  |    2 ++
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 81f8b88..4f7dff6 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -1093,7 +1093,7 @@ void HtmlOutputDev::startPage(int pageNum, GfxState *state) {
 
 
 void HtmlOutputDev::endPage() {
-  Links *linksList = catalog->getPage(pageNum)->getLinks(catalog);
+  Links *linksList = docPage->getLinks(catalog);
   for (int i = 0; i < linksList->getNumLinks(); ++i)
   {
       doProcessLink(linksList->getLink(i));
diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h
index 24ccfd1..48b04c6 100644
--- a/utils/HtmlOutputDev.h
+++ b/utils/HtmlOutputDev.h
@@ -256,6 +256,7 @@ public:
                                GBool (* abortCheckCbk)(void *data) = NULL,
                                void * abortCheckCbkData = NULL)
   {
+   docPage = page;
    catalog = catalogA;
    return gTrue;
   }
@@ -323,6 +324,7 @@ private:
   GooString *docTitle;
   GooList *glMetaVars;
   Catalog *catalog;
+  Page *docPage;
   friend class HtmlPage;
 };
 
-- 
1.6.4.2


From 0ae15644df43ca7d5ef4c1a52f8731e6f4a14fbf Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 31 Mar 2010 14:39:57 +0200
Subject: [PATCH 11/15] Parse page tree on demand

---
 poppler/Catalog.cc |  266 ++++++++++++++++++++++++++++++++++-----------------
 poppler/Catalog.h  |   12 ++-
 2 files changed, 185 insertions(+), 93 deletions(-)

diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc
index 900cdd7..416fb66 100644
--- a/poppler/Catalog.cc
+++ b/poppler/Catalog.cc
@@ -59,9 +59,6 @@ Catalog::Catalog(XRef *xrefA) {
   Object catDict, pagesDict, pagesDictRef;
   Object obj, obj2;
   Object optContentProps;
-  char *alreadyRead;
-  int numPages0;
-  int i;
 
   ok = gTrue;
   xref = xrefA;
@@ -78,6 +75,12 @@ Catalog::Catalog(XRef *xrefA) {
   embeddedFileNameTree = NULL;
   jsNameTree = NULL;
 
+  pagesList = NULL;
+  pagesRefList = NULL;
+  attrsList = NULL;
+  kidsIdxList = NULL;
+  lastCachedPage = 0;
+
   xref->getCatalog(&catDict);
   if (!catDict.isDict()) {
     error(-1, "Catalog object is wrong type (%s)", catDict.getTypeName());
@@ -100,31 +103,11 @@ Catalog::Catalog(XRef *xrefA) {
   if (!obj.isNum()) {
     error(-1, "Page count in top-level pages object is wrong type (%s)",
 	  obj.getTypeName());
-    pagesSize = numPages0 = 0;
+    numPages = 0;
   } else {
-    pagesSize = numPages0 = (int)obj.getNum();
+    numPages = (int)obj.getNum();
   }
   obj.free();
-  pages = (Page **)gmallocn(pagesSize, sizeof(Page *));
-  pageRefs = (Ref *)gmallocn(pagesSize, sizeof(Ref));
-  for (i = 0; i < pagesSize; ++i) {
-    pages[i] = NULL;
-    pageRefs[i].num = -1;
-    pageRefs[i].gen = -1;
-  }
-  alreadyRead = (char *)gmalloc(xref->getNumObjects());
-  memset(alreadyRead, 0, xref->getNumObjects());
-  if (catDict.dictLookupNF("Pages", &pagesDictRef)->isRef() &&
-      pagesDictRef.getRefNum() >= 0 &&
-      pagesDictRef.getRefNum() < xref->getNumObjects()) {
-    alreadyRead[pagesDictRef.getRefNum()] = 1;
-  }
-  pagesDictRef.free();
-  numPages = readPageTree(pagesDict.getDict(), NULL, 0, alreadyRead);
-  gfree(alreadyRead);
-  if (numPages != numPages0) {
-    error(-1, "Page count in top-level pages object is incorrect");
-  }
   pagesDict.free();
 
   // read base URI
@@ -163,6 +146,10 @@ Catalog::Catalog(XRef *xrefA) {
 Catalog::~Catalog() {
   int i;
 
+  delete kidsIdxList;
+  delete attrsList;
+  delete pagesRefList;
+  delete pagesList;
   if (pages) {
     for (i = 0; i < pagesSize; ++i) {
       if (pages[i]) {
@@ -225,91 +212,192 @@ GooString *Catalog::readMetadata() {
   return s;
 }
 
-int Catalog::readPageTree(Dict *pagesDict, PageAttrs *attrs, int start,
-			  char *alreadyRead) {
-  Object kids;
-  Object kid;
-  Object kidRef;
-  PageAttrs *attrs1, *attrs2;
-  Page *page;
-  int i, j;
-
-  attrs1 = new PageAttrs(attrs, pagesDict);
-  pagesDict->lookup("Kids", &kids);
-  if (!kids.isArray()) {
-    error(-1, "Kids object (page %d) is wrong type (%s)",
-	  start+1, kids.getTypeName());
-    return start;
-  }
-  for (i = 0; i < kids.arrayGetLength(); ++i) {
-    kids.arrayGetNF(i, &kidRef);
-    if (kidRef.isRef() &&
-	kidRef.getRefNum() >= 0 &&
-	kidRef.getRefNum() < xref->getNumObjects()) {
-      if (alreadyRead[kidRef.getRefNum()]) {
-	error(-1, "Loop in Pages tree");
-	kidRef.free();
-	continue;
+Page *Catalog::getPage(int i)
+{
+  if (i < 1) return NULL;
+
+  if (i > lastCachedPage) {
+     if (cachePageTree(i) == gFalse) return NULL;
+  }
+  return pages[i-1];
+}
+
+Ref *Catalog::getPageRef(int i)
+{
+  if (i < 1) return NULL;
+
+  if (i > lastCachedPage) {
+     if (cachePageTree(i) == gFalse) return NULL;
+  }
+  return &pageRefs[i-1];
+}
+
+GBool Catalog::cachePageTree(int page)
+{
+  Dict *pagesDict;
+
+  if (pagesList == NULL) {
+
+    Object catDict;
+    Ref pagesRef;
+
+    xref->getCatalog(&catDict);
+
+    Object pagesDictRef;
+    if (catDict.dictLookupNF("Pages", &pagesDictRef)->isRef() &&
+        pagesDictRef.getRefNum() >= 0 &&
+        pagesDictRef.getRefNum() < xref->getNumObjects()) {
+      pagesRef = pagesDictRef.getRef();
+      pagesDictRef.free();
+    } else {
+       error(-1, "Catalog dictionary does not contain a valid \"Pages\" entry");
+       pagesDictRef.free();
+       return gFalse;
+    }
+
+    Object obj;
+    catDict.dictLookup("Pages", &obj);
+    // This should really be isDict("Pages"), but I've seen at least one
+    // PDF file where the /Type entry is missing.
+    if (obj.isDict()) {
+      obj.getDict()->incRef();
+      pagesDict = obj.getDict();
+      obj.free();
+    }
+    else {
+      error(-1, "Top-level pages object is wrong type (%s)", obj.getTypeName());
+      obj.free();
+      return gFalse;
+    }
+
+    pagesSize = numPages;
+    pages = (Page **)gmallocn(pagesSize, sizeof(Page *));
+    pageRefs = (Ref *)gmallocn(pagesSize, sizeof(Ref));
+    for (int i = 0; i < pagesSize; ++i) {
+      pages[i] = NULL;
+      pageRefs[i].num = -1;
+      pageRefs[i].gen = -1;
+    }
+
+    pagesList = new GooVector<Dict *>();
+    pagesList->push_back(pagesDict);
+    pagesRefList = new GooVector<Ref>();
+    pagesRefList->push_back(pagesRef);
+    attrsList = new GooVector<PageAttrs *>();
+    attrsList->push_back(new PageAttrs(NULL, pagesDict));
+    kidsIdxList = new GooVector<int>();
+    kidsIdxList->push_back(0);
+    lastCachedPage = 0;
+
+  }
+
+  while(1) {
+
+    if (page <= lastCachedPage) return gTrue;
+
+    if (pagesList->empty()) return gFalse;
+
+    pagesDict = pagesList->back();
+    Object kids;
+    pagesDict->lookup("Kids", &kids);
+    if (!kids.isArray()) {
+      error(-1, "Kids object (page %d) is wrong type (%s)",
+            lastCachedPage+1, kids.getTypeName());
+      kids.free();
+      return gFalse;
+    }
+
+    int kidsIdx = kidsIdxList->back();
+    if (kidsIdx >= kids.arrayGetLength()) {
+       delete pagesList->back();
+       pagesList->pop_back();
+       pagesRefList->pop_back();
+       delete attrsList->back();
+       attrsList->pop_back();
+       kidsIdxList->pop_back();
+       if (!kidsIdxList->empty()) kidsIdxList->back()++;
+       kids.free();
+       continue;
+    }
+
+    Object kidRef;
+    kids.arrayGetNF(kidsIdx, &kidRef);
+    if (!kidRef.isRef()) {
+      error(-1, "Kid object (page %d) is not an indirect reference (%s)",
+            lastCachedPage+1, kidRef.getTypeName());
+      kidRef.free();
+      kids.free();
+      return gFalse;
+    }
+
+    for (size_t i = 0; i < pagesRefList->size(); i++) {
+      if (((*pagesRefList)[i]).num == kidRef.getRefNum()) {
+         error(-1, "Loop in Pages tree");
+         kidRef.free();
+         kids.free();
+         kidsIdxList->back()++;
+         continue;
       }
-      alreadyRead[kidRef.getRefNum()] = 1;
     }
-    kids.arrayGet(i, &kid);
+
+    Object kid;
+    kids.arrayGet(kidsIdx, &kid);
+    kids.free();
     if (kid.isDict("Page")) {
-      attrs2 = new PageAttrs(attrs1, kid.getDict());
-      page = new Page(xref, start+1, kid.getDict(), kidRef.getRef(), attrs2, getForm());
-      if (!page->isOk()) {
-	++start;
-	goto err3;
-      }
-      if (start >= pagesSize) {
-	pagesSize += 32;
-	pages = (Page **)greallocn(pages, pagesSize, sizeof(Page *));
-	pageRefs = (Ref *)greallocn(pageRefs, pagesSize, sizeof(Ref));
-	for (j = pagesSize - 32; j < pagesSize; ++j) {
-	  pages[j] = NULL;
-	  pageRefs[j].num = -1;
-	  pageRefs[j].gen = -1;
-	}
+      PageAttrs *attrs = new PageAttrs(attrsList->back(), kid.getDict());
+      Page *p = new Page(xref, lastCachedPage+1, kid.getDict(),
+                     kidRef.getRef(), attrs, form);
+      if (!p->isOk()) {
+        error(-1, "Failed to create page (page %d)", lastCachedPage+1);
+        delete p;
+        kidRef.free();
+        kid.free();
+        return gFalse;
       }
-      pages[start] = page;
-      if (kidRef.isRef()) {
-	pageRefs[start].num = kidRef.getRefNum();
-	pageRefs[start].gen = kidRef.getRefGen();
+
+      if (lastCachedPage >= numPages) {
+        error(-1, "Page count in top-level pages object is incorrect");
+        kidRef.free();
+        kid.free();
+        return gFalse;
       }
-      ++start;
+
+      pages[lastCachedPage] = p;
+      pageRefs[lastCachedPage].num = kidRef.getRefNum();
+      pageRefs[lastCachedPage].gen = kidRef.getRefGen();
+
+      lastCachedPage++;
+      kidsIdxList->back()++;
+
     // This should really be isDict("Pages"), but I've seen at least one
     // PDF file where the /Type entry is missing.
     } else if (kid.isDict()) {
-      if ((start = readPageTree(kid.getDict(), attrs1, start, alreadyRead))
-	  < 0)
-	goto err2;
+      attrsList->push_back(new PageAttrs(attrsList->back(), kid.getDict()));
+      pagesRefList->push_back(kidRef.getRef());
+      kid.getDict()->incRef();
+      pagesList->push_back(kid.getDict());
+      kidsIdxList->push_back(0);
     } else {
       error(-1, "Kid object (page %d) is wrong type (%s)",
-	    start+1, kid.getTypeName());
+            lastCachedPage+1, kid.getTypeName());
+      kidRef.free();
+      kid.free();
+      return gFalse;
     }
-    kid.free();
     kidRef.free();
+    kid.free();
+
   }
-  delete attrs1;
-  kids.free();
-  return start;
 
- err3:
-  delete page;
- err2:
-  kid.free();
-  kidRef.free();
-  kids.free();
-  delete attrs1;
-  ok = gFalse;
-  return -1;
+  return gFalse;
 }
 
 int Catalog::findPage(int num, int gen) {
   int i;
 
   for (i = 0; i < numPages; ++i) {
-    if (pageRefs[i].num == num && pageRefs[i].gen == gen)
+    Ref *ref = getPageRef(i+1);
+    if (ref->num == num && ref->gen == gen)
       return i + 1;
   }
   return 0;
diff --git a/poppler/Catalog.h b/poppler/Catalog.h
index 2cab80a..5a25109 100644
--- a/poppler/Catalog.h
+++ b/poppler/Catalog.h
@@ -151,10 +151,10 @@ public:
   int getNumPages() { return numPages; }
 
   // Get a page.
-  Page *getPage(int i) { return pages[i-1]; }
+  Page *getPage(int i);
 
   // Get the reference for a page object.
-  Ref *getPageRef(int i) { return &pageRefs[i-1]; }
+  Ref *getPageRef(int i);
 
   // Return base URI, or NULL if none.
   GooString *getBaseURI() { return baseURI; }
@@ -232,6 +232,11 @@ private:
   XRef *xref;			// the xref table for this PDF file
   Page **pages;			// array of pages
   Ref *pageRefs;		// object ID for each page
+  int lastCachedPage;
+  GooVector<Dict *> *pagesList;
+  GooVector<Ref> *pagesRefList;
+  GooVector<PageAttrs *> *attrsList;
+  GooVector<int> *kidsIdxList;
   Form *form;
   int numPages;			// number of pages
   int pagesSize;		// size of pages array
@@ -251,8 +256,7 @@ private:
   PageMode pageMode;		// page mode
   PageLayout pageLayout;	// page layout
 
-  int readPageTree(Dict *pages, PageAttrs *attrs, int start,
-		   char *alreadyRead);
+  GBool cachePageTree(int page); // Cache first <page> pages.
   Object *findDestInTree(Object *tree, GooString *name, Object *obj);
 
   Object *getNames();
-- 
1.6.4.2


From e9ee24e6fddfa520407161f49a732c9b16f42251 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 22:01:41 +0100
Subject: [PATCH 12/15] Parse number of pages on demand

---
 poppler/Catalog.cc |   70 +++++++++++++++++++++++++++++++--------------------
 poppler/Catalog.h  |    2 +-
 2 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc
index 416fb66..536e474 100644
--- a/poppler/Catalog.cc
+++ b/poppler/Catalog.cc
@@ -64,7 +64,8 @@ Catalog::Catalog(XRef *xrefA) {
   xref = xrefA;
   pages = NULL;
   pageRefs = NULL;
-  numPages = pagesSize = 0;
+  numPages = -1;
+  pagesSize = 0;
   baseURI = NULL;
   pageLabelInfo = NULL;
   form = NULL;
@@ -89,27 +90,6 @@ Catalog::Catalog(XRef *xrefA) {
   // get the AcroForm dictionary
   catDict.dictLookup("AcroForm", &acroForm);
 
-  // read page tree
-  catDict.dictLookup("Pages", &pagesDict);
-  // This should really be isDict("Pages"), but I've seen at least one
-  // PDF file where the /Type entry is missing.
-  if (!pagesDict.isDict()) {
-    error(-1, "Top-level pages object is wrong type (%s)",
-	  pagesDict.getTypeName());
-    goto err2;
-  }
-  pagesDict.dictLookup("Count", &obj);
-  // some PDF files actually use real numbers here ("/Count 9.0")
-  if (!obj.isNum()) {
-    error(-1, "Page count in top-level pages object is wrong type (%s)",
-	  obj.getTypeName());
-    numPages = 0;
-  } else {
-    numPages = (int)obj.getNum();
-  }
-  obj.free();
-  pagesDict.free();
-
   // read base URI
   if (catDict.dictLookup("URI", &obj)->isDict()) {
     if (obj.dictLookup("Base", &obj2)->isString()) {
@@ -136,8 +116,6 @@ Catalog::Catalog(XRef *xrefA) {
   catDict.free();
   return;
 
- err2:
-  pagesDict.free();
  err1:
   catDict.free();
   ok = gFalse;
@@ -270,7 +248,7 @@ GBool Catalog::cachePageTree(int page)
       return gFalse;
     }
 
-    pagesSize = numPages;
+    pagesSize = getNumPages();
     pages = (Page **)gmallocn(pagesSize, sizeof(Page *));
     pageRefs = (Ref *)gmallocn(pagesSize, sizeof(Ref));
     for (int i = 0; i < pagesSize; ++i) {
@@ -395,7 +373,7 @@ GBool Catalog::cachePageTree(int page)
 int Catalog::findPage(int num, int gen) {
   int i;
 
-  for (i = 0; i < numPages; ++i) {
+  for (i = 0; i < getNumPages(); ++i) {
     Ref *ref = getPageRef(i+1);
     if (ref->num == num && ref->gen == gen)
       return i + 1;
@@ -719,7 +697,7 @@ GBool Catalog::labelToIndex(GooString *label, int *index)
       return gFalse;
   }
 
-  if (*index < 0 || *index >= numPages)
+  if (*index < 0 || *index >= getNumPages())
     return gFalse;
 
   return gTrue;
@@ -729,7 +707,7 @@ GBool Catalog::indexToLabel(int index, GooString *label)
 {
   char buffer[32];
 
-  if (index < 0 || index >= numPages)
+  if (index < 0 || index >= getNumPages())
     return gFalse;
 
   PageLabelInfo *pli = getPageLabelInfo();
@@ -845,6 +823,42 @@ EmbFile::EmbFile(Object *efDict, GooString *description)
     m_mimetype = new GooString();
 }
 
+int Catalog::getNumPages()
+{
+  if (numPages == -1)
+  {
+    Object catDict, pagesDict, obj;
+
+    xref->getCatalog(&catDict);
+    catDict.dictLookup("Pages", &pagesDict);
+    catDict.free();
+
+    // This should really be isDict("Pages"), but I've seen at least one
+    // PDF file where the /Type entry is missing.
+    if (!pagesDict.isDict()) {
+      error(-1, "Top-level pages object is wrong type (%s)",
+          pagesDict.getTypeName());
+      pagesDict.free();
+      return 0;
+    }
+
+    pagesDict.dictLookup("Count", &obj);
+    // some PDF files actually use real numbers here ("/Count 9.0")
+    if (!obj.isNum()) {
+      error(-1, "Page count in top-level pages object is wrong type (%s)",
+         obj.getTypeName());
+      numPages = 0;
+    } else {
+      numPages = (int)obj.getNum();
+    }
+
+    obj.free();
+    pagesDict.free();
+  }
+
+  return numPages;
+}
+
 PageLabelInfo *Catalog::getPageLabelInfo()
 {
   if (!pageLabelInfo) {
diff --git a/poppler/Catalog.h b/poppler/Catalog.h
index 5a25109..8bca80b 100644
--- a/poppler/Catalog.h
+++ b/poppler/Catalog.h
@@ -148,7 +148,7 @@ public:
   GBool isOk() { return ok; }
 
   // Get number of pages.
-  int getNumPages() { return numPages; }
+  int getNumPages();
 
   // Get a page.
   Page *getPage(int i);
-- 
1.6.4.2


From 27842f9134cf63cc9d75f90fca0c8b7dd4f3ba60 Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Thu, 25 Mar 2010 18:53:54 +0100
Subject: [PATCH 13/15] Get number of pages from linearization table

---
 poppler/PDFDoc.cc |    9 +++++++++
 poppler/PDFDoc.h  |    2 +-
 2 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 89dba6f..8f105fd 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -1033,6 +1033,15 @@ Guint PDFDoc::getMainXRefEntriesOffset()
   return mainXRefEntriesOffset;
 }
 
+int PDFDoc::getNumPages()
+{
+  if (isLinearized()) {
+    return getLinearization()->getNumPages();
+  } else {
+    return catalog->getNumPages();
+  }
+}
+
 Page *PDFDoc::getPage(int page)
 {
   if ((page < 1) || page > getNumPages()) return NULL;
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 8de139f..9069698 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -118,7 +118,7 @@ public:
     { return getPage(page) ? getPage(page)->getRotate() : 0 ; }
 
   // Get number of pages.
-  int getNumPages() { return catalog->getNumPages(); }
+  int getNumPages();
 
   // Return the contents of the metadata stream, or NULL if there is
   // no metadata.
-- 
1.6.4.2


From cf2c35c975702b3e3a0744d62ba6a15d1ce80b6d Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Wed, 24 Mar 2010 22:03:27 +0100
Subject: [PATCH 14/15] Add hint tables support

---
 CMakeLists.txt      |    2 +
 poppler/Hints.cc    |  405 +++++++++++++++++++++++++++++++++++++++++++++++++++
 poppler/Hints.h     |   92 ++++++++++++
 poppler/Makefile.am |    2 +
 poppler/PDFDoc.cc   |   14 ++
 poppler/PDFDoc.h    |    5 +
 6 files changed, 520 insertions(+), 0 deletions(-)
 create mode 100644 poppler/Hints.cc
 create mode 100644 poppler/Hints.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a119a6d..6d43826 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -245,6 +245,7 @@ set(poppler_SRCS
   poppler/GfxFont.cc
   poppler/GfxState.cc
   poppler/GlobalParams.cc
+  poppler/Hints.cc
   poppler/JArithmeticDecoder.cc
   poppler/JBIG2Stream.cc
   poppler/Lexer.cc
@@ -391,6 +392,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/GfxState.h
     poppler/GfxState_helpers.h
     poppler/GlobalParams.h
+    poppler/Hints.h
     poppler/JArithmeticDecoder.h
     poppler/JBIG2Stream.h
     poppler/Lexer.h
diff --git a/poppler/Hints.cc b/poppler/Hints.cc
new file mode 100644
index 0000000..97af853
--- /dev/null
+++ b/poppler/Hints.cc
@@ -0,0 +1,405 @@
+//========================================================================
+//
+// Hints.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#include <config.h>
+
+#include "Hints.h"
+
+#include "Linearization.h"
+#include "Object.h"
+#include "Stream.h"
+#include "XRef.h"
+#include "Parser.h"
+#include "Lexer.h"
+
+#include <limits.h>
+
+//------------------------------------------------------------------------
+// Hints
+//------------------------------------------------------------------------
+
+Hints::Hints(BaseStream *str, Linearization *linearization)
+{
+  mainXRefEntriesOffset = linearization->getMainXRefEntriesOffset();
+  nPages = linearization->getNumPages();
+  pageFirst = linearization->getPageFirst();
+  pageEndFirst = linearization->getEndFirst();
+
+  if (nPages >= INT_MAX / (int)sizeof(Guint)) {
+     error(-1, "Invalid number of pages (%d) for hints table", nPages);
+     nPages = 0;
+  }
+  nObjects = (Guint *) gmallocn(nPages, sizeof(Guint));
+  xRefOffset = (Guint *) gmallocn(nPages, sizeof(Guint));
+  pageLength = (Guint *) gmallocn(nPages, sizeof(Guint));
+  pageOffset = (Guint *) gmallocn(nPages, sizeof(Guint));
+  numSharedObject = (Guint *) gmallocn(nPages, sizeof(Guint));
+  sharedObjectId = (Guint **) gmallocn(nPages, sizeof(Guint));
+  if (!nObjects || !xRefOffset || !pageLength || !pageOffset ||
+      !numSharedObject || !sharedObjectId) {
+    error(-1, "Failed to allocate memory for hints tabel");
+    nPages = 0;
+  }
+
+  memset(numSharedObject, 0, nPages);
+
+  nSharedGroups = 0;
+  groupLength = NULL;
+  groupOffset = NULL;
+  groupHasSignature = NULL;
+  groupNumObjects = NULL;
+  groupXRefOffset = NULL;
+
+  readTables(str, linearization);
+}
+
+Hints::~Hints()
+{
+  gfree(nObjects);
+  gfree(xRefOffset);
+  gfree(pageLength);
+  gfree(pageOffset);
+  gfree(numSharedObject);
+  for (int i=0; i< nPages; i++) gfree(sharedObjectId[i]);
+  gfree(sharedObjectId);
+
+  gfree(groupLength);
+  gfree(groupOffset);
+  gfree(groupHasSignature);
+  gfree(groupNumObjects);
+  gfree(groupXRefOffset);
+}
+
+void Hints::readTables(BaseStream *str, Linearization *linearization)
+{
+  hintsOffset = linearization->getHintsOffset();
+  hintsLength = linearization->getHintsLength();
+  hintsOffset2 = linearization->getHintsOffset2();
+  hintsLength2 = linearization->getHintsLength2();
+
+  Parser *parser;
+  Object obj;
+
+  int bufLength = hintsLength + hintsLength2;
+
+  char buf[bufLength];
+  char *p = buf;
+
+  obj.initNull();
+  Stream *s = str->makeSubStream(hintsOffset, gFalse, hintsLength, &obj);
+  s->reset();
+  for (Guint i=0; i < hintsLength; i++) { *p++ = s->getChar(); }
+  delete s;
+
+  if (hintsOffset2 && hintsLength2) {
+    obj.initNull();
+    s = str->makeSubStream(hintsOffset2, gFalse, hintsLength2, &obj);
+    s->reset();
+    for (Guint i=0; i < hintsLength2; i++) { *p++ = s->getChar(); }
+    delete s;
+  }
+
+  obj.initNull();
+  MemStream *memStream = new MemStream (buf, 0, bufLength, &obj);
+
+  obj.initNull();
+  parser = new Parser(NULL, new Lexer(NULL, memStream), gTrue);
+  if (parser->getObj(&obj)->isInt() &&
+     (obj.free(), parser->getObj(&obj)->isInt()) &&
+     (obj.free(), parser->getObj(&obj)->isCmd("obj")) &&
+     (obj.free(), parser->getObj(&obj)->isStream())){
+    Stream *hintsStream = obj.getStream();
+    Dict *hintsDict = obj.streamGetDict();
+
+    int sharedStreamOffset = 0;
+    if (hintsDict->lookupInt("S", NULL, &sharedStreamOffset) &&
+        sharedStreamOffset > 0) {
+
+        hintsStream->reset();
+        readPageOffsetTable(hintsStream);
+
+        hintsStream->reset();
+        for (int i=0; i<sharedStreamOffset; i++) hintsStream->getChar();
+        readSharedObjectsTable(hintsStream);
+    } else {
+      error(-1, "Invalid shared object hint table offset");
+    }
+  } else {
+    error(-1, "Failed parsing hints table object");
+  }
+  obj.free();
+
+  delete parser;
+}
+
+void Hints::readPageOffsetTable(Stream *str)
+{
+  if (nPages < 1) {
+    error(-1, "Invalid number of pages reading page offset hints table");
+    return;
+  }
+
+  inputBits = 0; // reset on byte boundary.
+
+  nObjectLeast = readBits(32, str);
+
+  objectOffsetFirst = readBits(32, str);
+  if (objectOffsetFirst >= hintsOffset) objectOffsetFirst += hintsLength;
+
+  nBitsDiffObjects = readBits(16, str);
+
+  pageLengthLeast = readBits(32, str);
+
+  nBitsDiffPageLength = readBits(16, str);
+
+  OffsetStreamLeast = readBits(32, str);
+
+  nBitsOffsetStream = readBits(16, str);
+
+  lengthStreamLeast = readBits(32, str);
+
+  nBitsLengthStream = readBits(16, str);
+
+  nBitsNumShared = readBits(16, str);
+
+  nBitsShared = readBits(16, str);
+
+  nBitsNumerator = readBits(16, str);
+
+  denominator = readBits(16, str);
+
+  for (int i=0; i<nPages; i++) {
+    nObjects[i] = nObjectLeast + readBits(nBitsDiffObjects, str);
+  }
+
+  nObjects[0] = 0;
+  xRefOffset[0] = mainXRefEntriesOffset + 20;
+  for (int i=1; i<nPages; i++) {
+    xRefOffset[i] = xRefOffset[i-1] + 20*nObjects[i-1];
+  }
+
+  for (int i=0; i<nPages; i++) {
+    pageLength[i] = pageLengthLeast + readBits(nBitsDiffPageLength, str);
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  numSharedObject[0] = readBits(nBitsNumShared, str);
+  numSharedObject[0] = 0; // Do not trust the read value to be 0.
+  sharedObjectId[0] = NULL;
+  for (int i=1; i<nPages; i++) {
+    numSharedObject[i] = readBits(nBitsNumShared, str);
+    if (numSharedObject[i] >= INT_MAX / (int)sizeof(Guint)) {
+       error(-1, "Invalid number of shared objects");
+       numSharedObject[i] = 0;
+       return;
+    }
+    sharedObjectId[i] = (Guint *) gmallocn(numSharedObject[i], sizeof(Guint));
+    if (numSharedObject[i] && !sharedObjectId[i]) {
+       error(-1, "Failed to allocate memory for shared object IDs");
+       numSharedObject[i] = 0;
+       return;
+    }
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (int i=1; i<nPages; i++) {
+    for (Guint j=0; j < numSharedObject[i]; j++) {
+      sharedObjectId[i][j] = readBits(nBitsShared, str);
+    }
+  }
+
+  pageOffset[0] = objectOffsetFirst;
+  pageLength[0] = pageEndFirst - objectOffsetFirst;
+  // set fake pageOffset[0] to correct for hint table.
+  if (pageOffset[0] < hintsOffset) {
+    pageOffset[0] += hintsLength;
+  }
+  // find pageOffsets.
+  for (int i=1; i<nPages; i++) {
+    pageOffset[i] = pageOffset[i-1] + pageLength[i-1];
+  }
+  // restore correct pageOffset[0].
+  pageOffset[0] = objectOffsetFirst;
+
+}
+
+void Hints::readSharedObjectsTable(Stream *str)
+{
+  inputBits = 0; // reset on byte boundary.
+
+  Guint firstSharedObjectNumber = readBits(32, str);
+
+  Guint firstSharedObjectOffset = readBits(32, str);
+  firstSharedObjectOffset += hintsLength;
+
+  Guint nSharedGroupsFirst = readBits(32, str);
+
+  Guint nSharedGroups = readBits(32, str);
+
+  Guint nBitsNumObjects = readBits(16, str);
+
+  Guint groupLengthLeast = readBits(32, str);
+
+  Guint nBitsDiffGroupLength = readBits(16, str);
+
+  if (nSharedGroups >= INT_MAX / (int)sizeof(Guint)) {
+     error(-1, "Invalid number of shared object groups");
+     nSharedGroups = 0;
+     return;
+  }
+  groupLength = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  groupOffset = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  groupHasSignature = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  groupNumObjects = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  groupXRefOffset = (Guint *) gmallocn(nSharedGroups, sizeof(Guint));
+  if (!groupLength || !groupOffset || !groupHasSignature ||
+      !groupNumObjects || !groupXRefOffset) {
+     error(-1, "Failed to allocate memory for shared object groups");
+     nSharedGroups = 0;
+     return;
+  }
+
+  if (nSharedGroupsFirst > nSharedGroups) {
+     error(-1, "Invalid number of first page shared object groups");
+     nSharedGroupsFirst = nSharedGroups;
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (Guint i=0; i<nSharedGroups; i++) {
+    groupLength[i] = groupLengthLeast + readBits(nBitsDiffGroupLength, str);
+  }
+
+  groupOffset[0] = objectOffsetFirst;
+  for (Guint i=1; i<nSharedGroupsFirst; i++) {
+    groupOffset[i] = groupOffset[i-1] + groupLength[i-1];
+  }
+  groupOffset[nSharedGroupsFirst] = firstSharedObjectOffset;
+  for (Guint i=nSharedGroupsFirst+1; i<nSharedGroups; i++) {
+    groupOffset[i] = groupOffset[i-1] + groupLength[i-1];
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (Guint i=0; i<nSharedGroups; i++) {
+    groupHasSignature[i] = readBits(1, str);
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (Guint i=0; i<nSharedGroups; i++) {
+    if (groupHasSignature[i]) {
+       readBits(128, str);
+    }
+  }
+
+  inputBits = 0; // reset on byte boundary. Not in specs!
+  for (Guint i=0; i<nSharedGroups; i++) {
+    groupNumObjects[i] =
+       nBitsNumObjects ? 1 + readBits(nBitsNumObjects, str) : 1;
+  }
+
+  for (Guint i=0; i<nSharedGroupsFirst; i++) {
+    groupNumObjects[i] = 0;
+    groupXRefOffset[i] = 0;
+  }
+  groupXRefOffset[nSharedGroupsFirst] =
+      mainXRefEntriesOffset + 20*firstSharedObjectNumber;
+  for (Guint i=nSharedGroupsFirst+1; i<nSharedGroups; i++) {
+    groupXRefOffset[i] = groupXRefOffset[i-1] + 20*groupNumObjects[i-1];
+  }
+
+}
+
+Guint Hints::getPageOffset(int page)
+{
+  if ((page < 1) || (page > nPages)) return 0;
+
+  if (page-1 > pageFirst)
+    return pageOffset[page-1];
+  else if (page-1 < pageFirst)
+    return pageOffset[page];
+  else
+    return pageOffset[0];
+}
+
+GooVector<ByteRange>* Hints::getPageRanges(int page)
+{
+  if ((page < 1) || (page > nPages)) return NULL;
+
+  int idx;
+  if (page-1 > pageFirst)
+     idx = page-1;
+  else if (page-1 < pageFirst)
+     idx = page;
+  else
+     idx = 0;
+
+  ByteRange pageRange;
+  GooVector<ByteRange> *v = new GooVector<ByteRange>;
+
+  pageRange.offset = pageOffset[idx];
+  pageRange.length = pageLength[idx];
+  v->push_back(pageRange);
+
+  pageRange.offset = xRefOffset[idx];
+  pageRange.length = 20*nObjects[idx];
+  v->push_back(pageRange);
+
+  for (Guint j=0; j<numSharedObject[idx]; j++) {
+     Guint k = sharedObjectId[idx][j];
+
+     pageRange.offset = groupOffset[k];
+     pageRange.length = groupLength[k];
+     v->push_back(pageRange);
+
+     pageRange.offset = groupXRefOffset[k];
+     pageRange.length = 20*groupNumObjects[k];
+     v->push_back(pageRange);
+  }
+
+  return v;
+}
+
+Guint Hints::readBit(Stream *str)
+{
+  Guint bit;
+  int c;
+
+  if (inputBits == 0) {
+    if ((c = str->getChar()) == EOF) {
+      return (Guint) -1;
+    }
+    bitsBuffer = c;
+    inputBits = 8;
+  }
+  bit = (bitsBuffer >> (inputBits - 1)) & 1;
+  --inputBits;
+  return bit;
+}
+
+Guint Hints::readBits(int n, Stream *str)
+{
+  Guint bit, bits;
+
+  if (n < 1) return -1;
+
+  if (n == 1)
+    return readBit(str);
+
+  bit = (readBit(str) << (n-1));
+  if (bit == (Guint) -1)
+    return -1;
+
+  bits = readBits(n-1, str);
+  if (bits == (Guint) -1)
+    return -1;
+
+  return bit | bits;
+}
+
+
diff --git a/poppler/Hints.h b/poppler/Hints.h
new file mode 100644
index 0000000..35a2e55
--- /dev/null
+++ b/poppler/Hints.h
@@ -0,0 +1,92 @@
+//========================================================================
+//
+// Hints.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2010 Hib Eris <hib at hiberis.nl>
+//
+//========================================================================
+
+#ifndef HINTS_H
+#define HINTS_H
+
+#include <string.h>
+#include "goo/gtypes.h"
+#include "goo/GooVector.h"
+//#include <vector>
+#include "PDFDoc.h"
+
+class Stream;
+class BaseStream;
+class Linearization;
+class XRef;
+
+//------------------------------------------------------------------------
+// Hints
+//------------------------------------------------------------------------
+
+class Hints {
+public:
+
+  Hints(BaseStream *str, Linearization *linearization);
+  ~Hints();
+
+  Guint getPageOffset(int page);
+  GooVector<ByteRange>* getPageRanges(int page);
+
+private:
+
+  void readTables(BaseStream *str, Linearization *linearization);
+  void readPageOffsetTable(Stream *str);
+  void readSharedObjectsTable(Stream *str);
+
+  Guint readBit(Stream *str);
+  Guint readBits(int n, Stream *str);
+
+  Guint hintsOffset;
+  Guint hintsLength;
+  Guint hintsOffset2;
+  Guint hintsLength2;
+  Guint mainXRefEntriesOffset;
+
+  int nPages;
+  int pageFirst;
+  Guint pageOffsetFirst;
+  Guint pageEndFirst;
+  int objectNumberFirst;
+
+  Guint nObjectLeast;
+  Guint objectOffsetFirst;
+  Guint nBitsDiffObjects;
+  Guint pageLengthLeast;
+  Guint nBitsDiffPageLength;
+  Guint OffsetStreamLeast;
+  Guint nBitsOffsetStream;
+  Guint lengthStreamLeast;
+  Guint nBitsLengthStream;
+  Guint nBitsNumShared;
+  Guint nBitsShared;
+  Guint nBitsNumerator;
+  Guint denominator;
+
+  Guint *nObjects;
+  Guint *xRefOffset;
+  Guint *pageLength;
+  Guint *pageOffset;
+  Guint *numSharedObject;
+  Guint **sharedObjectId;
+
+  Guint nSharedGroups;
+  Guint *groupLength;
+  Guint *groupOffset;
+  Guint *groupHasSignature;
+  Guint *groupNumObjects;
+  Guint *groupXRefOffset;
+
+  int inputBits;
+  char bitsBuffer;
+
+};
+
+#endif
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 8c1e019..a6b7990 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -206,6 +206,7 @@ poppler_include_HEADERS =	\
 	GfxState.h		\
 	GfxState_helpers.h	\
 	GlobalParams.h		\
+	Hints.h			\
 	JArithmeticDecoder.h	\
 	JBIG2Stream.h		\
 	Lexer.h			\
@@ -285,6 +286,7 @@ libpoppler_la_SOURCES =		\
 	GfxFont.cc 		\
 	GfxState.cc		\
 	GlobalParams.cc		\
+	Hints.cc		\
 	JArithmeticDecoder.cc	\
 	JBIG2Stream.cc		\
 	Lexer.cc 		\
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 8f105fd..1d6c2f4 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -67,6 +67,7 @@
 #include "Outline.h"
 #endif
 #include "PDFDoc.h"
+#include "Hints.h"
 
 //------------------------------------------------------------------------
 
@@ -94,6 +95,7 @@ void PDFDoc::init()
   xref = NULL;
   linearization = NULL;
   catalog = NULL;
+  hints = NULL;
 #ifndef DISABLE_OUTLINE
   outline = NULL;
 #endif
@@ -268,6 +270,9 @@ PDFDoc::~PDFDoc() {
   if (xref) {
     delete xref;
   }
+  if (hints) {
+    delete hints;
+  }
   if (linearization) {
     delete linearization;
   }
@@ -471,6 +476,15 @@ GBool PDFDoc::isLinearized() {
     return gFalse;
 }
 
+Hints *PDFDoc::getHints()
+{
+  if (!hints && isLinearized()) {
+    hints = new Hints(str, getLinearization());
+  }
+
+  return hints;
+}
+
 int PDFDoc::saveAs(GooString *name, PDFWriteMode mode) {
   FILE *f;
   OutStream *outStr;
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 9069698..b2f40c9 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -49,6 +49,7 @@ class LinkAction;
 class LinkDest;
 class Outline;
 class Linearization;
+class Hints;
 
 enum PDFWriteMode {
   writeStandard,
@@ -236,6 +237,9 @@ private:
   void saveIncrementalUpdate (OutStream* outStr);
   void saveCompleteRewrite (OutStream* outStr);
 
+  // Get hints.
+  Hints *getHints();
+
   PDFDoc();
   void init();
   GBool setup(GooString *ownerPassword, GooString *userPassword);
@@ -258,6 +262,7 @@ private:
   Linearization *linearization;
   XRef *xref;
   Catalog *catalog;
+  Hints *hints;
 #ifndef DISABLE_OUTLINE
   Outline *outline;
 #endif
-- 
1.6.4.2


From 18e9c4cb827053631d160db93ab93f1f913f454c Mon Sep 17 00:00:00 2001
From: Hib Eris <hib at hiberis.nl>
Date: Tue, 20 Apr 2010 19:06:02 +0200
Subject: [PATCH 15/15] Use hint tables for PDFDoc::getPage()

---
 poppler/PDFDoc.cc |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 poppler/PDFDoc.h  |    4 +++
 2 files changed, 76 insertions(+), 1 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 1d6c2f4..27926c3 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -100,6 +100,7 @@ void PDFDoc::init()
   outline = NULL;
 #endif
   startXRefPos = ~(Guint)0;
+  pageCache = NULL;
 }
 
 PDFDoc::PDFDoc()
@@ -259,6 +260,14 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
 }
 
 PDFDoc::~PDFDoc() {
+  if (pageCache) {
+    for (int i = 0; i < getNumPages(); i++) {
+      if (pageCache[i]) {
+        delete pageCache[i];
+      }
+    }
+    gfree(pageCache);
+  }
 #ifndef DISABLE_OUTLINE
   if (outline) {
     delete outline;
@@ -1056,11 +1065,73 @@ int PDFDoc::getNumPages()
   }
 }
 
+Guint PDFDoc::getPageOffset(int page)
+{
+  Guint offset;
+
+  if (getHints() && (offset = getHints()->getPageOffset(page))) {
+    return offset;
+  } else {
+    error(-1, "Failed getting page offset from hint table");
+    return 0;
+  }
+}
+
+Page *PDFDoc::parsePage(Guint offset, int page)
+{
+  Page *p = NULL;
+  Object obj;
+
+  obj.initNull();
+  Stream *stream = str->makeSubStream(offset, gFalse, 0, &obj);
+  Parser parser = Parser(xref, new Lexer(xref, stream), gTrue);
+
+  Object obj1, obj2, obj3, obj4;
+  if (parser.getObj(&obj1)->isInt() &&
+      parser.getObj(&obj2)->isInt() &&
+      parser.getObj(&obj3)->isCmd("obj") &&
+      parser.getObj(&obj4)->isDict("Page")) {
+    Ref pageRef;
+    Dict *pageDict;
+    pageRef.num = obj1.getInt();
+    pageRef.gen = obj2.getInt();
+    pageDict = obj4.getDict();
+    p = new Page(xref, page, pageDict, pageRef,
+                 new PageAttrs(NULL, pageDict),
+                 catalog->getForm());
+    if (!p->isOk()) {
+      delete p;
+      p = NULL;
+    }
+  }
+  obj4.free();
+  obj3.free();
+  obj2.free();
+  obj1.free();
+
+  return p;
+}
+
 Page *PDFDoc::getPage(int page)
 {
   if ((page < 1) || page > getNumPages()) return NULL;
 
-  {
+  if (isLinearized()) {
+    if (!pageCache) {
+      pageCache = (Page **) gmallocn(getNumPages(), sizeof(Page *));
+      for (int i = 0; i < getNumPages(); i++) {
+        pageCache[i] = NULL;
+      }
+    }
+    if (!pageCache[page-1]) {
+      pageCache[page-1] = parsePage(getPageOffset(page), page);
+      if (!pageCache[page-1]) {
+         error(-1, "Failed parsing page %d at offset %d",
+               page, getPageOffset(page));
+      }
+    }
+    return pageCache[page-1];
+  } else {
     return catalog->getPage(page);
   }
 }
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index b2f40c9..99c005e 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -237,6 +237,9 @@ private:
   void saveIncrementalUpdate (OutStream* outStr);
   void saveCompleteRewrite (OutStream* outStr);
 
+  Guint getPageOffset(int page);
+  Page *parsePage(Guint offset, int page);
+
   // Get hints.
   Hints *getHints();
 
@@ -266,6 +269,7 @@ private:
 #ifndef DISABLE_OUTLINE
   Outline *outline;
 #endif
+  Page **pageCache;
 
   GBool ok;
   int errCode;
-- 
1.6.4.2


More information about the poppler mailing list