[PATCH] Adds the ability to save PDF using either incremental update or by

Julien Rebetez julien at fhtagn.net
Thu Oct 25 12:32:43 PDT 2007


completely rewriting the PDF.
---
 poppler/PDFDoc.cc |  382 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 poppler/PDFDoc.h  |   18 +++-
 2 files changed, 394 insertions(+), 6 deletions(-)

diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 78fbea2..58cb81a 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -34,6 +34,7 @@
 #include "Lexer.h"
 #include "Parser.h"
 #include "SecurityHandler.h"
+#include "Decrypt.h"
 #ifndef DISABLE_OUTLINE
 #include "Outline.h"
 #endif
@@ -435,19 +436,390 @@ GBool PDFDoc::isLinearized() {
   return lin;
 }
 
-GBool PDFDoc::saveAs(GooString *name) {
+GBool PDFDoc::saveAs(GooString *name, PDFWriteMode mode) { // returns gFalse only if 'name' cannot be opened for writing, gTrue otherwise
   FILE *f;
-  int c;
+  OutStream *outStr; // output wrapper around f; deleted below before f is closed
 
   if (!(f = fopen(name->getCString(), "wb"))) {
     error(-1, "Couldn't open file '%s'", name->getCString());
     return gFalse;
   }
+  outStr = new FileOutStream(f,0);
+  // dispatch on the requested write mode
+  if (mode == writeForceRewrite) {
+    saveCompleteRewrite(outStr);
+  } else if (mode == writeForceIncremental) {
+    saveIncrementalUpdate(outStr); 
+  } else { // writeStandard, or any other value: let poppler decide
+    // look for at least one updated object in the xref table
+    GBool updated = gFalse;
+    for(int i=0; i<xref->getNumObjects(); i++) {
+      // an entry counts as updated when its obj is neither none nor null
+      if (!xref->getEntry(i)->obj.isNone() &&
+          !xref->getEntry(i)->obj.isNull()) {
+        updated = gTrue;
+        break;
+      }
+    }
+    if(updated) { 
+      saveIncrementalUpdate(outStr);
+    } else {
+      // nothing was updated: copy the original file byte for byte
+      int c;
+      str->reset();
+      while ((c = str->getChar()) != EOF) {
+        outStr->put(c);
+      }
+      str->close();
+    }
+  }
+
+  delete outStr;
+  fclose(f);
+  return gTrue;
+}
+
+void PDFDoc::saveIncrementalUpdate (OutStream* outStr)
+{ // Copies the original file to outStr, then appends the updated objects, an xref section for them and a trailer.
+  XRef *uxref; // xref section describing only what this update appends
+  int c;
+  //copy the original file unchanged
   str->reset();
   while ((c = str->getChar()) != EOF) {
-    fputc(c, f);
+    outStr->put(c);
   }
   str->close();
-  fclose(f);
-  return gTrue;
+  // everything written from here on is appended after the original bytes
+  uxref = new XRef();
+  uxref->add(0, 65535, 0, gFalse); // entry 0 with gen 65535; presumably the head of the free list (last argument gFalse = not in use), confirm against XRef::add
+  int objectsCount = 0; //counts the xref entries that are not skipped below; passed to writeTrailer as the trailer Size
+  for(int i=0; i<xref->getNumObjects(); i++) {
+    if ((xref->getEntry(i)->type == xrefEntryFree) && 
+        (xref->getEntry(i)->gen == 0)) //we skip the irrelevant free objects (free entries with generation 0)
+      continue;
+    objectsCount++;
+    if (!xref->getEntry(i)->obj.isNone() &&
+        !xref->getEntry(i)->obj.isNull()) { //an obj that is neither none nor null marks an updated object
+      Object obj1;
+      Ref ref;
+      ref.num = i;
+      ref.gen = xref->getEntry(i)->gen;
+      xref->fetch(ref.num, ref.gen, &obj1);
+      Guint offset = writeObject(&obj1, &ref, outStr); // output position where the object starts
+      uxref->add(ref.num, ref.gen, offset, gTrue);
+      obj1.free();
+    }
+  }
+  if (uxref->getSize() == 0) { //we have nothing to update; NOTE(review): entry 0 was added above, so this may never be true, confirm against XRef::getSize
+    delete uxref;
+    return;
+  }
+
+  Guint uxrefOffset = outStr->getPos(); // where the new xref section starts; writeTrailer prints it after startxref
+  uxref->writeToFile(outStr);
+
+  writeTrailer(uxrefOffset, objectsCount, outStr, gTrue); // gTrue = incremental update
+
+  delete uxref;
+}
+
+void PDFDoc::saveCompleteRewrite (OutStream* outStr)
+{
+  // Writes a whole new file: header line, every non-free object, a new xref table, the trailer.
+  outStr->printf("%%PDF-%.1f\r\n",pdfVersion);
+  XRef *freshXRef = new XRef();
+  freshXRef->add(0, 65535, 0, gFalse);
+  for (int num = 0; num < xref->getNumObjects(); ++num) {
+    Object fetched;
+    Ref id;
+    const XRefEntryType kind = xref->getEntry(num)->type;
+    id.num = num;
+    switch (kind) {
+      case xrefEntryFree:
+        id.gen = xref->getEntry(num)->gen;
+        // XRef holds many filler free entries; keep only those with a non-zero generation,
+        // and never entry 0, which was already added above with gen 65535.
+        if (id.gen > 0 && id.num > 0)
+          freshXRef->add(id.num, id.gen, 0, gFalse);
+        break;
+      case xrefEntryUncompressed:
+      case xrefEntryCompressed: {
+        // compressed entries are always written with generation 0
+        id.gen = (kind == xrefEntryCompressed) ? 0 : xref->getEntry(num)->gen;
+        xref->fetch(id.num, id.gen, &fetched);
+        const Guint where = writeObject(&fetched, &id, outStr);
+        freshXRef->add(id.num, id.gen, where, gTrue);
+        fetched.free();
+        break;
+      }
+      default:
+        break;
+    }
+  }
+  const Guint tableOffset = outStr->getPos();
+  freshXRef->writeToFile(outStr);
+
+  writeTrailer(tableOffset, freshXRef->getSize(), outStr, gFalse);
+
+  delete freshXRef;
+
+}
+
+void PDFDoc::writeDictionnary (Dict* dict, OutStream* outStr)
+{ // Writes dict as "<<", then one "/key value" pair per entry in index order, then ">>".
+  Object value;
+  outStr->printf("<<");
+  for (int idx = 0; idx < dict->getLength(); ++idx) {
+    outStr->printf("/%s ", dict->getKey(idx));
+    writeObject(dict->getValNF(idx, &value), NULL, outStr);
+    value.free();
+  }
+  outStr->printf(">>");
+}
+
+void PDFDoc::writeStream (Stream* str, OutStream* outStr)
+{
+  // Writes "stream", every byte str yields between reset() and EOF, then "endstream".
+  outStr->printf("stream\r\n");
+  str->reset();
+  for (int c=str->getChar(); c!= EOF; c=str->getChar()) {
+    outStr->put(c); // plain byte copy, the same call saveAs uses; no format string is parsed per byte
+  }
+  outStr->printf("\r\nendstream\r\n");
+}
+
+void PDFDoc::writeRawStream (Stream* str, OutStream* outStr)
+{
+  // Copies the stream through the unfiltered reader; the byte count is the Length entry of its dictionary.
+  Object obj1;
+  str->getDict()->lookup("Length", &obj1);
+  if (!obj1.isInt()) {
+    error (-1, "PDFDoc::writeRawStream, no Length in stream dict");
+    obj1.free(); // release whatever non-integer object the lookup produced
+    return;
+  }
+  const int length = obj1.getInt();
+  obj1.free();
+
+  outStr->printf("stream\r\n");
+  str->unfilteredReset();
+  for (int i=0; i<length; i++) {
+    outStr->put(str->getUnfilteredChar()); // plain byte copy, no format string parsed per byte
+  }
+  str->reset();
+  outStr->printf("\r\nendstream\r\n");
+}
+
+void PDFDoc::writeString (GooString* s, OutStream* outStr)
+{
+  // Writes s as a PDF literal string, "(" ... ")" plus a trailing space.
+  // Parentheses and backslashes are escaped with a backslash.
+  //
+  // Every string is walked by its stored length: besides strings with a
+  // unicode marker, binary strings (for instance the 16-byte MD5 digest
+  // that writeTrailer puts in the ID array) may contain '\0' bytes, and
+  // stopping at the first '\0' would silently truncate them.
+  const char* c = s->getCString();
+  const int length = s->getLength();
+
+  outStr->printf("(");
+  for(int i=0; i<length; i++) {
+    const char unescaped = c[i];
+    if (unescaped == '\r') {
+      // a raw CR inside a literal string is read back as LF, so it has to
+      // be written as the two-character escape sequence instead
+      outStr->printf("\\r");
+    } else {
+      //escape the delimiters and the escape character itself
+      if (unescaped == '(' || unescaped == ')' || unescaped == '\\')
+        outStr->put('\\');
+      outStr->put(unescaped);
+    }
+  }
+  outStr->printf(") ");
+}
+
+Guint PDFDoc::writeObject (Object* obj, Ref* ref, OutStream* outStr)
+{ // Writes obj to outStr and returns the output position at which it starts.
+  Array *array;
+  Object obj1;
+  Guint offset = outStr->getPos();
+  int tmp;
+  // With a non-NULL ref the value is wrapped as an indirect object ("num gen obj" ... "endobj").
+  if(ref)
+    outStr->printf("%i %i obj\r\n", ref->num, ref->gen); //the line break keeps "obj" apart from a following number or keyword
+
+  switch (obj->getType()) {
+    case objBool:
+      outStr->printf("%s ", obj->getBool()?"true":"false");
+      break;
+    case objInt:
+      outStr->printf("%i ", obj->getInt());
+      break;
+    case objReal:
+      outStr->printf("%g ", obj->getReal()); //NOTE(review): %g can print an exponent (e.g. 1e+06), which is not PDF number syntax
+      break;
+    case objString:
+      writeString(obj->getString(), outStr);
+      break;
+    case objName:
+      outStr->printf("/%s ", obj->getName());
+      break;
+    case objNull:
+      outStr->printf( "null "); //trailing space like the other scalars, so neighbours in an array stay separate tokens
+      break;
+    case objArray:
+      array = obj->getArray();
+      outStr->printf("[");
+      for (int i=0; i<array->getLength(); i++) {
+        writeObject(array->getNF(i, &obj1), NULL,outStr);
+        obj1.free();
+      }
+      outStr->printf("]");
+      break;
+    case objDict:
+      writeDictionnary (obj->getDict(),outStr);
+      break;
+    case objStream:
+      {
+        //Streams cannot be modified with the current implementation (no write functions in the Stream API)
+        // => the only streams that can have been modified are internal streams (=strWeird)
+        Stream *stream = obj->getStream();
+        if (stream->getKind() == strWeird) {
+          //we write the stream unencoded => TODO: write stream encoder
+          stream->reset();
+          //recalculate stream length by reading the stream once up to EOF
+          tmp = 0;
+          for (int c=stream->getChar(); c!=EOF; c=stream->getChar()) {
+            tmp++;
+          }
+          obj1.initInt(tmp);
+          stream->getDict()->set("Length", &obj1);
+
+          //Remove Stream encoding, since the data is written unencoded
+          stream->getDict()->remove("Filter");
+          stream->getDict()->remove("DecodeParms");
+
+          writeDictionnary (stream->getDict(),outStr);
+          writeStream (stream,outStr);
+          obj1.free();
+        } else {
+          //raw stream copy: dictionary and data are written as they are
+          writeDictionnary (stream->getDict(), outStr);
+          writeRawStream (stream, outStr);
+        }
+        break;
+      }
+    case objRef:
+      outStr->printf("%i %i R ", obj->getRef().num, obj->getRef().gen);
+      break;
+    case objCmd:
+      outStr->printf("cmd\r\n");
+      break;
+    case objError:
+      outStr->printf("error\r\n");
+      break;
+    case objEOF:
+      outStr->printf("eof\r\n");
+      break;
+    case objNone:
+      outStr->printf("none\r\n");
+      break;
+    default:
+      error(-1,"Unhandled objType : %i, please report a bug with a testcase\r\n", obj->getType());
+      break;
+  }
+  if (ref)
+    outStr->printf("endobj\r\n");
+  return offset;
+}
+
+void PDFDoc::writeTrailer (Guint uxrefOffset, int uxrefSize, OutStream* outStr, GBool incrUpdate)
+{ // Writes the trailer dictionary (Size, ID, Root, and Prev when incrUpdate is set), then startxref, uxrefOffset and the EOF marker.
+  Dict *trailerDict = new Dict(xref);
+  Object obj1;
+  obj1.initInt(uxrefSize);
+  trailerDict->set("Size", &obj1);
+  obj1.free();
+
+
+  //build a new ID, as recommended in the reference, uses:
+  // - current time
+  // - file name
+  // - file size
+  // - values of entry in information dictionary
+  GooString message;
+  char buffer[256];
+  sprintf(buffer, "%i", (int)time(NULL));
+  message.append(buffer);
+  if (fileName) message.append(fileName); //skipped when no file name is set (presumably a document not opened from a path, confirm)
+  // file size, counted by reading the original stream once up to EOF
+  unsigned int fileSize = 0;
+  int c;
+  str->reset();
+  while ((c = str->getChar()) != EOF) {
+    fileSize++;
+  }
+  str->close();
+  sprintf(buffer, "%u", fileSize); //%u: fileSize is unsigned
+  message.append(buffer);
+
+  //info dict -- only use text string
+  if (xref->getDocInfo(&obj1)->isDict()) {
+    for(int i=0; i<obj1.getDict()->getLength(); i++) {
+      Object obj2;
+      obj1.getDict()->getVal(i, &obj2);
+      if (obj2.isString()) {
+        message.append(obj2.getString());
+      }
+      obj2.free();
+    }
+  }
+  obj1.free();
+
+  //calculate md5 digest of the collected message; the 16 raw bytes become the new identifier
+  Guchar digest[16];
+  Decrypt::md5((Guchar*)message.getCString(), message.getLength(), digest);
+  obj1.initString(new GooString((const char*)digest, 16));
+
+  //create ID array
+  Object obj2,obj3,obj4;
+  obj2.initArray(xref);
+
+  if (incrUpdate) {
+    //only update the second part of the array
+    if(xref->getTrailerDict()->getDict()->lookup("ID", &obj4) != NULL) { //NOTE(review): if lookup returns &obj4 this is always true and isArray below is the effective test; confirm
+      if (!obj4.isArray()) {
+        error(-1, "PDFDoc::writeTrailer original file's ID entry isn't an array. Trying to continue");
+      } else {
+        //Get the first part of the ID
+        obj4.arrayGet(0,&obj3);
+
+        obj2.arrayAdd(&obj3);
+        obj2.arrayAdd(&obj1);
+        trailerDict->set("ID", &obj2);
+      }
+    }
+  } else {
+    //new file => same values for the two identifiers
+    obj2.arrayAdd(&obj1);
+    obj2.arrayAdd(&obj1);
+    trailerDict->set("ID", &obj2);
+  }
+  //NOTE(review): trailerDict is never deleted, and obj3/obj4 (plus obj1/obj2 when no ID gets set) are never freed;
+  //releasing them safely depends on whether Dict::set and Array::add copy or share their argument, to be confirmed.
+  obj1.initRef(xref->getRootNum(), xref->getRootGen());
+  trailerDict->set("Root", &obj1);
+  obj1.free();
+
+  if (incrUpdate) {
+    obj1.initInt(xref->getLastXRefPos()); //Prev is set to the value returned by getLastXRefPos()
+    trailerDict->set("Prev", &obj1);
+    obj1.free();
+  }
+  outStr->printf( "trailer\r\n");
+  writeDictionnary(trailerDict, outStr);
+  outStr->printf( "\r\nstartxref\r\n");
+  outStr->printf( "%u\r\n", uxrefOffset); //%u: the offset is a Guint and must not print as a negative number
+  outStr->printf( "%%%%EOF\r\n");
 }
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index e6992ba..2336694 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -27,6 +27,12 @@ class LinkAction;
 class LinkDest;
 class Outline;
 
+enum PDFWriteMode { // write strategy accepted by PDFDoc::saveAs
+  writeStandard, // saveAs decides: incremental update if any object was updated, otherwise a plain copy of the original
+  writeForceRewrite, // always rewrite the whole file (saveCompleteRewrite)
+  writeForceIncremental // always append an incremental update (saveIncrementalUpdate)
+};
+
 //------------------------------------------------------------------------
 // PDFDoc
 //------------------------------------------------------------------------
@@ -169,12 +175,22 @@ public:
   double getPDFVersion() { return pdfVersion; }
 
   // Save this file with another name.
-  GBool saveAs(GooString *name);
+  GBool saveAs(GooString *name, PDFWriteMode mode=writeStandard);
 
   // Return a pointer to the GUI (XPDFCore or WinPDFCore object).
   void *getGUIData() { return guiData; }
 
 private:
+  // Add object to current file stream and return the offset of the beginning of the object
+  Guint writeObject (Object *obj, Ref *ref, OutStream* outStr);
+  void writeDictionnary (Dict* dict, OutStream* outStr);
+  void writeStream (Stream* str, OutStream* outStr);
+  void writeRawStream (Stream* str, OutStream* outStr);
+  void writeTrailer (Guint uxrefOffset, int uxrefSize, OutStream* outStr, GBool incrUpdate);
+  void writeString (GooString* s, OutStream* outStr);
+  void saveIncrementalUpdate (OutStream* outStr);
+  void saveCompleteRewrite (OutStream* outStr);
+
 
   GBool setup(GooString *ownerPassword, GooString *userPassword);
   GBool checkFooter();
-- 
1.5.2.5


--------------030500050101040903030608
Content-Type: text/x-patch;
 name="0004-Adds-deep-copy-constructor-to-Dict.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="0004-Adds-deep-copy-constructor-to-Dict.patch"



More information about the poppler mailing list