[poppler] PDF saving support

Sat Oct 27 09:32:57 PDT 2007

On Fri, Oct 26, 2007 at 05:09:37PM +0300, Julien Rebetez wrote:
> Hello,
> 
> I've made a series of patches that adds the ability to save modified PDF 
> documents.
> By default, incremental update is used, but full rewrite may be used (in 
> which case
> the whole document is loaded and then written back).

Out of curiosity, how hard would it be to do OS X Leopard style PDF
modifications (reorder and omit pages, merge PDFs) on top of this code?

> 
> By the way, I hope this is the right way to submit patches with git.

Yep, this works well. Also, if you haven't tried it out yet, stgit is
very good at managing and modifying patch sets like this. I'd recommend
giving it a try.

Some review follows inline. I, admittedly, skimmed some of the bigger stuff.

-Jeff

> >From 26eb86ccfbe6d6e2c7a4b459fa28393dfbce3cf7 Mon Sep 17 00:00:00 2001
> From: Julien Rebetez <julien at fhtagn.net>
> Date: Wed, 24 Oct 2007 19:23:52 +0300
> Subject: [PATCH] Adds addIndirectObject method to XRef. This method allow the creation of new indirect objects.
> Modify the writeToFile method of XRef so it uses OutStream instead of a C file descriptor.
> ---
>  poppler/XRef.cc |   45 ++++++++++++++++++++++++++++++++++++++++-----
>  poppler/XRef.h  |    3 ++-
>  2 files changed, 42 insertions(+), 6 deletions(-)
> 
> diff --git a/poppler/XRef.cc b/poppler/XRef.cc
> index b84e198..8561712 100644
> --- a/poppler/XRef.cc
> +++ b/poppler/XRef.cc
> @@ -1023,14 +1023,49 @@ void XRef::setModifiedObject (Object* o, Ref r) {
>    o->copy(&entries[r.num].obj);
>  }

I find this function hard to follow. Maybe I'm just dense...
Also, the new method and the modification of writeToFile don't really
seem related, so they could easily be two separate commits.

> +Ref XRef::addIndirectObject (Object* o) {
> +  //Found the next free entry
> +  int lastEntry = 0;
> +  int newEntry;
> +
> +  do {
> +    newEntry = entries[lastEntry].offset;
> +    //we shouldn't reuse entry with a gen number of 65535
> +  } while (entries[newEntry].gen == 65535 &&
> +           (lastEntry = newEntry));
> +
> +  //the linked list of free entry is empty => create a new one
> +  if (newEntry == 0) {
> +    newEntry = size;
> +    size++;
> +    entries = (XRefEntry *)greallocn(entries, size, sizeof(XRefEntry));
> +    entries[newEntry].gen = 0;
> +    entries[newEntry].num = newEntry;
> +  } else { //reuse a free entry
> +    //restore the free entry linked list    
> +    entries[lastEntry].offset = entries[newEntry].offset;
> +    entries[newEntry].num = newEntry;
> +    //Don't touch gen number, it should have been incremented when the object was deleted
> +  }
> +
> +  entries[newEntry].type = xrefEntryUncompressed;
> +  o->copy(&entries[newEntry].obj);
> +
> +  Ref r;
> +  r.num = entries[newEntry].num;
> +  r.gen = entries[newEntry].gen;
> +  return r;
> +}
> +
>  //used to sort the entries
>  int compare (const void* a, const void* b)
>  {
>    return (((XRefEntry*)a)->num - ((XRefEntry*)b)->num);
>  }
>  
> -void XRef::writeToFile(FILE* file) {
> +void XRef::writeToFile(OutStream* outStr) { 
>    qsort(entries, size, sizeof(XRefEntry), compare);
> +
>    //create free entries linked-list
>    if (entries[0].gen != 65535) {
>      error(-1, "XRef::writeToFile, entry 0 of the XRef is invalid (gen != 65535)\n");
> @@ -1044,16 +1079,16 @@ void XRef::writeToFile(FILE* file) {
>    }
>    //write the new xref
>    int j;
> -  fprintf(file,"xref\r\n");
> +  outStr->printf("xref\r\n");
>    for (int i=0; i<size; i++) {
> -    for(j=i; j<size; j++) { //look for consecutive entry
> +    for(j=i; j<size; j++) { //look for consecutive entries
>        if (j!=i && entries[j].num != entries[j-1].num+1) 
>                break;
>      }
> -    fprintf(file,"%i %i\r\n", entries[i].num, j-i);
> +    outStr->printf("%i %i\r\n", entries[i].num, j-i);
>      for (int k=i; k<j; k++) {
>        if(entries[k].gen > 65535) entries[k].gen = 65535; //cap generation number to 65535 (required by PDFReference)
> -      fprintf(file,"%010i %05i %c\r\n", entries[k].offset, entries[k].gen, (entries[k].type==xrefEntryFree)?'f':'n');
> +      outStr->printf("%010i %05i %c\r\n", entries[k].offset, entries[k].gen, (entries[k].type==xrefEntryFree)?'f':'n');
>      }
>      i = j-1;
>    }
> diff --git a/poppler/XRef.h b/poppler/XRef.h
> index 05699c4..d5a395b 100644
> --- a/poppler/XRef.h
> +++ b/poppler/XRef.h
> @@ -109,8 +109,9 @@ public:
>  
>    // Write access
>    void setModifiedObject(Object* o, Ref r);
> +  Ref addIndirectObject (Object* o);
>    void add(int num, int gen,  Guint offs, GBool used);
> -  void writeToFile(FILE* f);
> +  void writeToFile(OutStream* outStr);
>  
>  private:
>  
> -- 
> 1.5.2.5
> 

> >From d3d7e387fb5384429721599532703fc3e85d9ebf Mon Sep 17 00:00:00 2001
> From: Julien Rebetez <julien at fhtagn.net>
> Date: Thu, 25 Oct 2007 22:29:42 +0300
> Subject: [PATCH] Make the md5 method of Decrypt public so it can be used by other files.
> 
> ---
>  poppler/Decrypt.cc |    5 ++---
>  poppler/Decrypt.h  |    1 +
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/poppler/Decrypt.cc b/poppler/Decrypt.cc
> index 3cf24ef..cc148ba 100644
> --- a/poppler/Decrypt.cc
> +++ b/poppler/Decrypt.cc
> @@ -21,7 +21,6 @@ static Guchar rc4DecryptByte(Guchar *state, Guchar *x, Guchar *y, Guchar c);
>  static void aesKeyExpansion(DecryptAESState *s,
>  			    Guchar *objKey, int objKeyLen);
>  static void aesDecryptBlock(DecryptAESState *s, Guchar *in, GBool last);
> -static void md5(Guchar *msg, int msgLen, Guchar *digest);
>  
>  static Guchar passwordPad[32] = {
>    0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41,
> @@ -208,7 +207,7 @@ DecryptStream::DecryptStream(Stream *strA, Guchar *fileKey,
>    } else {
>      n = keyLength + 5;
>    }
> -  md5(objKey, n, objKey);
> +  Decrypt::md5(objKey, n, objKey);
>    if ((objKeyLength = keyLength + 5) > 16) {
>      objKeyLength = 16;
>    }
> @@ -633,7 +632,7 @@ static inline Gulong md5Round4(Gulong a, Gulong b, Gulong c, Gulong d,
>    return b + rotateLeft((a + (c ^ (b | ~d)) + Xk + Ti), s);
>  }
>  
> -static void md5(Guchar *msg, int msgLen, Guchar *digest) {
> +void Decrypt::md5(Guchar *msg, int msgLen, Guchar *digest) {
>    Gulong x[16];
>    Gulong a, b, c, d, aa, bb, cc, dd;
>    int n64;
> diff --git a/poppler/Decrypt.h b/poppler/Decrypt.h
> index b2f754f..b861968 100644
> --- a/poppler/Decrypt.h
> +++ b/poppler/Decrypt.h
> @@ -24,6 +24,7 @@
>  
>  class Decrypt {
>  public:
> +  static void md5(Guchar *msg, int msgLen, Guchar *digest);
>  
>    // Generate a file key.  The <fileKey> buffer must have space for at
>    // least 16 bytes.  Checks <ownerPassword> and then <userPassword>
> -- 
> 1.5.2.5
> 
This one looks fine.

> >From 08391ecc731dc904f42b4566841f4dbae4bbd4c2 Mon Sep 17 00:00:00 2001
> From: Julien Rebetez <julien at fhtagn.net>
> Date: Thu, 25 Oct 2007 22:32:43 +0300
> Subject: [PATCH] Adds the ability to save PDF using either incremental update or by
> rewriting completly the PDF.
> ---
>  poppler/PDFDoc.cc |  382 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  poppler/PDFDoc.h  |   18 +++-
>  2 files changed, 394 insertions(+), 6 deletions(-)
> 
> diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
> index 78fbea2..58cb81a 100644
> --- a/poppler/PDFDoc.cc
> +++ b/poppler/PDFDoc.cc
> @@ -34,6 +34,7 @@
>  #include "Lexer.h"
>  #include "Parser.h"
>  #include "SecurityHandler.h"
> +#include "Decrypt.h"
>  #ifndef DISABLE_OUTLINE
>  #include "Outline.h"
>  #endif
> @@ -435,19 +436,390 @@ GBool PDFDoc::isLinearized() {
>    return lin;
>  }
>  
> -GBool PDFDoc::saveAs(GooString *name) {
> +GBool PDFDoc::saveAs(GooString *name, PDFWriteMode mode) {
>    FILE *f;
> -  int c;
> +  OutStream *outStr;
>  
>    if (!(f = fopen(name->getCString(), "wb"))) {
>      error(-1, "Couldn't open file '%s'", name->getCString());
>      return gFalse;
>    }
> +  outStr = new FileOutStream(f,0);
> +
> +  if (mode == writeForceRewrite) {
> +    saveCompleteRewrite(outStr);
> +  } else if (mode == writeForceIncremental) {
> +    saveIncrementalUpdate(outStr); 
> +  } else { // let poppler decide
> +    // find if we have updated objects
> +    GBool updated = gFalse;
> +    for(int i=0; i<xref->getNumObjects(); i++) {
> +      // We don't take null and none objects into account
Why?

> +      if (!xref->getEntry(i)->obj.isNone() &&
> +          !xref->getEntry(i)->obj.isNull()) {
> +        updated = gTrue;
> +        break;
> +      }
> +    }
> +    if(updated) { 
> +      saveIncrementalUpdate(outStr);
> +    } else {
> +      // simply copy the original file
> +      int c;
> +      str->reset();
> +      while ((c = str->getChar()) != EOF) {
> +        outStr->put(c);
> +      }
> +      str->close();
> +    }
> +  }
> +
> +  delete outStr;
> +  fclose(f);
> +  return gTrue;
> +}
> +
> +void PDFDoc::saveIncrementalUpdate (OutStream* outStr)
> +{
> +  XRef *uxref;
> +  int c;
> +  //copy the original file
>    str->reset();
>    while ((c = str->getChar()) != EOF) {
> -    fputc(c, f);
> +    outStr->put(c);
>    }
>    str->close();
> -  fclose(f);
> -  return gTrue;
> +
> +  uxref = new XRef();
> +  uxref->add(0, 65535, 0, gFalse);
> +  int objectsCount = 0; //count the number of objects in the XRef(s)
> +  for(int i=0; i<xref->getNumObjects(); i++) {
> +    if ((xref->getEntry(i)->type == xrefEntryFree) && 
> +        (xref->getEntry(i)->gen == 0)) //we skip the irrelevant free objects
> +      continue;
> +    objectsCount++;
> +    if (!xref->getEntry(i)->obj.isNone() &&
> +        !xref->getEntry(i)->obj.isNull()) { //we have an updated object

If you can come up with a good name for !isNone() && !isNull() it might
be a good idea to make a helper function with that name that checks this
condition. The 'why?' from above could be addressed there too.

> +      Object obj1;
> +      Ref ref;
> +      ref.num = i;
> +      ref.gen = xref->getEntry(i)->gen;
> +      xref->fetch(ref.num, ref.gen, &obj1);
> +      Guint offset = writeObject(&obj1, &ref, outStr);
> +      uxref->add(ref.num, ref.gen, offset, gTrue);
> +      obj1.free();
> +    }
> +  }
> +  if (uxref->getSize() == 0) { //we have nothing to update
> +    delete uxref;
> +    return;
> +  }
> +
> +  Guint uxrefOffset = outStr->getPos();
> +  uxref->writeToFile(outStr);
> +
> +  writeTrailer(uxrefOffset, objectsCount, outStr, gTrue);
> +
> +  delete uxref;
> +}
> +
> +void PDFDoc::saveCompleteRewrite (OutStream* outStr)
> +{
> +  outStr->printf("%%PDF-%.1f\r\n",pdfVersion);
> +  XRef *uxref = new XRef();
> +  uxref->add(0, 65535, 0, gFalse);
> +  for(int i=0; i<xref->getNumObjects(); i++) {
> +    Object obj1;
> +    Ref ref;
> +    XRefEntryType type = xref->getEntry(i)->type;
> +    if (type == xrefEntryFree) {
> +      ref.num = i;
> +      ref.gen = xref->getEntry(i)->gen;
> +      /* the XRef class add a lot of irrelevant free entries, we only want the significant one
> +          and we don't want the one with num=0 because it has already been added (gen = 65535)*/

Should be 'XRef class adds'

> +      if (ref.gen > 0 && ref.num > 0)
> +        uxref->add(ref.num, ref.gen, 0, gFalse);
> +    } else if (type == xrefEntryUncompressed){ 
> +      ref.num = i;
> +      ref.gen = xref->getEntry(i)->gen;
> +      xref->fetch(ref.num, ref.gen, &obj1);
> +      Guint offset = writeObject(&obj1, &ref, outStr);
> +      uxref->add(ref.num, ref.gen, offset, gTrue);
> +      obj1.free();
> +    } else if (type == xrefEntryCompressed) {
> +      ref.num = i;
> +      ref.gen = 0; //compressed entries have gen == 0
> +      xref->fetch(ref.num, ref.gen, &obj1);
> +      Guint offset = writeObject(&obj1, &ref, outStr);
> +      uxref->add(ref.num, ref.gen, offset, gTrue);
> +      obj1.free();
> +    }
> +  }
> +  Guint uxrefOffset = outStr->getPos();
> +  uxref->writeToFile(outStr);
> +
> +  writeTrailer(uxrefOffset, uxref->getSize(), outStr, gFalse);
> +
> +
> +  delete uxref;
> +
> +}

How well does PDFDoc::saveCompleteRewrite work? i.e. given an arbitrary
pdf, what are the chances that it won't produce something good?

> +
> +void PDFDoc::writeDictionnary (Dict* dict, OutStream* outStr)
> +{
> +  Object obj1;
> +  outStr->printf("<<");
> +  for (int i=0; i<dict->getLength(); i++) {
> +    outStr->printf("/%s ", dict->getKey(i));
> +    writeObject(dict->getValNF(i, &obj1), NULL, outStr);
> +    obj1.free();
> +  }
> +  outStr->printf(">>");
> +}
> +
> +void PDFDoc::writeStream (Stream* str, OutStream* outStr)
> +{
> +  int c;
> +  outStr->printf("stream\r\n");
> +  str->reset();
> +  for (int c=str->getChar(); c!= EOF; c=str->getChar()) {
> +    outStr->printf("%c", c);  
> +  }
> +  outStr->printf("\r\nendstream\r\n");
> +}
> +
> +void PDFDoc::writeRawStream (Stream* str, OutStream* outStr)
> +{
> +  Object obj1;
> +  str->getDict()->lookup("Length", &obj1);
> +  if (!obj1.isInt()) {
> +    error (-1, "PDFDoc::writeRawStream, no Length in stream dict");
> +    return;
> +  }
> +
> +  const int length = obj1.getInt();
> +  obj1.free();
> +
> +  outStr->printf("stream\r\n");
> +  str->unfilteredReset();
> +  for (int i=0; i<length; i++) {
> +    int c = str->getUnfilteredChar();
> +    outStr->printf("%c", c);  
> +  }
> +  str->reset();
> +  outStr->printf("\r\nendstream\r\n");
> +}
> +
> +void PDFDoc::writeString (GooString* s, OutStream* outStr)
> +{
> +  if (s->hasUnicodeMarker()) {
> +    //unicode string don't necessary end with \0
> +    const char* c = s->getCString();
> +    outStr->printf("(");
> +    for(int i=0; i<s->getLength(); i++) {
> +      char unescaped = *(c+i)&0x000000ff;
> +      //escape if needed
> +      if (unescaped == '(' || unescaped == ')' || unescaped == '\\')
> +        outStr->printf("%c", '\\');
> +      outStr->printf("%c", unescaped);
> +    }
> +    outStr->printf(") ");
> +  } else {
> +    const char* c = s->getCString();
> +    outStr->printf("(");
> +    while(*c!='\0') {
> +      char unescaped = (*c)&0x000000ff;
> +      //escape if needed
> +      if (unescaped == '(' || unescaped == ')' || unescaped == '\\')
> +        outStr->printf("%c", '\\');
> +      outStr->printf("%c", unescaped);
> +      c++;
> +    }
> +    outStr->printf(") ");
> +  }
> +}
> +
> +Guint PDFDoc::writeObject (Object* obj, Ref* ref, OutStream* outStr)
> +{
> +  Array *array;
> +  Object obj1;
> +  Guint offset = outStr->getPos();
> +  int tmp;
> +
> +  if(ref) 
> +    outStr->printf("%i %i obj", ref->num, ref->gen);
> +
> +  switch (obj->getType()) {
> +    case objBool:
> +      outStr->printf("%s ", obj->getBool()?"true":"false");
> +      break;
> +    case objInt:
> +      outStr->printf("%i ", obj->getInt());
> +      break;
> +    case objReal:
> +      outStr->printf("%g ", obj->getReal());
> +      break;
> +    case objString:
> +      writeString(obj->getString(), outStr);
> +      break;
> +    case objName:
> +      outStr->printf("/%s ", obj->getName());
> +      break;
> +    case objNull:
> +      outStr->printf( "null");
> +      break;
> +    case objArray:
> +      array = obj->getArray();
> +      outStr->printf("[");
> +      for (int i=0; i<array->getLength(); i++) {
> +        writeObject(array->getNF(i, &obj1), NULL,outStr);
> +        obj1.free();
> +      }
> +      outStr->printf("]");
> +      break;
> +    case objDict:
> +      writeDictionnary (obj->getDict(),outStr);
> +      break;
> +    case objStream: 
> +      {
> +        //We can't modify stream with the current implementation (no write functions in Stream API)
> +        // => the only type of streams which that have been modified are internal streams (=strWeird)
> +        Stream *stream = obj->getStream();
> +        if (stream->getKind() == strWeird) {
> +          //we write the stream unencoded => TODO: write stream encoder
> +          stream->reset();
> +          //recalculate stream length
> +          tmp = 0;
> +          for (int c=stream->getChar(); c!=EOF; c=stream->getChar()) {
> +            tmp++;
> +          }
> +          obj1.initInt(tmp);
> +          stream->getDict()->set("Length", &obj1);
> +
> +          //Remove Stream encoding
> +          stream->getDict()->remove("Filter");
> +          stream->getDict()->remove("DecodeParms");
> +
> +          writeDictionnary (stream->getDict(),outStr);
> +          writeStream (stream,outStr);
> +          obj1.free();
> +        } else {
> +          //raw stream copy
> +          writeDictionnary (stream->getDict(), outStr);
> +          writeRawStream (stream, outStr);
> +        }
> +        break;
> +      }
> +    case objRef:
> +      outStr->printf("%i %i R ", obj->getRef().num, obj->getRef().gen);
> +      break;
> +    case objCmd:
> +      outStr->printf("cmd\r\n");
> +      break;
> +    case objError:
> +      outStr->printf("error\r\n");
> +      break;
> +    case objEOF:
> +      outStr->printf("eof\r\n");
> +      break;
> +    case objNone:
> +      outStr->printf("none\r\n");
> +      break;
> +    default:
> +      error(-1,"Unhandled objType : %i, please report a bug with a testcase\r\n", obj->getType());
> +      break;
> +  }
> +  if (ref)
> +    outStr->printf("endobj\r\n");
> +  return offset;
> +}
> +
> +void PDFDoc::writeTrailer (Guint uxrefOffset, int uxrefSize, OutStream* outStr, GBool incrUpdate)
> +{
> +  Dict *trailerDict = new Dict(xref);
> +  Object obj1;
> +  obj1.initInt(uxrefSize);
> +  trailerDict->set("Size", &obj1);
> +  obj1.free();
> +
> +
> +  //build a new ID, as recommended in the reference, uses:
> +  // - current time
> +  // - file name
> +  // - file size
> +  // - values of entry in information dictionnary
> +  GooString message;
> +  char buffer[256];
> +  sprintf(buffer, "%i", (int)time(NULL));

I don't really like the idea of using time here because it makes things
non-deterministic. Some people also consider it a security leak. Could
we use an md5 hash or something instead?

> +  message.append(buffer);
> +  message.append(fileName);
> +  // file size
> +  unsigned int fileSize = 0;
> +  int c;
> +  str->reset();
> +  while ((c = str->getChar()) != EOF) {
> +    fileSize++;
> +  }
> +  str->close();
> +  sprintf(buffer, "%i", fileSize);
> +  message.append(buffer);
> +
> +  //info dict -- only use text string
> +  if (xref->getDocInfo(&obj1)->isDict()) {
> +    for(int i=0; i<obj1.getDict()->getLength(); i++) {
> +      Object obj2;
> +      obj1.getDict()->getVal(i, &obj2);  
> +      if (obj2.isString()) {
> +        message.append(obj2.getString());
> +      }
> +      obj2.free();
> +    }
> +  }
> +  obj1.free();
> +
> +  //calculate md5 digest
> +  Guchar digest[16];
> +  Decrypt::md5((Guchar*)message.getCString(), message.getLength(), digest);
> +  obj1.initString(new GooString((const char*)digest, 16));
> +
> +  //create ID array
> +  Object obj2,obj3,obj4;
> +  obj2.initArray(xref);
> +
> +  if (incrUpdate) {
> +    //only update the second part of the array
> +    if(xref->getTrailerDict()->getDict()->lookup("ID", &obj4) != NULL) {
> +      if (!obj4.isArray()) {
> +        error(-1, "PDFDoc::writeTrailer original file's ID entry isn't an array. Trying to continue");
> +      } else {
> +        //Get the first part of the ID
> +        obj4.arrayGet(0,&obj3); 
> +
> +        obj2.arrayAdd(&obj3); 
> +        obj2.arrayAdd(&obj1);
> +        trailerDict->set("ID", &obj2);
> +      }
> +    }
> +  } else {
> +    //new file => same values for the two identifiers
> +    obj2.arrayAdd(&obj1);
> +    obj2.arrayAdd(&obj1);
> +    trailerDict->set("ID", &obj2);
> +  }
> +
> +
> +  obj1.initRef(xref->getRootNum(), xref->getRootGen());
> +  trailerDict->set("Root", &obj1);
> +  obj1.free();
> +
> +  if (incrUpdate) { 
> +    obj1.initInt(xref->getLastXRefPos());
> +    trailerDict->set("Prev", &obj1);
> +    obj1.free();
> +  }
> +  outStr->printf( "trailer\r\n");
> +  writeDictionnary(trailerDict, outStr);
> +  outStr->printf( "\r\nstartxref\r\n");
> +  outStr->printf( "%i\r\n", uxrefOffset);
> +  outStr->printf( "%%%%EOF\r\n");
>  }
> diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
> index e6992ba..2336694 100644
> --- a/poppler/PDFDoc.h
> +++ b/poppler/PDFDoc.h
> @@ -27,6 +27,12 @@ class LinkAction;
>  class LinkDest;
>  class Outline;
>  
> +enum PDFWriteMode {
> +  writeStandard,
> +  writeForceRewrite,
> +  writeForceIncremental
> +};
> +
>  //------------------------------------------------------------------------
>  // PDFDoc
>  //------------------------------------------------------------------------
> @@ -169,12 +175,22 @@ public:
>    double getPDFVersion() { return pdfVersion; }
>  
>    // Save this file with another name.
> -  GBool saveAs(GooString *name);
> +  GBool saveAs(GooString *name, PDFWriteMode mode=writeStandard);
>  
>    // Return a pointer to the GUI (XPDFCore or WinPDFCore object).
>    void *getGUIData() { return guiData; }
>  
>  private:
> +  // Add object to current file stream and return the offset of the beginning of the object
> +  Guint writeObject (Object *obj, Ref *ref, OutStream* outStr);
> +  void writeDictionnary (Dict* dict, OutStream* outStr);
> +  void writeStream (Stream* str, OutStream* outStr);
> +  void writeRawStream (Stream* str, OutStream* outStr);
> +  void writeTrailer (Guint uxrefOffset, int uxrefSize, OutStream* outStr, GBool incrUpdate);
> +  void writeString (GooString* s, OutStream* outStr);
> +  void saveIncrementalUpdate (OutStream* outStr);
> +  void saveCompleteRewrite (OutStream* outStr);
> +
>  
>    GBool setup(GooString *ownerPassword, GooString *userPassword);
>    GBool checkFooter();
> -- 
> 1.5.2.5
> 

> >From 83d2f0c7e784f10a86946c2665d7f870e9e69045 Mon Sep 17 00:00:00 2001
> From: Julien Rebetez <julien at fhtagn.net>
> Date: Fri, 26 Oct 2007 15:20:43 +0300
> Subject: [PATCH] Adds "deep" copy constructor to Dict.
> 
> ---
>  poppler/Dict.cc |   15 ++++++++++++++-
>  poppler/Dict.h  |    1 +
>  2 files changed, 15 insertions(+), 1 deletions(-)
> 
> diff --git a/poppler/Dict.cc b/poppler/Dict.cc
> index 0c74566..bc0d7b6 100644
> --- a/poppler/Dict.cc
> +++ b/poppler/Dict.cc
> @@ -30,6 +30,18 @@ Dict::Dict(XRef *xrefA) {
>    ref = 1;
>  }
>  
> +Dict::Dict(Dict* dictA) {
> +  xref = dictA->xref;
> +  size = length = dictA->length;
> +  ref = 1;
> +
> +  entries = (DictEntry *)gmallocn(size, sizeof(DictEntry));
> +  for (int i=0; i<length; i++) {
> +    entries[i].key = strdup(dictA->entries[i].key);
> +    dictA->entries[i].val.copy(&entries[i].val);
> +  }
> +}
> +
>  Dict::~Dict() {
>    int i;
>  
> @@ -89,7 +101,8 @@ void Dict::set(char *key, Object *val) {
>    e = find (key);
>    if (e) {
>      e->val.free();
> -    e->val = *val;
> +    //e->val = *val;

Do we want to keep the commented-out line?

> +    val->copy(&e->val);
>    } else {
>      add (copyString(key), val);
>    }
> diff --git a/poppler/Dict.h b/poppler/Dict.h
> index 0d1e6a1..badb6b5 100644
> --- a/poppler/Dict.h
> +++ b/poppler/Dict.h
> @@ -29,6 +29,7 @@ public:
>  
>    // Constructor.
>    Dict(XRef *xrefA);
> +  Dict(Dict* dictA);
>  
>    // Destructor.
>    ~Dict();
> -- 
> 1.5.2.5
> 

> >From 5da741dca3a1d94af22ff2aae4410fbf4a01bfb5 Mon Sep 17 00:00:00 2001
> From: Julien Rebetez <julien at fhtagn.net>
> Date: Fri, 26 Oct 2007 15:23:22 +0300
> Subject: [PATCH] Annot will save their generated appearance in their AP dict.
> 
> ---
>  poppler/Annot.cc |   37 +++++++++++++++++++++++++++++++++++++
>  poppler/Annot.h  |    4 +++-
>  2 files changed, 40 insertions(+), 1 deletions(-)
> 
> diff --git a/poppler/Annot.cc b/poppler/Annot.cc
> index 850b729..a1bb227 100644
> --- a/poppler/Annot.cc
> +++ b/poppler/Annot.cc
> @@ -26,6 +26,7 @@
>  #include "CharCodeToUnicode.h"
>  #include "Form.h"
>  #include "Error.h"
> +#include "XRef.h"
>  
>  #define annotFlagHidden    0x0002
>  #define annotFlagPrint     0x0004
> @@ -106,6 +107,8 @@ void Annot::initialize(XRef *xrefA, Dict *acroForm, Dict *dict, Catalog *catalog
>    double borderR, borderG, borderB;
>    double t;
>  
> +  appRef.num = 0;
> +  appRef.gen = 65535;
>    ok = gTrue;
>    xref = xrefA;
>    appearBuf = NULL;
> @@ -711,6 +714,40 @@ void Annot::generateFieldAppearance(Dict *field, Dict *annot, Dict *acroForm) {
>    appearance.free();
>    appearance.initStream(appearStream);
>  
> +
> +  appearStream->setNeedFree(gTrue);
> +
> +  if (widget->isModified()) {
> +    //create a new object that will contains the new appearance
> +    
> +    //if we already have a N entry in our AP dict, reuse it
> +    if (annot->lookup("AP", &obj1)->isDict() &&
> +        obj1.dictLookupNF("N", &obj2)->isRef()) {
> +      appRef = obj2.getRef();
> +    }
> +
> +    // this annot doesn't have an AP yet, create one
> +    if (appRef.num == 0)
> +      appRef = xref->addIndirectObject(&appearance);
> +    else // since we reuse the already existing AP, we have to notify the xref about this update
> +      xref->setModifiedObject(&appearance, appRef);
> +
> +    // update object's AP and AS
> +    Object apObj;
> +    apObj.initDict(xref);
> +
> +    Object oaRef;
> +    oaRef.initRef(appRef.num, appRef.gen);
> +
> +    apObj.dictSet("N", &oaRef);
> +    annot->set("AP", &apObj);
> +    Dict* d = new Dict(annot);
> +    Object dictObj;
> +    dictObj.initDict(d);
> +
> +    xref->setModifiedObject(&dictObj, ref);
> +  }
> +
>    if (fontDict) {
>      delete fontDict;
>    }
> diff --git a/poppler/Annot.h b/poppler/Annot.h
> index 50f5bfb..56af2f5 100644
> --- a/poppler/Annot.h
> +++ b/poppler/Annot.h
> @@ -115,7 +115,9 @@ private:
>    FormWidget *widget;           // FormWidget object for this annotation
>    GooString *type;              // annotation type
>    Object appearance;		// a reference to the Form XObject stream
> -				//   for the normal appearance
> +  Ref appRef; //the reference to the indirect appearance object in XRef 
> +
> +	// for the normal appearance
>    GooString *appearBuf;
>    Guint flags;
>    double xMin, yMin,		// annotation rectangle
> -- 
> 1.5.2.5
> 

> >From a0520cb87c6800c2252a980cbfcf5de0a9ff09e9 Mon Sep 17 00:00:00 2001
> From: Julien Rebetez <julien at fhtagn.net>
> Date: Fri, 26 Oct 2007 15:24:32 +0300
> Subject: [PATCH] FormWidget's modified field is now correctly updated.

Why?

> 
> ---
>  poppler/Form.cc |    7 ++++++-
>  1 files changed, 6 insertions(+), 1 deletions(-)
> 
> diff --git a/poppler/Form.cc b/poppler/Form.cc
> index 334e45c..21cc19a 100644
> --- a/poppler/Form.cc
> +++ b/poppler/Form.cc
> @@ -163,6 +163,7 @@ void FormWidgetButton::setState (GBool astate, GBool calledByParent)
>      return;
>    //the state modification may be denied by the parent. e.g we don't want to let the user put all combo boxes to false
>    if (!calledByParent) { //avoid infinite recursion
> +    modified = gTrue;
>      if (!parent->setState(childNum, astate)) {
>        return;
>      }
> @@ -348,6 +349,7 @@ void FormWidgetText::setContent(GooString* new_content)
>      return;
>    }
>  
> +  modified = gTrue;
>    if (new_content == NULL) {
>      parent->setContentCopy(NULL);
>    } else {
> @@ -364,7 +366,6 @@ void FormWidgetText::setContent(GooString* new_content)
>      obj.getDict()->set("V", &obj1);
>      //notify the xref about the update
>      xref->setModifiedObject(&obj, ref);
> -    modified = gTrue;
>    }
>  }
>  
> @@ -524,6 +525,7 @@ void FormWidgetChoice::select (int i)
>      return;
>    }
>    if (!_checkRange(i)) return;
> +  modified = gTrue;
>    parent->select(i);
>    _updateV();
>  }
> @@ -535,6 +537,7 @@ void FormWidgetChoice::toggle (int i)
>      return;
>    }
>    if (!_checkRange(i)) return;
> +  modified = gTrue;
>    parent->toggle(i);
>    _updateV();
>  }
> @@ -545,6 +548,7 @@ void FormWidgetChoice::deselectAll ()
>      error(-1, "FormWidgetChoice::deselectAll called on a read only field\n");
>      return;
>    }
> +  modified = gTrue;
>    parent->deselectAll();
>    _updateV();
>  }
> @@ -575,6 +579,7 @@ void FormWidgetChoice::setEditChoice (GooString* new_content)
>      return;
>    }
>  
> +  modified = gTrue;
>    if (new_content == NULL) {
>      parent->setEditChoice(NULL);
>    } else {
> -- 
> 1.5.2.5
> 

> >From b9e7214c516a9f2abb25a03493301ae0ece2006e Mon Sep 17 00:00:00 2001
> From: Julien Rebetez <julien at fhtagn.net>
> Date: Fri, 26 Oct 2007 16:40:11 +0300
> Subject: [PATCH] Fix memory management problem with appearBuf in Annot.
> 
> ---
>  poppler/Annot.cc |    6 ++----
>  1 files changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/poppler/Annot.cc b/poppler/Annot.cc
> index a1bb227..b84fd87 100644
> --- a/poppler/Annot.cc
> +++ b/poppler/Annot.cc
> @@ -332,9 +332,6 @@ Annot::~Annot() {
>      delete type;
>    }
>    appearance.free();
> -  if (appearBuf) {
> -    delete appearBuf;
> -  }
>  
>    if (borderStyle) {
>      delete borderStyle;
> @@ -709,11 +706,12 @@ void Annot::generateFieldAppearance(Dict *field, Dict *annot, Dict *acroForm) {
>    drObj.free();
>  
>    // build the appearance stream
> -  appearStream = new MemStream(appearBuf->getCString(), 0,
> +  appearStream = new MemStream(strdup(appearBuf->getCString()), 0,
>        appearBuf->getLength(), &appearDict);

I believe this will leak the strdupped copy of appearBuf->getCString()
because MemStream won't free it.

>    appearance.free();
>    appearance.initStream(appearStream);
>  
> +  delete appearBuf;
>  
>    appearStream->setNeedFree(gTrue);
>  
> -- 
> 1.5.2.5
> 

> >From dadfef5772c6539a3f888d0403d5219ac0857cf9 Mon Sep 17 00:00:00 2001
> From: Julien Rebetez <julien at fhtagn.net>
> Date: Fri, 26 Oct 2007 17:03:00 +0300
> Subject: [PATCH] Adds a test application to test full rewrite functionnality.
> 
> ---
>  test/Makefile.am        |   11 ++++++++++-
>  test/pdf-fullrewrite.cc |   44 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 54 insertions(+), 1 deletions(-)
>  create mode 100644 test/pdf-fullrewrite.cc
> 
> diff --git a/test/Makefile.am b/test/Makefile.am
> index df1809a..d46b988 100644
> --- a/test/Makefile.am
> +++ b/test/Makefile.am
> @@ -33,6 +33,9 @@ perf_test =				\
>  
>  endif
>  
> +pdf_fullrewrite = \
> +	pdf-fullrewrite
> +
>  INCLUDES =					\
>  	-I$(top_srcdir)				\
>  	-I$(top_srcdir)/poppler			\
> @@ -42,7 +45,7 @@ INCLUDES =					\
>  	$(GTK_TEST_CFLAGS)			\
>  	$(FONTCONFIG_CFLAGS)
>  
> -noinst_PROGRAMS = $(gtk_splash_test) $(gtk_cairo_test) $(pdf_inspector) $(perf_test)
> +noinst_PROGRAMS = $(gtk_splash_test) $(gtk_cairo_test) $(pdf_inspector) $(perf_test) $(pdf_fullrewrite)
>  
>  gtk_splash_test_SOURCES =			\
>         gtk-splash-test.cc
> @@ -79,5 +82,11 @@ perf_test_LDADD =				\
>  	$(top_builddir)/poppler/libpoppler.la	\
>  	$(FREETYPE_LIBS)
>  
> +pdf_fullrewrite_SOURCES = \
> +	pdf-fullrewrite.cc
> +
> +pdf_fullrewrite_LDADD = \
> +	$(top_builddir)/poppler/libpoppler.la
> +
>  EXTRA_DIST =					\
>  	pdf-operators.c
> diff --git a/test/pdf-fullrewrite.cc b/test/pdf-fullrewrite.cc
> new file mode 100644
> index 0000000..b782bc4
> --- /dev/null
> +++ b/test/pdf-fullrewrite.cc
> @@ -0,0 +1,44 @@
> +//========================================================================
> +//
> +// pdf-fullrewrite.cc
> +//
> +// Copyright 2007 Julien Rebetez
> +//
> +//========================================================================
> +#include "config.h"
> +#include <poppler-config.h>
> +#include "GlobalParams.h"
> +#include "Error.h"
> +#include "PDFDoc.h"
> +#include "goo/GooString.h"
> +
> +int main (int argc, char *argv[])
> +{
> +  PDFDoc *doc;
> +  GooString *inputName, *outputName;
> +
> +  // parse args
> +  if (argc < 3) {
> +    fprintf(stderr, "usage: %s INPUT-FILE OUTPUT-FILE\n", argv[0]);
> +    return 1;
> +  }
> +
> +  inputName = new GooString(argv[1]);
> +  outputName = new GooString(argv[2]);
> +
> +  globalParams = new GlobalParams();
> +
> +  doc = new PDFDoc(inputName);
> +
> +  if (!doc->isOk()) {
> +    delete doc;
> +    fprintf(stderr, "Error loading document !\n");
> +    return 1;
> +  }
> +
> +
> +  doc->saveAs(outputName, writeForceRewrite);
> +
> +  delete doc;
> +  delete globalParams;
> +}
> -- 
> 1.5.2.5
> 

Looks good.

Thanks for doing this work, it will be good to have in the tree.

-Jeff