[poppler] [patch] Add ability to extract embedded files.

Albert Astals Cid aacid at kde.org
Mon Aug 29 06:21:39 EST 2005


Seems fine to me

Albert

A Dissabte 27 Agost 2005 14:07, Brad Hards va escriure:
> G'day team,
>
> As a by-product of my flight to Akademy (yeah, thanks BA - I would like my
> clothes back )-:, I worked up this little patch. It provides the
> capability to extract an "attached" or embedded file. I haven't seen this
> used a lot, but it was an interesting exercise in understanding the API a
> bit more, and the feature is a lot more usable in Acrobat 7, so users might
> start attaching the source document to files.
>
> The change to the core poppler code is as shown below. The change is
> actually pretty small - I did have to expand the API for the NameTree class
> a little. I'm not sure I'm walking down the datastructures in a reliable
> way, because I forgot to generate a test file with Acrobat before leaving
> home, and resorted to creating one with pdftk. See the attached document to
> see the proposed Qt4 API / test application. I've also checked in that
> pdftk-created test example (into test/unittestcases/, as
> WithAttachments.pdf)
>
> There is potentially other metadata that could be extracted. At this stage
> there is a description that shows up in Acrobat Reader that I can't find in
> the file. I guess that Acrobat 7 will put more than pdftk-1.12, based on
> the column headers in Acrobat 7 Reader.
>
> This is a work in progress (especially the Qt4 bindings), but feedback
> would be appreciated - including help with testing it against
> Acrobat-generated files.
>
> Brad
>
> Index: Catalog.cc
> ===================================================================
> RCS file: /cvs/poppler/poppler/poppler/Catalog.cc,v
> retrieving revision 1.11
> diff -u -4 -p -r1.11 Catalog.cc
> --- Catalog.cc  27 Aug 2005 08:43:43 -0000      1.11
> +++ Catalog.cc  27 Aug 2005 11:48:36 -0000
> @@ -96,8 +96,11 @@ Catalog::Catalog(XRef *xrefA) {
>    if (catDict.dictLookup("Names", &obj)->isDict()) {
>      obj.dictLookup("Dests", &obj2);
>      destNameTree.init(xref, &obj2);
>      obj2.free();
> +    obj.dictLookup("EmbeddedFiles", &obj2);
> +    embeddedFileNameTree.init(xref, &obj2);
> +    obj2.free();
>    }
>    obj.free();
>
>    if (catDict.dictLookup("PageLabels", &obj)->isDict())
> @@ -183,8 +186,9 @@ Catalog::~Catalog() {
>      gfree(pageRefs);
>    }
>    dests.free();
>    destNameTree.free();
> +  embeddedFileNameTree.free();
>    if (baseURI) {
>      delete baseURI;
>    }
>    delete pageLabelInfo;
> @@ -345,8 +349,38 @@ LinkDest *Catalog::findDest(GooString *n
>
>    return dest;
>  }
>
> +EmbFile *Catalog::embeddedFile(int i)
> +{
> +  Object efDict;
> +  Object fileSpec;
> +  Object strObj;
> +  Object obj, obj2, obj4;
> +  obj = embeddedFileNameTree.getValue(i);
> +  GooString *fileName = new GooString();
> +  Stream *efStream;
> +  if (obj.isRef()) {
> +    if (obj.fetch(xref, &efDict)->isDict()) {
> +      efDict.dictLookup("F", &fileSpec);
> +      if (fileSpec.isString()) {
> +       fileName = new GooString(fileSpec.getString());
> +      }
> +      fileSpec.free();
> +      efDict.dictLookup("EF", &obj2);
> +      if (obj2.isDict()) {
> +       obj2.dictLookup("F", &strObj);
> +       if (strObj.isStream()) {
> +         efStream = strObj.getStream();
> +       }
> +       // iterate through the dict entries for obj2 and efDict()?
> +      }
> +      obj2.free();
> +    }
> +  }
> +  return new EmbFile(fileName, efStream);
> +}
> +
>  NameTree::NameTree(void)
>  {
>    size = 0;
>    length = 0;
> @@ -434,8 +468,17 @@ GBool NameTree::lookup(GooString *name,
>      return gFalse;
>    }
>  }
>
> +Object NameTree::getValue(int index)
> +{
> +  if (index < length) {
> +    return entries[index]->value;
> +  } else {
> +    return Object();
> +  }
> +}
> +
>  void NameTree::free()
>  {
>    int i;
>
> Index: Catalog.h
> ===================================================================
> RCS file: /cvs/poppler/poppler/poppler/Catalog.h,v
> retrieving revision 1.6
> diff -u -4 -p -r1.6 Catalog.h
> --- Catalog.h   7 Jul 2005 11:04:08 -0000       1.6
> +++ Catalog.h   27 Aug 2005 11:48:36 -0000
> @@ -31,8 +31,11 @@ public:
>    void init(XRef *xref, Object *tree);
>    void parse(Object *tree);
>    GBool lookup(GooString *name, Object *obj);
>    void free();
> +  int numEntries() { return length; };
> +  // iterator accessor
> +  Object getValue(int i);
>
>  private:
>    struct Entry {
>      Entry(Array *array, int index);
> @@ -47,9 +50,26 @@ private:
>
>    XRef *xref;
>    Object *root;
>    Entry **entries;
> -  int size, length;
> +  int size, length; // size is the number of entries in
> +                    // the array of Entry*
> +                    // length is the number of real Entry
> +};
> +
> +class EmbFile {
> +public:
> +  EmbFile(GooString *name, Stream *efStream):
> +    m_name(name),
> +    m_efStream(efStream)
> +  {}
> +
> +  GooString *name() { return m_name; }
> +  Stream *stream() { return m_efStream; }
> +
> +private:
> +  GooString *m_name;
> +  Stream *m_efStream;
>  };
>
>  //------------------------------------------------------------------------
>  // Catalog
> @@ -93,8 +113,14 @@ public:
>    // Find a named destination.  Returns the link destination, or
>    // NULL if <name> is not a destination.
>    LinkDest *findDest(GooString *name);
>
> +  // Get the number of embedded files
> +  int numEmbeddedFiles() { return embeddedFileNameTree.numEntries(); }
> +
> +  // Get the i'th file embedded (at the Document level) in the document
> +  EmbFile *embeddedFile(int i);
> +
>    // Convert between page indices and page labels.
>    GBool labelToIndex(GooString *label, int *index);
>    GBool indexToLabel(int index, GooString *label);
>
> @@ -129,9 +155,10 @@ private:
>    Ref *pageRefs;               // object ID for each page
>    int numPages;                        // number of pages
>    int pagesSize;               // size of pages array
>    Object dests;                        // named destination dictionary
> -  NameTree destNameTree;       // name tree
> +  NameTree destNameTree;       // named destination name-tree
> +  NameTree embeddedFileNameTree;  // embedded file name-tree
>    GooString *baseURI;          // base URI for URI-type links
>    Object metadata;             // metadata stream
>    Object structTreeRoot;       // structure tree root dictionary
>    Object outline;              // outline dictionary


More information about the poppler mailing list