[poppler] poppler/Gfx.cc poppler/Stream.cc poppler/Stream.h utils/ImageOutputDev.cc utils/ImageOutputDev.h

Adrian Johnson ajohnson at kemper.freedesktop.org
Wed Aug 16 21:35:26 UTC 2017


 poppler/Gfx.cc          |    2 -
 poppler/Stream.cc       |   96 ++++++++++++++++++++++++++++++++++++++++++------
 poppler/Stream.h        |   14 ++++++-
 utils/ImageOutputDev.cc |   67 ++++++++++++++++++++++++++++++---
 utils/ImageOutputDev.h  |    1 
 5 files changed, 159 insertions(+), 21 deletions(-)

New commits:
commit 488d28ec9507eb99c7cb4cd2cafb54995a8bc9f8
Author: Adrian Johnson <ajohnson at redneon.com>
Date:   Wed Aug 16 21:01:07 2017 +0930

    pdfimages: support listing/extracting inline images
    
    The difficulty with extracting inline images is that inline images do
    not provide any way of determining the length or end of image data
    without decoding the image. We can get the length by using ImageStream
    to decode the data then check the stream position. But then we are
    still unable to extract the undecoded image data because embedded
    streams can only be read once.
    
    Since inline images tend to be small, the solution implemented is to
    modify EmbedStream to keep a copy of the data read from it in memory
    and then allow the data to be read again.
    
    Two new functions have been added to EmbedStream. rewind() will cause
    EmbedStream::getChar() to stop recording data and switch to replaying
    the saved data, returning EOF when the end of the saved data is
    reached. The restore() function will make getChar() switch back to
    reading from the parent stream.
    
    ImageOutputDev can now extract an inline image, or get its size, by
    first using ImageStream to read data from the embedded stream. After
    calling rewind() the undecoded image data can be read from the
    embedded stream until EOF is returned. Then restore() is called so
    that Gfx can read the 'EI' from the end of the embedded stream.
    
    Bug 25625

diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc
index be9810e1..2bfc1ecd 100644
--- a/poppler/Gfx.cc
+++ b/poppler/Gfx.cc
@@ -4901,7 +4901,7 @@ Stream *Gfx::buildImageStream() {
 
   // make stream
   if (parser->getStream()) {
-    str = new EmbedStream(parser->getStream(), std::move(dict), gFalse, 0);
+    str = new EmbedStream(parser->getStream(), std::move(dict), gFalse, 0, gTrue);
     str = str->addFilters(str->getDict());
   } else {
     str = NULL;
diff --git a/poppler/Stream.cc b/poppler/Stream.cc
index df767652..9cb48abc 100644
--- a/poppler/Stream.cc
+++ b/poppler/Stream.cc
@@ -1033,14 +1033,25 @@ void MemStream::moveStart(Goffset delta) {
 //------------------------------------------------------------------------
 
 EmbedStream::EmbedStream(Stream *strA, Object &&dictA,
-			 GBool limitedA, Goffset lengthA):
+			 GBool limitedA, Goffset lengthA, GBool reusableA):
     BaseStream(std::move(dictA), lengthA) {
   str = strA;
   limited = limitedA;
   length = lengthA;
+  reusable = reusableA;
+  record = gFalse;
+  replay = gFalse;
+  if (reusable) {
+    bufData = (unsigned char*)gmalloc(16384);
+    bufMax = 16384;
+    bufLen = 0;
+    record = gTrue;
+  }
 }
 
 EmbedStream::~EmbedStream() {
+  if (reusable)
+    gfree(bufData);
 }
 
 BaseStream *EmbedStream::copy() {
@@ -1054,31 +1065,94 @@ Stream *EmbedStream::makeSubStream(Goffset start, GBool limitedA,
   return NULL;
 }
 
+void EmbedStream::rewind() {
+  record = gFalse;
+  replay = gTrue;
+  bufPos = 0;
+}
+
+void EmbedStream::restore() {
+  replay = gFalse;
+}
+
+Goffset EmbedStream::getPos() {
+  if (replay)
+    return bufPos;
+  else
+    return str->getPos();
+}
+
 int EmbedStream::getChar() {
-  if (limited && !length) {
-    return EOF;
+  if (replay) {
+    if (bufPos < bufLen)
+      return bufData[bufPos++];
+    else
+      return EOF;
+  } else {
+    if (limited && !length) {
+      return EOF;
+    }
+    int c = str->getChar();
+    --length;
+    if (record) {
+      bufData[bufLen] = c;
+      bufLen++;
+      if (bufLen >= bufMax) {
+        bufMax *= 2;
+        bufData = (unsigned char *)grealloc(bufData, bufMax);
+      }
+    }
+    return c;
   }
-  --length;
-  return str->getChar();
 }
 
 int EmbedStream::lookChar() {
-  if (limited && !length) {
-    return EOF;
+  if (replay) {
+    if (bufPos < bufLen)
+      return bufData[bufPos];
+    else
+      return EOF;
+  } else {
+    if (limited && !length) {
+      return EOF;
+    }
+    return str->lookChar();
   }
-  return str->lookChar();
 }
 
 int EmbedStream::getChars(int nChars, Guchar *buffer) {
+  int len;
+
   if (nChars <= 0) {
     return 0;
   }
-  if (limited && length < nChars) {
-    nChars = length;
+  if (replay) {
+    if (bufPos >= bufLen)
+      return EOF;
+    len = bufLen - bufPos;
+    if (nChars > len)
+      nChars = len;
+    memcpy(buffer, bufData, len);
+    return len;
+  } else {
+    if (limited && length < nChars) {
+      nChars = length;
+    }
+    len = str->doGetChars(nChars, buffer);
+    if (record) {
+      if (bufLen + len >= bufMax) {
+        while (bufLen + len >= bufMax)
+          bufMax *= 2;
+        bufData = (unsigned char *)grealloc(bufData, bufMax);
+      }
+      memcpy(bufData+bufLen, buffer, len);
+      bufLen += len;
+    }
   }
-  return str->doGetChars(nChars, buffer);
+  return len;
 }
 
+
 void EmbedStream::setPos(Goffset pos, int dir) {
   error(errInternal, -1, "Internal: called setPos() on EmbedStream");
 }
diff --git a/poppler/Stream.h b/poppler/Stream.h
index 2317080e..7e67697c 100644
--- a/poppler/Stream.h
+++ b/poppler/Stream.h
@@ -607,7 +607,7 @@ private:
 class EmbedStream: public BaseStream {
 public:
 
-  EmbedStream(Stream *strA, Object &&dictA, GBool limitedA, Goffset lengthA);
+  EmbedStream(Stream *strA, Object &&dictA, GBool limitedA, Goffset lengthA, GBool reusableA = gFalse);
   ~EmbedStream();
   BaseStream *copy() override;
   Stream *makeSubStream(Goffset start, GBool limitedA,
@@ -616,7 +616,7 @@ public:
   void reset() override {}
   int getChar() override;
   int lookChar() override;
-  Goffset getPos() override { return str->getPos(); }
+  Goffset getPos() override;
   void setPos(Goffset pos, int dir = 0) override;
   Goffset getStart() override;
   void moveStart(Goffset delta) override;
@@ -624,6 +624,8 @@ public:
   int getUnfilteredChar () override { return str->getUnfilteredChar(); }
   void unfilteredReset () override { str->unfilteredReset(); }
 
+  void rewind();
+  void restore();
 
 private:
 
@@ -632,6 +634,14 @@ private:
 
   Stream *str;
   GBool limited;
+  GBool reusable;
+  GBool record;
+  GBool replay;
+  unsigned char *bufData;
+  long bufMax;
+  long bufLen;
+  long bufPos;
+
 };
 
 //------------------------------------------------------------------------
diff --git a/utils/ImageOutputDev.cc b/utils/ImageOutputDev.cc
index f6fb35dd..33cbb714 100644
--- a/utils/ImageOutputDev.cc
+++ b/utils/ImageOutputDev.cc
@@ -246,7 +246,9 @@ void ImageOutputDev::listImage(GfxState *state, Object *ref, Stream *str,
     printf("%5.0f ", yppi);
 
   Goffset embedSize = -1;
-  if (!inlineImg)
+  if (inlineImg)
+    embedSize = getInlineImageLength(str, width, height, colorMap);
+  else
     embedSize = str->getBaseStream()->getLength();
 
   long long imageSize = 0;
@@ -311,6 +313,43 @@ void ImageOutputDev::listImage(GfxState *state, Object *ref, Stream *str,
   }
 }
 
+long ImageOutputDev::getInlineImageLength(Stream *str, int width, int height,
+                                          GfxImageColorMap *colorMap) {
+  long len;
+
+  if (colorMap) {
+    ImageStream *imgStr = new ImageStream(str, width, colorMap->getNumPixelComps(),
+                                          colorMap->getBits());
+    imgStr->reset();
+    for (int y = 0; y < height; y++)
+      imgStr->getLine();
+
+    imgStr->close();
+    delete imgStr;
+  } else {
+    str->reset();
+    for (int y = 0; y < height; y++) {
+      int size = (width + 7)/8;
+      for (int x = 0; x < size; x++)
+        str->getChar();
+    }
+  }
+
+  EmbedStream *embedStr = (EmbedStream *) (str->getBaseStream());
+  embedStr->rewind();
+  if (str->getKind() == strDCT || str->getKind() == strCCITTFax)
+    str = str->getNextStream();
+  len = 0;
+  str->reset();
+  while (str->getChar() != EOF)
+    len++;
+
+  embedStr->restore();
+
+
+  return len;
+}
+
 void ImageOutputDev::writeRawImage(Stream *str, const char *ext) {
   FILE *f;
   int c;
@@ -498,15 +537,21 @@ void ImageOutputDev::writeImage(GfxState *state, Object *ref, Stream *str,
 				int width, int height,
 				GfxImageColorMap *colorMap, GBool inlineImg) {
   ImageFormat format;
+  EmbedStream *embedStr;
 
-  if (dumpJPEG && str->getKind() == strDCT &&
-      (colorMap->getNumPixelComps() == 1 ||
-       colorMap->getNumPixelComps() == 3) &&
-      !inlineImg) {
+  if (dumpJPEG && str->getKind() == strDCT) {
+    if (inlineImg) {
+      embedStr = (EmbedStream *) (str->getBaseStream());
+      getInlineImageLength(str, width, height, colorMap); // record the stream
+      embedStr->rewind();
+    }
 
     // dump JPEG file
     writeRawImage(str, "jpg");
 
+    if (inlineImg)
+      embedStr->restore();
+
   } else if (dumpJP2 && str->getKind() == strJPX && !inlineImg) {
     // dump JPEG2000 file
     writeRawImage(str, "jp2");
@@ -535,7 +580,7 @@ void ImageOutputDev::writeImage(GfxState *state, Object *ref, Stream *str,
     // dump JBIG2 embedded file
     writeRawImage(str, "jb2e");
 
-  } else if (dumpCCITT && str->getKind() == strCCITTFax && !inlineImg) {
+  } else if (dumpCCITT && str->getKind() == strCCITTFax) {
     // write CCITT parameters
     CCITTFaxStream *ccittStr = static_cast<CCITTFaxStream *>(str);
     FILE *f;
@@ -567,14 +612,22 @@ void ImageOutputDev::writeImage(GfxState *state, Object *ref, Stream *str,
 
     fclose(f);
 
+    if (inlineImg) {
+      embedStr = (EmbedStream *) (str->getBaseStream());
+      getInlineImageLength(str, width, height, colorMap); // record the stream
+      embedStr->rewind();
+    }
+
     // dump CCITT file
     writeRawImage(str, "ccitt");
 
+    if (inlineImg)
+      embedStr->restore();
+
   } else if (outputPNG && !(outputTiff && colorMap &&
                             (colorMap->getColorSpace()->getMode() == csDeviceCMYK ||
                              (colorMap->getColorSpace()->getMode() == csICCBased &&
                               colorMap->getNumPixelComps() == 4)))) {
-
     // output in PNG format
 
 #if ENABLE_LIBPNG
diff --git a/utils/ImageOutputDev.h b/utils/ImageOutputDev.h
index 22954cf0..baccd8ef 100644
--- a/utils/ImageOutputDev.h
+++ b/utils/ImageOutputDev.h
@@ -160,6 +160,7 @@ private:
   void writeRawImage(Stream *str, const char *ext);
   void writeImageFile(ImgWriter *writer, ImageFormat format, const char *ext,
                       Stream *str, int width, int height, GfxImageColorMap *colorMap);
+  long getInlineImageLength(Stream *str, int width, int height, GfxImageColorMap *colorMap);
 
   char *fileRoot;		// root of output file names
   char *fileName;		// buffer for output file names


More information about the poppler mailing list