[PATCH] Added rudimentary HTTP streaming support.

Stefan Thomas thomas at eload24.com
Fri Oct 16 09:40:48 PDT 2009


Currently this is not optimized at all. It takes over a thousand HTTP requests (although libcurl does keep the connection alive at least) to render one page of a 35MB test PDF.

On the plus side, there is a cache, so blocks that have already been downloaded are not downloaded again. Also, no change to pdftoppm was necessary while testing, since PDFDoc itself takes care of recognizing the filename as a URL.

Note that no changes were necessary outside of poppler itself, meaning the API did not change at all. It just accepts filenames starting with http:// or https:// now.
---
 CMakeLists.txt       |    2 +
 poppler/CurlCache.cc |  210 ++++++++++++++++++++++++++++++++++++++++++++++++++
 poppler/CurlCache.h  |   97 +++++++++++++++++++++++
 poppler/Makefile.am  |    2 +
 poppler/PDFDoc.cc    |   53 +++++++++----
 poppler/Stream.cc    |   93 ++++++++++++++++++++++
 poppler/Stream.h     |   53 +++++++++++++
 7 files changed, 493 insertions(+), 17 deletions(-)
 create mode 100644 poppler/CurlCache.cc
 create mode 100644 poppler/CurlCache.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1c609bf..fcd60e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -164,6 +164,7 @@ set(poppler_SRCS
   poppler/Catalog.cc
   poppler/CharCodeToUnicode.cc
   poppler/CMap.cc
+  poppler/CurlCache.cc
   poppler/DateInfo.cc
   poppler/Decrypt.cc
   poppler/Dict.cc
@@ -294,6 +295,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/Catalog.h
     poppler/CharCodeToUnicode.h
     poppler/CMap.h
+    poppler/CurlCache.h
     poppler/DateInfo.h
     poppler/Decrypt.h
     poppler/Dict.h
diff --git a/poppler/CurlCache.cc b/poppler/CurlCache.cc
new file mode 100644
index 0000000..0134acf
--- /dev/null
+++ b/poppler/CurlCache.cc
@@ -0,0 +1,210 @@
+//========================================================================
+//
+// CurlCache.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2009 Stefan Thomas <thomas at eload24.com>
+//
+//========================================================================
+
+#include "CurlCache.h"
+
+#ifdef ENABLE_LIBCURL
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include "Error.h"
+#include <curl/curl.h>
+
+//------------------------------------------------------------------------
+
+CurlCache::CurlCache(GooString *urlA) {
+  // Keeps the caller's URL string and asks the server for the document
+  // length with a body-less (HEAD-style) request.
+  url = urlA;
+  long code = 0;
+  double contentLength = -1;
+
+  curl = curl_easy_init();
+  curl_easy_setopt(curl, CURLOPT_URL, url->getCString());
+  // numeric libcurl options are read as long, so pass long literals
+  curl_easy_setopt(curl, CURLOPT_HEADER, 1L);
+  curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &CurlCache::noop);
+  curl_easy_perform(curl);
+  curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
+  curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &contentLength);
+  curl_easy_reset(curl);
+  size = contentLength;  // stays -1 when the server reports no length
+  streamPos = 0;
+}
+
+CurlCache::~CurlCache() {
+  curl_easy_cleanup(curl);  // releases the handle created in the constructor
+}
+
+GooString *CurlCache::getFileName() {
+  int i, sl = 0, end = url->getLength();
+  for (i = 6; i < end; i++) {
+    // note position after last slash
+    if (url->getChar(i) == '/') sl = i+1;
+    // stop at the first question mark so that slashes inside the query
+    // string are not mistaken for path separators
+    if (url->getChar(i) == '?') end = i;
+  }
+  // GooString(str, idx, length) takes a length, so pass end - sl, not end
+  return new GooString(url, sl, end - sl);
+}
+
+long int CurlCache::tell() {
+  return streamPos;  // current read offset, in bytes from the document start
+}
+
+int CurlCache::seek(long int offset, int origin) {
+  // Repositions the read cursor like fseek(); always reports success (0).
+  switch (origin) {
+  case SEEK_SET: streamPos = offset; break;
+  case SEEK_CUR: streamPos += offset; break;
+  // any other origin is treated as SEEK_END
+  default: streamPos = size + offset; break;
+  }
+
+  return 0;
+}
+
+size_t CurlCache::read(void *ptr, size_t unitsize, size_t count) {
+  // Copies up to unitsize*count bytes, starting at the current stream
+  // position, from the chunk cache into ptr, downloading chunks that are
+  // not cached yet. Advances the stream position and returns the number
+  // of bytes copied, which is less than requested when the request
+  // reaches past the end of the document.
+  size_t toCopy = unitsize*count;
+  if (streamPos < 0)
+    return 0;
+  // Clamp the request to the document size so that no chunks beyond the
+  // end are requested. A negative size means the length is unknown, in
+  // which case the request is passed through unchanged.
+  if (size >= 0) {
+    if (streamPos >= size)
+      return 0;
+    if (toCopy > (size_t)(size - streamPos))
+      toCopy = size - streamPos;
+  }
+  if (toCopy == 0)
+    return 0;
+  // Keep positions as long so that large offsets are not truncated to int.
+  long int endPos = streamPos + (long int)toCopy;
+  int startBlock = streamPos / curlCacheChunkSize;
+  int endBlock = (endPos-1) / curlCacheChunkSize;
+
+  // Make sure data is in cache
+  loadChunks(startBlock, endBlock);
+
+  // Write data to buffer
+  size_t copied = toCopy;
+
+  while (toCopy) {
+    int chunk = streamPos / curlCacheChunkSize;
+    int offset = streamPos % curlCacheChunkSize;
+    // a single memcpy never crosses a chunk boundary
+    size_t len = curlCacheChunkSize-offset;
+    if (len > toCopy)
+      len = toCopy;
+    memcpy(ptr, chunks[chunk].data + offset, len);
+    streamPos += len;
+    toCopy -= len;
+    ptr = (char*)ptr + len;
+  }
+
+  return copied;
+}
+
+void CurlCache::loadChunks(int startBlock, int endBlock) {
+  // Downloads every chunk in [startBlock, endBlock] that is not cached
+  // yet; runs of adjacent missing chunks share one range request.
+  int i = startBlock;
+
+  while (i <= endBlock) {
+    if (chunks[i].state != cccStateNew) {
+      i++;
+      continue;
+    }
+
+    // extend the run while the next chunk is missing as well
+    int startSequence = i;
+    while (i < endBlock && chunks[i+1].state == cccStateNew) {
+      i++;
+    }
+
+    // The job is only needed for this one transfer, so it lives on the
+    // stack rather than being heap-allocated and never freed.
+    CurlCacheJob ccj(this, startSequence, i++);
+    ccj.run();
+  }
+}
+
+size_t CurlCache::noop(void *ptr, size_t size, size_t nmemb, void *ptr2) {
+  return size*nmemb;  // discard the data but report it as fully consumed
+}
+
+//------------------------------------------------------------------------
+
+CurlCacheJob::CurlCacheJob(CurlCache *ccA, int startBlockA, int endBlockA) {
+  // remember the cache to fill and the inclusive chunk range to download
+  cc = ccA;
+  startBlock = startBlockA;
+  endBlock = endBlockA;
+}
+
+void CurlCacheJob::run() {
+  // Fetches chunks startBlock..endBlock with one HTTP range request; the
+  // write callback stores the received bytes in the cache.
+  int fromByte = startBlock * curlCacheChunkSize;
+  int toByte = ((endBlock+1) * curlCacheChunkSize)-1;
+  // clamp to the last byte, but only when the document size is known
+  if (cc->size > 0 && toByte >= cc->size-1) {
+    toByte = cc->size-1;
+  }
+  GooString *range = GooString::format("{0:ud}-{1:ud}", fromByte, toByte);
+  currentByte = fromByte;
+
+  curl_easy_setopt(cc->curl, CURLOPT_URL, cc->url->getCString());
+  curl_easy_setopt(cc->curl, CURLOPT_WRITEFUNCTION, &CurlCacheJob::write);
+  curl_easy_setopt(cc->curl, CURLOPT_WRITEDATA, this);
+  curl_easy_setopt(cc->curl, CURLOPT_RANGE, range->getCString());
+  curl_easy_perform(cc->curl);
+  curl_easy_reset(cc->curl);
+  delete range;  // GooString::format() returns a new string we must free
+}
+
+size_t CurlCacheJob::write(void *ptr, size_t size, size_t nmemb, CurlCacheJob *ccj) {
+  // libcurl write callback: copies the received bytes into the cache chunks.
+  size_t toCopy = size*nmemb;
+  // ccj->currentByte tracks where in the document the next byte belongs
+  while (toCopy) {
+    int chunk = ccj->currentByte / curlCacheChunkSize;
+    int offset = ccj->currentByte % curlCacheChunkSize;
+    // bytes left in this chunk from offset on
+    size_t len = curlCacheChunkSize-offset;
+
+    if (len > toCopy)
+      len = toCopy;
+
+    // copy into the chunk (the map creates the entry on first access)
+    memcpy(&ccj->cc->chunks[chunk].data[offset], ptr, len);
+    ccj->currentByte += len;
+    toCopy -= len;
+    ptr = (char*)ptr + len;
+    // NOTE(review): marked loaded even if only part of the chunk was written
+    ccj->cc->chunks[chunk].state = cccStateLoaded;
+  }
+
+  return size*nmemb;
+}
+
+#endif
diff --git a/poppler/CurlCache.h b/poppler/CurlCache.h
new file mode 100644
index 0000000..20eda89
--- /dev/null
+++ b/poppler/CurlCache.h
@@ -0,0 +1,97 @@
+//========================================================================
+//
+// CurlCache.h
+//
+// Caching wrapper around curl.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2009 Stefan Thomas <thomas at eload24.com>
+//
+//========================================================================
+
+#ifndef CURLCACHE_H
+#define CURLCACHE_H
+
+#include <config.h>
+
+#ifdef ENABLE_LIBCURL
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "poppler-config.h"
+#include "goo/gtypes.h"
+#include "goo/GooString.h"
+
+#include <curl/curl.h>
+
+#include <map>
+
+//------------------------------------------------------------------------
+
+#define curlCacheChunkSize 8192
+
+enum CurlCacheChunkState {
+  cccStateNew,     // chunk has not been downloaded yet
+// If we want this to be thread-safe, concurrent, whatever, we need another state:
+// cccStateLoading,
+  cccStateLoaded   // chunk data has been written by a download
+};
+
+typedef struct {
+  CurlCacheChunkState state;       // whether data has been downloaded
+  char data[curlCacheChunkSize];   // raw bytes of this part of the document
+} CurlCacheChunk;
+
+class CurlCache {
+public:
+  // CurlCacheJob writes downloaded data straight into the private members.
+  friend class CurlCacheJob;
+  // Stores urlA (no copy is made) and queries the document length.
+  CurlCache(GooString *urlA);
+  ~CurlCache();
+  // Returns a new string with the file-name part of the URL.
+  GooString *getFileName();
+  // stdio-like access to the document; seek() takes SEEK_SET/CUR/END.
+  long int tell();
+  int seek(long int offset, int origin);
+  size_t read(void * ptr, size_t unitsize, size_t count);
+  // Downloads the chunks in startBlock..endBlock that are not cached yet.
+  void loadChunks(int startBlock, int endBlock);
+
+private:
+
+  CURL *curl;          // handle reused for every request
+  GooString *url;      // document URL as passed to the constructor
+  long int size;       // document length in bytes from the server
+  long int streamPos;  // current read offset
+  // downloaded data, keyed by chunk index (offset / curlCacheChunkSize)
+  std::map<unsigned, CurlCacheChunk> chunks;
+  // write callback that ignores the response body
+  static size_t noop(void *ptr, size_t size, size_t nmemb, void *ptr2);
+
+};
+
+class CurlCacheJob {
+public:
+  // One download of the chunks startBlockA..endBlockA (inclusive) into ccA.
+  CurlCacheJob(CurlCache *ccA, int startBlockA, int endBlockA);
+  // Performs the HTTP range request on the cache's curl handle.
+  void run();
+
+private:
+
+  CurlCache *cc;    // cache that receives the data
+  int startBlock;   // first chunk index to fetch
+  int endBlock;     // last chunk index to fetch
+  int currentByte;  // document offset of the next byte to be received
+  // libcurl write callback; ccj is the job passed as CURLOPT_WRITEDATA
+  static size_t write(void *ptr, size_t size, size_t nmemb, CurlCacheJob *ccj);
+
+};
+
+#endif
+
+#endif
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 24a22a4..883bb29 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -173,6 +173,7 @@ poppler_include_HEADERS =	\
 	Catalog.h		\
 	CharCodeToUnicode.h	\
 	CMap.h			\
+	CurlCache.h		\
 	DateInfo.h		\
 	Decrypt.h		\
 	Dict.h			\
@@ -244,6 +245,7 @@ libpoppler_la_SOURCES =		\
 	Catalog.cc 		\
 	CharCodeToUnicode.cc	\
 	CMap.cc			\
+	CurlCache.cc	\
 	DateInfo.cc		\
 	Decrypt.cc		\
 	Dict.cc 		\
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 4af402b..4180956 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -58,6 +58,9 @@
 #include "Parser.h"
 #include "SecurityHandler.h"
 #include "Decrypt.h"
+#ifdef ENABLE_LIBCURL
+#include "CurlCache.h"
+#endif
 #ifndef DISABLE_OUTLINE
 #include "Outline.h"
 #endif
@@ -89,28 +92,44 @@ PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword,
   outline = NULL;
 #endif
 
-  fileName = fileNameA;
+  if (fileNameA->cmpN("http://", 7) == 0 || fileNameA->cmpN("https://", 8) == 0) {
+#ifndef ENABLE_LIBCURL
+    error(-1, "Couldn't open file '%s': HTTP support not compiled in.", fileNameA->getCString());
+    errCode = errOpenFile;
+    return;
+#else
+  	CurlCache *cc = new CurlCache(fileNameA);
+  	
+  	fileName = cc->getFileName();
+  	
+    // create streamObject obj;
+    obj.initNull();
+    str = new HttpStream(cc, 0, gFalse, 0, &obj);
+#endif
+  } else {
+    fileName = fileNameA;
 
-  // try to open file
+    // try to open file
 #ifdef VMS
-  file = fopen(fileName->getCString(), "rb", "ctx=stm");
+    file = fopen(fileName->getCString(), "rb", "ctx=stm");
 #else
-  file = fopen(fileName->getCString(), "rb");
+    file = fopen(fileName->getCString(), "rb");
 #endif
-  if (file == NULL) {
-    // fopen() has failed.
-    // Keep a copy of the errno returned by fopen so that it can be 
-    // referred to later.
-    fopenErrno = errno;
-    error(-1, "Couldn't open file '%s': %s.", fileName->getCString(),
-                                              strerror(errno));
-    errCode = errOpenFile;
-    return;
-  }
+    if (file == NULL) {
+    	// fopen() has failed.
+    	// Keep a copy of the errno returned by fopen so that it can be 
+    	// referred to later.
+    	fopenErrno = errno;
+    	error(-1, "Couldn't open file '%s': %s.", fileName->getCString(),
+    	                                          strerror(errno));
+    	errCode = errOpenFile;
+    	return;
+    }
 
-  // create stream
-  obj.initNull();
-  str = new FileStream(file, 0, gFalse, 0, &obj);
+    // create stream
+    obj.initNull();
+    str = new FileStream(file, 0, gFalse, 0, &obj);
+  }
 
   ok = setup(ownerPassword, userPassword);
 }
diff --git a/poppler/Stream.cc b/poppler/Stream.cc
index 01efcd6..ce2905b 100644
--- a/poppler/Stream.cc
+++ b/poppler/Stream.cc
@@ -789,6 +789,99 @@ void FileStream::moveStart(int delta) {
 }
 
 //------------------------------------------------------------------------
+// HttpStream
+//------------------------------------------------------------------------
+
+#ifdef ENABLE_LIBCURL
+
+HttpStream::HttpStream(CurlCache *ccA, Guint startA, GBool limitedA,
+		       Guint lengthA, Object *dictA):
+    BaseStream(dictA) {
+  cc = ccA;
+  start = startA;
+  limited = limitedA;
+  length = lengthA;
+  bufPtr = bufEnd = buf;  // empty buffer: the first read triggers fillBuf()
+  bufPos = start;
+  savePos = 0;
+  saved = gFalse;
+}
+
+HttpStream::~HttpStream() {
+  close();  // restores the cache position saved by reset(), if any
+}
+
+Stream *HttpStream::makeSubStream(Guint startA, GBool limitedA,
+				  Guint lengthA, Object *dictA) {
+  return new HttpStream(cc, startA, limitedA, lengthA, dictA);  // same cache
+}
+
+void HttpStream::reset() {
+  savePos = (Guint)cc->tell();
+  cc->seek(start, SEEK_SET);
+  // remember that close() has to put the cache position back
+  saved = gTrue;
+  bufPtr = bufEnd = buf;
+  bufPos = start;
+}
+
+void HttpStream::close() {
+  if (saved) {  // only undo a preceding reset()
+    cc->seek(savePos, SEEK_SET);
+    saved = gFalse;
+  }
+}
+
+GBool HttpStream::fillBuf() {
+  // Refills buf from the cache at the position following the old buffer
+  // contents; returns gFalse when no more data is available.
+  int n;
+  bufPos += bufEnd - buf;
+  bufPtr = bufEnd = buf;
+  if (limited && bufPos >= start + length) {
+    return gFalse;
+  }
+  if (limited && bufPos + httpStreamBufSize > start + length) {
+    n = start + length - bufPos;
+  } else {
+    n = httpStreamBufSize;
+  }
+  // read() is called with a unit size of 1, so its result is the number of
+  // valid bytes in buf; trust that instead of the requested amount
+  n = cc->read(buf, 1, n);
+  bufEnd = buf + n;
+  return bufPtr < bufEnd;
+}
+
+void HttpStream::setPos(Guint pos, int dir) {
+  Guint size;
+  // dir >= 0: pos counts from the document start; dir < 0: from the end
+  if (dir >= 0) {
+    cc->seek(pos, SEEK_SET);
+    bufPos = pos;
+  } else {
+    cc->seek(0, SEEK_END);
+    size = (Guint)cc->tell();
+    // do not seek to before the start of the document
+    if (pos > size)
+      pos = (Guint)size;
+
+    cc->seek(-(int)pos, SEEK_END);
+    bufPos = (Guint)cc->tell();
+  }
+  // drop buffered data so the next read refills from the new position
+  bufPtr = bufEnd = buf;
+}
+
+void HttpStream::moveStart(int delta) {
+  start += delta;  // shift the stream's first byte by delta
+  bufPtr = bufEnd = buf;
+  bufPos = start;
+}
+
+#endif
+
+//------------------------------------------------------------------------
 // MemStream
 //------------------------------------------------------------------------
 
diff --git a/poppler/Stream.h b/poppler/Stream.h
index 9c0068e..7ce37d9 100644
--- a/poppler/Stream.h
+++ b/poppler/Stream.h
@@ -34,12 +34,17 @@
 #include "goo/gtypes.h"
 #include "Object.h"
 
+#ifdef ENABLE_LIBCURL
+#include "CurlCache.h"
+#endif
+
 class BaseStream;
 
 //------------------------------------------------------------------------
 
 enum StreamKind {
   strFile,
+  strHttp,
   strASCIIHex,
   strASCII85,
   strLZW,
@@ -399,6 +404,54 @@ private:
 };
 
 //------------------------------------------------------------------------
+// HttpStream
+//------------------------------------------------------------------------
+
+#ifdef ENABLE_LIBCURL
+
+#define httpStreamBufSize 1024
+
+class HttpStream: public BaseStream {
+public:
+  // Stream over ccA starting at startA; lengthA bytes long when limitedA.
+  HttpStream(CurlCache *ccA, Guint startA, GBool limitedA,
+	     Guint lengthA, Object *dictA);
+  virtual ~HttpStream();
+  virtual Stream *makeSubStream(Guint startA, GBool limitedA,
+				Guint lengthA, Object *dictA);
+  virtual StreamKind getKind() { return strHttp; }
+  virtual void reset();
+  virtual void close();
+  virtual int getChar()
+    { return (bufPtr >= bufEnd && !fillBuf()) ? EOF : (*bufPtr++ & 0xff); }
+  virtual int lookChar()
+    { return (bufPtr >= bufEnd && !fillBuf()) ? EOF : (*bufPtr & 0xff); }
+  virtual int getPos() { return bufPos + (bufPtr - buf); }
+  virtual void setPos(Guint pos, int dir = 0);
+  virtual Guint getStart() { return start; }
+  virtual void moveStart(int delta);
+  // no filtering happens at this level, so these simply forward
+  virtual int getUnfilteredChar () { return getChar(); }
+  virtual void unfilteredReset () { reset(); }
+
+private:
+  // refills buf from the cache; gFalse when nothing more can be read
+  GBool fillBuf();
+
+  CurlCache *cc;                // cache the bytes are read from
+  Guint start;                  // offset of the stream's first byte
+  GBool limited;                // whether length applies
+  Guint length;                 // stream length in bytes when limited
+  char buf[httpStreamBufSize];  // read buffer
+  char *bufPtr;                 // next unread byte in buf
+  char *bufEnd;                 // one past the last valid byte in buf
+  Guint bufPos;                 // document offset of buf[0]
+  int savePos;                  // cache position recorded by reset()
+  GBool saved;                  // gTrue while close() must restore savePos
+};
+#endif
+
+//------------------------------------------------------------------------
 // MemStream
 //------------------------------------------------------------------------
 
-- 
1.6.0.4


--------------090607010404040105030602--


More information about the poppler mailing list