[PATCH] Added rudimentary HTTP streaming support.
Stefan Thomas
thomas at eload24.com
Fri Oct 16 09:40:48 PDT 2009
Currently this is not optimized at all. It takes over a thousand HTTP requests (although libcurl does keep the connection alive at least) to render one page of a 35MB test PDF.
On the plus side, there is a cache, so blocks that have already been downloaded are not downloaded again. Also, no change to pdftoppm was necessary during testing, since PDFDoc takes care of recognizing the filename as a URL.
Note that no changes were necessary outside of poppler itself, meaning the API did not change at all. It just accepts filenames starting with http:// or https:// now.
---
CMakeLists.txt | 2 +
poppler/CurlCache.cc | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++
poppler/CurlCache.h | 97 +++++++++++++++++++++++
poppler/Makefile.am | 2 +
poppler/PDFDoc.cc | 53 +++++++++----
poppler/Stream.cc | 93 ++++++++++++++++++++++
poppler/Stream.h | 53 +++++++++++++
7 files changed, 493 insertions(+), 17 deletions(-)
create mode 100644 poppler/CurlCache.cc
create mode 100644 poppler/CurlCache.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1c609bf..fcd60e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -164,6 +164,7 @@ set(poppler_SRCS
poppler/Catalog.cc
poppler/CharCodeToUnicode.cc
poppler/CMap.cc
+ poppler/CurlCache.cc
poppler/DateInfo.cc
poppler/Decrypt.cc
poppler/Dict.cc
@@ -294,6 +295,7 @@ if(ENABLE_XPDF_HEADERS)
poppler/Catalog.h
poppler/CharCodeToUnicode.h
poppler/CMap.h
+ poppler/CurlCache.h
poppler/DateInfo.h
poppler/Decrypt.h
poppler/Dict.h
diff --git a/poppler/CurlCache.cc b/poppler/CurlCache.cc
new file mode 100644
index 0000000..0134acf
--- /dev/null
+++ b/poppler/CurlCache.cc
@@ -0,0 +1,210 @@
+//========================================================================
+//
+// CurlCache.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2009 Stefan Thomas <thomas at eload24.com>
+//
+//========================================================================
+
+#include "CurlCache.h"
+
+#ifdef ENABLE_LIBCURL
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include "Error.h"
+#include <curl/curl.h>
+
+//------------------------------------------------------------------------
+
+// Creates a cache for the document at urlA and issues a header-only
+// request to learn its length.
+//
+// urlA is stored by pointer, not copied, so it has to outlive this object.
+// size is left at -1 when the length could not be determined (no libcurl
+// handle, failed transfer, HTTP error status, or no Content-Length).
+CurlCache::CurlCache(GooString *urlA) {
+  url = urlA;
+  size = -1;
+  streamPos = 0;
+
+  long code = 0;
+  double contentLength = -1;
+
+  curl = curl_easy_init();
+  if (!curl) {
+    error(-1, "Couldn't initialize libcurl for '%s'.", url->getCString());
+    return;
+  }
+
+  curl_easy_setopt(curl, CURLOPT_URL, url->getCString());
+  // libcurl reads numeric options as long through varargs, so pass long
+  // literals rather than int.
+  curl_easy_setopt(curl, CURLOPT_HEADER, 1L);
+  curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &CurlCache::noop);
+
+  CURLcode res = curl_easy_perform(curl);
+  if (res == CURLE_OK) {
+    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
+    curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &contentLength);
+    if (code >= 400) {
+      error(-1, "Couldn't open '%s': HTTP status %d.",
+            url->getCString(), (int)code);
+      contentLength = -1;
+    }
+  } else {
+    error(-1, "Couldn't query '%s': %s.", url->getCString(),
+          curl_easy_strerror(res));
+  }
+  // Drop the header-only options so later range requests start clean.
+  curl_easy_reset(curl);
+
+  size = (long int)contentLength;
+}
+
+// Releases the libcurl easy handle obtained in the constructor.
+CurlCache::~CurlCache()
+{
+  curl_easy_cleanup(curl);
+}
+
+// Returns a newly allocated string holding the document name taken from
+// the URL: the text after the last '/' of the path, up to (not including)
+// the first '?'. The caller owns the returned string.
+GooString *CurlCache::getFileName() {
+  int len = url->getLength();
+  int nameStart = 0;
+  int nameEnd = len;
+
+  // Start at index 6 so the "//" that follows the scheme is scanned
+  // together with the path separators.
+  for (int i = 6; i < len; i++) {
+    char c = url->getChar(i);
+    if (c == '?') {
+      // The query string starts here; slashes inside it are not part of
+      // the path, so stop scanning.
+      nameEnd = i;
+      break;
+    }
+    if (c == '/') {
+      nameStart = i + 1;
+    }
+  }
+
+  // The third argument is a length, not an end index.
+  return new GooString(url, nameStart, nameEnd - nameStart);
+}
+
+// Reports the current read position, in bytes from the start of the
+// document.
+long int CurlCache::tell()
+{
+  return streamPos;
+}
+
+// Moves the read position, following the fseek() origin convention.
+// Any origin other than SEEK_SET or SEEK_CUR is treated as relative to
+// the end of the document. Always returns 0.
+int CurlCache::seek(long int offset, int origin) {
+  switch (origin) {
+  case SEEK_SET:
+    streamPos = offset;
+    break;
+  case SEEK_CUR:
+    streamPos += offset;
+    break;
+  default:
+    streamPos = size + offset;
+    break;
+  }
+  return 0;
+}
+
+// Copies up to unitsize*count bytes, starting at the current position,
+// into ptr and advances the position by the number of bytes copied.
+//
+// Returns the number of BYTES copied. This is less than unitsize*count
+// when the request runs past the end of the document or a chunk could
+// not be downloaded, and 0 when nothing is available.
+size_t CurlCache::read(void *ptr, size_t unitsize, size_t count) {
+  size_t wanted = unitsize*count;
+
+  if (streamPos < 0) {
+    return 0;
+  }
+  // When the document length is known, never read beyond it.
+  if (size >= 0) {
+    if (streamPos >= size) {
+      return 0;
+    }
+    size_t avail = (size_t)(size - streamPos);
+    if (wanted > avail) {
+      wanted = avail;
+    }
+  }
+  if (wanted == 0) {
+    // Also avoids computing a negative end block below.
+    return 0;
+  }
+
+  long int endPos = streamPos + (long int)wanted;
+  int startBlock = (int)(streamPos / curlCacheChunkSize);
+  int endBlock = (int)((endPos-1) / curlCacheChunkSize);
+
+  // Make sure data is in cache
+  loadChunks(startBlock, endBlock);
+
+  // Write data to buffer, one chunk-sized piece at a time
+  char *out = (char *)ptr;
+  size_t remaining = wanted;
+
+  while (remaining) {
+    int chunk = (int)(streamPos / curlCacheChunkSize);
+    size_t offset = (size_t)(streamPos % curlCacheChunkSize);
+    size_t len = curlCacheChunkSize - offset;
+
+    if (len > remaining)
+      len = remaining;
+
+    CurlCacheChunk &c = chunks[chunk];
+    if (c.state != cccStateLoaded) {
+      // The download did not deliver this chunk; stop instead of handing
+      // out bytes that were never received.
+      break;
+    }
+
+    memcpy(out, c.data + offset, len);
+    streamPos += (long int)len;
+    remaining -= len;
+    out += len;
+  }
+
+  return wanted - remaining;
+}
+
+// Downloads every chunk in [startBlock, endBlock] that is still in state
+// cccStateNew. Consecutive missing chunks are fetched with a single
+// range request; chunks that are already cached are skipped.
+void CurlCache::loadChunks(int startBlock, int endBlock) {
+  int i = startBlock;
+
+  while (i <= endBlock) {
+    if (chunks[i].state != cccStateNew) {
+      i++;
+      continue;
+    }
+
+    // Extend the run for as long as the next chunk is missing as well.
+    int first = i;
+    while (i < endBlock && chunks[i+1].state == cccStateNew) {
+      i++;
+    }
+
+    // The job is only needed for the duration of run(), so it lives on
+    // the stack rather than being heap-allocated and never freed.
+    CurlCacheJob job(this, first, i);
+    job.run();
+
+    i++;
+  }
+}
+
+// Write callback that discards the data it is given and reports all of
+// it as consumed.
+size_t CurlCache::noop(void *ptr, size_t size, size_t nmemb, void *ptr2)
+{
+  const size_t consumed = size*nmemb;
+  return consumed;
+}
+
+//------------------------------------------------------------------------
+
+// Describes a download of chunks startBlockA..endBlockA (inclusive) into
+// the cache ccA. Nothing is transferred until run() is called.
+CurlCacheJob::CurlCacheJob(CurlCache *ccA, int startBlockA, int endBlockA)
+  : cc(ccA),
+    startBlock(startBlockA),
+    endBlock(endBlockA)
+{
+}
+
+// Performs one HTTP range request covering chunks startBlock..endBlock
+// and stores the received bytes in the owning cache through write().
+void CurlCacheJob::run() {
+  if (!cc->curl) {
+    return;
+  }
+
+  long fromByte = (long)startBlock * curlCacheChunkSize;
+  long toByte = ((long)(endBlock+1) * curlCacheChunkSize)-1;
+
+  // Clamp to the last byte of the document, but only when its length is
+  // known; a negative size means "unknown".
+  if (cc->size >= 0 && toByte > cc->size-1) {
+    toByte = cc->size-1;
+  }
+  if (toByte < fromByte) {
+    // Nothing to request (range lies entirely past the end).
+    return;
+  }
+
+  GooString *range = GooString::format("{0:ud}-{1:ud}",
+                                       (Guint)fromByte, (Guint)toByte);
+
+  // write() continues from this offset as data arrives.
+  currentByte = (int)fromByte;
+
+  curl_easy_setopt(cc->curl, CURLOPT_URL, cc->url->getCString());
+  curl_easy_setopt(cc->curl, CURLOPT_WRITEFUNCTION, &CurlCacheJob::write);
+  curl_easy_setopt(cc->curl, CURLOPT_WRITEDATA, this);
+  curl_easy_setopt(cc->curl, CURLOPT_RANGE, range->getCString());
+
+  CURLcode res = curl_easy_perform(cc->curl);
+  if (res != CURLE_OK) {
+    error(-1, "Couldn't download bytes %s of '%s': %s.",
+          range->getCString(), cc->url->getCString(),
+          curl_easy_strerror(res));
+  }
+  curl_easy_reset(cc->curl);
+
+  // GooString::format() returns a new string that the caller must free.
+  delete range;
+}
+
+// libcurl write callback: stores the size*nmemb received bytes in the
+// cache, starting at ccj->currentByte and splitting them at chunk
+// boundaries. Every chunk that receives data is marked cccStateLoaded.
+// Returns the number of bytes taken, which is always all of them.
+size_t CurlCacheJob::write(void *ptr, size_t size, size_t nmemb, CurlCacheJob *ccj) {
+  const size_t total = size*nmemb;
+  const char *src = (const char *)ptr;
+
+  for (size_t left = total; left > 0; ) {
+    int chunk = ccj->currentByte / curlCacheChunkSize;
+    int offset = ccj->currentByte % curlCacheChunkSize;
+
+    // Never write across the end of the current chunk.
+    size_t len = curlCacheChunkSize-offset;
+    if (len > left) {
+      len = left;
+    }
+
+    CurlCacheChunk &dst = ccj->cc->chunks[chunk];
+    memcpy(dst.data + offset, src, len);
+    dst.state = cccStateLoaded;
+
+    ccj->currentByte += len;
+    src += len;
+    left -= len;
+  }
+
+  return total;
+}
+
+#endif
diff --git a/poppler/CurlCache.h b/poppler/CurlCache.h
new file mode 100644
index 0000000..20eda89
--- /dev/null
+++ b/poppler/CurlCache.h
@@ -0,0 +1,97 @@
+//========================================================================
+//
+// CurlCache.h
+//
+// Caching wrapper around curl.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2009 Stefan Thomas <thomas at eload24.com>
+//
+//========================================================================
+
+#ifndef CURLCACHE_H
+#define CURLCACHE_H
+
+#include <config.h>
+
+#ifdef ENABLE_LIBCURL
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "poppler-config.h"
+#include "goo/gtypes.h"
+#include "goo/GooString.h"
+
+#include <curl/curl.h>
+
+#include <map>
+
+//------------------------------------------------------------------------
+
+#define curlCacheChunkSize 8192
+
+// Download state of one cache chunk.
+enum CurlCacheChunkState {
+  // Nothing has been written to the chunk yet.
+  cccStateNew,
+
+  // If we want this to be thread-safe, concurrent, whatever, we need
+  // another state here:
+  //   cccStateLoading,
+
+  // A download has written data into the chunk.
+  cccStateLoaded
+};
+
+// One fixed-size piece of the remote document together with its
+// download state.
+struct CurlCacheChunk {
+  CurlCacheChunkState state;
+  char data[curlCacheChunkSize];
+};
+
+// File-like (tell/seek/read) access to a document fetched over HTTP with
+// libcurl. Downloaded data is kept in fixed-size chunks so that a byte
+// range is requested from the server only once.
+class CurlCache {
+public:
+
+  friend class CurlCacheJob;
+
+  // urlA is stored by pointer and must outlive the cache.
+  CurlCache(GooString *urlA);
+  ~CurlCache();
+
+  // Returns a new string with the document name taken from the URL.
+  GooString *getFileName();
+
+  // Stream-style access; origin takes SEEK_SET / SEEK_CUR / SEEK_END.
+  long int tell();
+  int seek(long int offset, int origin);
+  size_t read(void * ptr, size_t unitsize, size_t count);
+
+  // Downloads the chunks in [startBlock, endBlock] that are not cached yet.
+  void loadChunks(int startBlock, int endBlock);
+
+private:
+
+  // Not copyable: the destructor releases the libcurl handle, so a copy
+  // would release the same handle a second time. Declared, not defined.
+  CurlCache(const CurlCache &);
+  CurlCache &operator=(const CurlCache &);
+
+  CURL *curl;          // libcurl easy handle, reused for every request
+  GooString *url;      // document URL (not owned)
+  long int size;       // document length in bytes as reported by the server
+  long int streamPos;  // current read position
+
+  // Chunk index -> chunk; entries are created on first access.
+  std::map<unsigned, CurlCacheChunk> chunks;
+
+  // Write callback that discards the data it receives.
+  static size_t noop(void *ptr, size_t size, size_t nmemb, void *ptr2);
+
+};
+
+// A single range download that fills a run of consecutive chunks of a
+// CurlCache.
+class CurlCacheJob {
+public:
+
+  // Prepares a download of chunks startBlockA..endBlockA (inclusive).
+  CurlCacheJob(CurlCache *ccA, int startBlockA, int endBlockA);
+
+  // Performs the request and stores the data in the cache.
+  void run();
+
+private:
+
+  CurlCache *cc;    // cache that receives the data
+  int startBlock;   // first chunk to fetch
+  int endBlock;     // last chunk to fetch
+  int currentByte;  // document offset of the next byte to be stored
+
+  // libcurl write callback; ccj is the job passed as CURLOPT_WRITEDATA.
+  static size_t write(void *ptr, size_t size, size_t nmemb, CurlCacheJob *ccj);
+
+};
+
+#endif
+
+#endif
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 24a22a4..883bb29 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -173,6 +173,7 @@ poppler_include_HEADERS = \
Catalog.h \
CharCodeToUnicode.h \
CMap.h \
+ CurlCache.h \
DateInfo.h \
Decrypt.h \
Dict.h \
@@ -244,6 +245,7 @@ libpoppler_la_SOURCES = \
Catalog.cc \
CharCodeToUnicode.cc \
CMap.cc \
+ CurlCache.cc \
DateInfo.cc \
Decrypt.cc \
Dict.cc \
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 4af402b..4180956 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -58,6 +58,9 @@
#include "Parser.h"
#include "SecurityHandler.h"
#include "Decrypt.h"
+#ifdef ENABLE_LIBCURL
+#include "CurlCache.h"
+#endif
#ifndef DISABLE_OUTLINE
#include "Outline.h"
#endif
@@ -89,28 +92,44 @@ PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword,
outline = NULL;
#endif
- fileName = fileNameA;
+ if (fileNameA->cmpN("http://", 7) == 0 || fileNameA->cmpN("https://", 8) == 0) {
+#ifndef ENABLE_LIBCURL
+ error(-1, "Couldn't open file '%s': HTTP support not compiled in.", fileNameA->getCString());
+ errCode = errOpenFile;
+ return;
+#else
+ CurlCache *cc = new CurlCache(fileNameA);
+
+ fileName = cc->getFileName();
+
+ // create streamObject obj;
+ obj.initNull();
+ str = new HttpStream(cc, 0, gFalse, 0, &obj);
+#endif
+ } else {
+ fileName = fileNameA;
- // try to open file
+ // try to open file
#ifdef VMS
- file = fopen(fileName->getCString(), "rb", "ctx=stm");
+ file = fopen(fileName->getCString(), "rb", "ctx=stm");
#else
- file = fopen(fileName->getCString(), "rb");
+ file = fopen(fileName->getCString(), "rb");
#endif
- if (file == NULL) {
- // fopen() has failed.
- // Keep a copy of the errno returned by fopen so that it can be
- // referred to later.
- fopenErrno = errno;
- error(-1, "Couldn't open file '%s': %s.", fileName->getCString(),
- strerror(errno));
- errCode = errOpenFile;
- return;
- }
+ if (file == NULL) {
+ // fopen() has failed.
+ // Keep a copy of the errno returned by fopen so that it can be
+ // referred to later.
+ fopenErrno = errno;
+ error(-1, "Couldn't open file '%s': %s.", fileName->getCString(),
+ strerror(errno));
+ errCode = errOpenFile;
+ return;
+ }
- // create stream
- obj.initNull();
- str = new FileStream(file, 0, gFalse, 0, &obj);
+ // create stream
+ obj.initNull();
+ str = new FileStream(file, 0, gFalse, 0, &obj);
+ }
ok = setup(ownerPassword, userPassword);
}
diff --git a/poppler/Stream.cc b/poppler/Stream.cc
index 01efcd6..ce2905b 100644
--- a/poppler/Stream.cc
+++ b/poppler/Stream.cc
@@ -789,6 +789,99 @@ void FileStream::moveStart(int delta) {
}
//------------------------------------------------------------------------
+// HttpStream
+//------------------------------------------------------------------------
+
+#ifdef ENABLE_LIBCURL
+
+// Builds a stream over the cache ccA that begins at byte startA. When
+// limitedA is true, only lengthA bytes are readable. The buffer starts
+// out empty.
+HttpStream::HttpStream(CurlCache *ccA, Guint startA, GBool limitedA,
+                       Guint lengthA, Object *dictA)
+  : BaseStream(dictA),
+    cc(ccA),
+    start(startA),
+    limited(limitedA),
+    length(lengthA),
+    bufPtr(buf),
+    bufEnd(buf),
+    bufPos(startA),
+    savePos(0),
+    saved(gFalse)
+{
+}
+
+// Closing restores the cache position saved by reset(), if any.
+HttpStream::~HttpStream()
+{
+  close();
+}
+
+// Creates a new stream over the same cache with its own start, limit
+// and dictionary.
+Stream *HttpStream::makeSubStream(Guint startA, GBool limitedA,
+                                  Guint lengthA, Object *dictA) {
+  HttpStream *sub = new HttpStream(cc, startA, limitedA, lengthA, dictA);
+  return sub;
+}
+
+// Rewinds to the start of this stream. The cache position in effect
+// beforehand is remembered so that close() can restore it.
+void HttpStream::reset() {
+  // Must be captured before seeking.
+  savePos = (Guint)cc->tell();
+  saved = gTrue;
+
+  cc->seek(start, SEEK_SET);
+
+  // Empty the buffer; the next read refills it from 'start'.
+  bufPos = start;
+  bufPtr = bufEnd = buf;
+}
+
+// Puts the cache position back to where it was before reset(). Does
+// nothing when no position has been saved.
+void HttpStream::close() {
+  if (!saved) {
+    return;
+  }
+  cc->seek(savePos, SEEK_SET);
+  saved = gFalse;
+}
+
+// Refills the buffer with the bytes that follow the current buffer
+// contents. Returns gFalse when no further data is available: either the
+// stream's length limit has been reached or the cache delivered nothing.
+GBool HttpStream::fillBuf() {
+  int n;
+
+  // Account for the buffer that has just been consumed.
+  bufPos += bufEnd - buf;
+  bufPtr = bufEnd = buf;
+  if (limited && bufPos >= start + length) {
+    return gFalse;
+  }
+  if (limited && bufPos + httpStreamBufSize > start + length) {
+    // Only the tail of a limited stream is left.
+    n = start + length - bufPos;
+  } else {
+    n = httpStreamBufSize;
+  }
+
+  // Use the number of bytes actually delivered, so a short read is not
+  // mistaken for a full buffer of valid data.
+  size_t got = cc->read(buf, 1, n);
+  if (got > (size_t)n) {
+    got = (size_t)n;
+  }
+  bufEnd = buf + got;
+  if (bufPtr >= bufEnd) {
+    return gFalse;
+  }
+  return gTrue;
+}
+
+// Repositions the stream and empties the buffer. With dir >= 0, pos is
+// an offset from the beginning of the document; with dir < 0 it is
+// counted back from the end and limited to the document size.
+void HttpStream::setPos(Guint pos, int dir) {
+  if (dir < 0) {
+    // Find the document size by seeking to its end.
+    cc->seek(0, SEEK_END);
+    Guint total = (Guint)cc->tell();
+
+    if (pos > total) {
+      pos = total;
+    }
+
+    cc->seek(-(int)pos, SEEK_END);
+    bufPos = (Guint)cc->tell();
+  } else {
+    cc->seek(pos, SEEK_SET);
+    bufPos = pos;
+  }
+
+  bufPtr = bufEnd = buf;
+}
+
+// Shifts the start of the stream by delta bytes and empties the buffer
+// so that reading resumes from the new start.
+void HttpStream::moveStart(int delta) {
+  start += delta;
+  bufPos = start;
+  bufPtr = bufEnd = buf;
+}
+
+#endif
+
+//------------------------------------------------------------------------
// MemStream
//------------------------------------------------------------------------
diff --git a/poppler/Stream.h b/poppler/Stream.h
index 9c0068e..7ce37d9 100644
--- a/poppler/Stream.h
+++ b/poppler/Stream.h
@@ -34,12 +34,17 @@
#include "goo/gtypes.h"
#include "Object.h"
+#ifdef ENABLE_LIBCURL
+#include "CurlCache.h"
+#endif
+
class BaseStream;
//------------------------------------------------------------------------
enum StreamKind {
strFile,
+ strHttp,
strASCIIHex,
strASCII85,
strLZW,
@@ -399,6 +404,54 @@ private:
};
//------------------------------------------------------------------------
+// HttpStream
+//------------------------------------------------------------------------
+
+#ifdef ENABLE_LIBCURL
+
+#define httpStreamBufSize 1024
+
+// Base stream that reads a document through a CurlCache instead of a
+// local file, buffering httpStreamBufSize bytes at a time.
+class HttpStream: public BaseStream {
+public:
+
+  HttpStream(CurlCache *ccA, Guint startA, GBool limitedA,
+             Guint lengthA, Object *dictA);
+  virtual ~HttpStream();
+  virtual Stream *makeSubStream(Guint startA, GBool limitedA,
+                                Guint lengthA, Object *dictA);
+  virtual StreamKind getKind() { return strHttp; }
+  virtual void reset();
+  virtual void close();
+
+  // Returns the next byte and consumes it, or EOF when no data is left.
+  virtual int getChar() {
+    if (bufPtr >= bufEnd && !fillBuf()) {
+      return EOF;
+    }
+    return *bufPtr++ & 0xff;
+  }
+
+  // Returns the next byte without consuming it, or EOF when no data is
+  // left.
+  virtual int lookChar() {
+    if (bufPtr >= bufEnd && !fillBuf()) {
+      return EOF;
+    }
+    return *bufPtr & 0xff;
+  }
+
+  // Position of the next byte to be read.
+  virtual int getPos() { return bufPos + (bufPtr - buf); }
+  virtual void setPos(Guint pos, int dir = 0);
+  virtual Guint getStart() { return start; }
+  virtual void moveStart(int delta);
+
+  virtual int getUnfilteredChar () { return getChar(); }
+  virtual void unfilteredReset () { reset(); }
+
+private:
+
+  // Loads the next buffer of data; gFalse when there is none.
+  GBool fillBuf();
+
+  CurlCache *cc;                // source of the data
+  Guint start;                  // offset of the first byte of the stream
+  GBool limited;                // whether 'length' restricts the stream
+  Guint length;                 // readable bytes when limited
+  char buf[httpStreamBufSize];  // read buffer
+  char *bufPtr;                 // next unread byte in buf
+  char *bufEnd;                 // one past the last valid byte in buf
+  Guint bufPos;                 // document offset of buf[0]
+  int savePos;                  // cache position recorded by reset()
+  GBool saved;                  // whether savePos is valid
+};
+#endif
+
+//------------------------------------------------------------------------
// MemStream
//------------------------------------------------------------------------
--
1.6.0.4
--------------090607010404040105030602--
More information about the poppler
mailing list