[poppler] 6 commits - CMakeLists.txt goo/gbase64.cc goo/gbase64.h goo/gbasename.cc goo/gbasename.h utils/CMakeLists.txt utils/HtmlOutputDev.cc utils/HtmlOutputDev.h utils/InMemoryFile.cc utils/InMemoryFile.h utils/pdfsig.cc utils/pdftohtml.1 utils/pdftohtml.cc
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Sun Feb 10 14:49:54 UTC 2019
CMakeLists.txt | 2
goo/gbase64.cc | 50 ++++++++++++++++++++++
goo/gbase64.h | 28 ++++++++++++
goo/gbasename.cc | 51 ++++++++++++++++++++++
goo/gbasename.h | 22 +++++++++
utils/CMakeLists.txt | 1
utils/HtmlOutputDev.cc | 110 +++++++++++++++++++++----------------------------
utils/HtmlOutputDev.h | 22 ++++-----
utils/InMemoryFile.cc | 75 +++++++++++++++++++++++++++++++++
utils/InMemoryFile.h | 51 ++++++++++++++++++++++
utils/pdfsig.cc | 6 --
utils/pdftohtml.1 | 3 +
utils/pdftohtml.cc | 49 +++++++++++++++------
13 files changed, 377 insertions(+), 93 deletions(-)
New commits:
commit 5f6ff67b0e1dc075d737fc840642c292329dcd08
Author: Albert Astals Cid <aacid at kde.org>
Date: Sun Feb 10 15:32:26 2019 +0100
pdftohtml: Add -dataurls to man page
diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1
index 5d711ba9..ea386ffd 100644
--- a/utils/pdftohtml.1
+++ b/utils/pdftohtml.1
@@ -43,6 +43,9 @@ generate complex output
.B \-s
generate single HTML that includes all pages
.TP
+.B \-dataurls
+use data URLs instead of external images in HTML. Not available on all platforms
+.TP
.B \-i
ignore images
.TP
commit 7b8dbc0a4dc8e0738658b8e4fe7c44adad15af24
Author: Greg Knight <lyngvi at gmail.com>
Date: Fri Nov 23 22:30:12 2018 -0500
pdftohtml: singleHtml and stout are not mutually exclusive. with -dataurls is actually quite reasonable.
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index 864ece99..6218ff3c 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -318,7 +318,7 @@ int main(int argc, char *argv[]) {
if (scale>3.0) scale=3.0;
if (scale<0.5) scale=0.5;
- if (complexMode || singleHtml) {
+ if (complexMode) {
//noframes=false;
stout=false;
}
@@ -326,7 +326,6 @@ int main(int argc, char *argv[]) {
if (stout) {
noframes=true;
complexMode=false;
- singleHtml=false;
}
if (xml)
commit 91ab53fa635e9ea964f10e9a6681d04d7185c732
Author: Greg Knight <lyngvi at gmail.com>
Date: Fri Nov 23 19:53:38 2018 -0500
pdftohtml: add support for dataUrls argument
eliminate the 'extension' field used to regenerate background images; replace with a list of background images
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 69a93724..1b078003 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -60,6 +60,8 @@
#include "goo/GooString.h"
#include "goo/gbasename.h"
#include "goo/GooList.h"
+#include "goo/gbase64.h"
+#include "goo/gbasename.h"
#include "UnicodeMap.h"
#include "goo/gmem.h"
#include "Error.h"
@@ -71,6 +73,7 @@
#include "HtmlOutputDev.h"
#include "HtmlFonts.h"
#include "HtmlUtils.h"
+#include "InMemoryFile.h"
#include "Outline.h"
#include "PDFDoc.h"
@@ -102,6 +105,7 @@ static inline bool IS_CLOSER(float x, float y, float z) { return fabs((x)-(y)) <
extern bool complexMode;
extern bool singleHtml;
+extern bool dataUrls;
extern bool ignore;
extern bool printCommands;
extern bool printHtml;
@@ -267,7 +271,7 @@ void HtmlString::endString()
// HtmlPage
//------------------------------------------------------------------------
-HtmlPage::HtmlPage(bool rawOrder, const char *imgExtVal) {
+HtmlPage::HtmlPage(bool rawOrder) {
this->rawOrder = rawOrder;
curStr = nullptr;
yxStrings = nullptr;
@@ -281,7 +285,6 @@ HtmlPage::HtmlPage(bool rawOrder, const char *imgExtVal) {
fontsPageMarker = 0;
DocName=nullptr;
firstPage = -1;
- imgExt = new GooString(imgExtVal);
}
HtmlPage::~HtmlPage() {
@@ -289,7 +292,6 @@ HtmlPage::~HtmlPage() {
delete DocName;
delete fonts;
delete links;
- delete imgExt;
deleteGooList<HtmlImage>(imgList);
}
@@ -849,14 +851,12 @@ int HtmlPage::dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page)
return 0;
}
-void HtmlPage::dumpComplex(FILE *file, int page){
+void HtmlPage::dumpComplex(FILE *file, int page, const std::vector<std::string>& backgroundImages) {
FILE* pageFile;
if( firstPage == -1 ) firstPage = page;
if (dumpComplexHeaders(file, pageFile, page)) { error(errIO, -1, "Couldn't write headers."); return; }
-
- const std::string str = gbasename(DocName->c_str());
fputs("<style type=\"text/css\">\n<!--\n",pageFile);
fputs("\tp {margin: 0; padding: 0;}",pageFile);
@@ -880,12 +880,11 @@ void HtmlPage::dumpComplex(FILE *file, int page){
fprintf(pageFile,"<div id=\"page%d-div\" style=\"position:relative;width:%dpx;height:%dpx;\">\n",
page, pageWidth, pageHeight);
- if( !ignore )
+ if(!ignore && (size_t) (page - firstPage) < backgroundImages.size())
{
fprintf(pageFile,
- "<img width=\"%d\" height=\"%d\" src=\"%s%03d.%s\" alt=\"background image\"/>\n",
- pageWidth, pageHeight, str.c_str(),
- (page-firstPage+1), imgExt->c_str());
+ "<img width=\"%d\" height=\"%d\" src=\"%s\" alt=\"background image\"/>\n",
+ pageWidth, pageHeight, backgroundImages[page - firstPage].c_str());
}
for(HtmlString *tmp1=yxStrings;tmp1;tmp1=tmp1->yxNext){
@@ -915,12 +914,12 @@ void HtmlPage::dumpComplex(FILE *file, int page){
}
-void HtmlPage::dump(FILE *f, int pageNum)
+void HtmlPage::dump(FILE *f, int pageNum, const std::vector<std::string>& backgroundImages)
{
if (complexMode || singleHtml)
{
if (xml) dumpAsXML(f, pageNum);
- if (!xml) dumpComplex(f, pageNum);
+ if (!xml) dumpComplex(f, pageNum, backgroundImages);
}
else
{
@@ -1083,7 +1082,6 @@ void HtmlOutputDev::doFrame(int firstPage){
HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char *title,
const char *author, const char *keywords, const char *subject, const char *date,
- const char *extension,
bool rawOrder, int firstPage, bool outline)
{
catalog = catalogA;
@@ -1099,7 +1097,7 @@ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char
//pageNum=firstPage;
// open file
needClose = false;
- pages = new HtmlPage(rawOrder, extension);
+ pages = new HtmlPage(rawOrder);
glMetaVars = new GooList();
glMetaVars->push_back(new HtmlMetaVar("generator", "pdftohtml 0.36"));
@@ -1107,7 +1105,7 @@ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char
if( keywords ) glMetaVars->push_back(new HtmlMetaVar("keywords", keywords));
if( date ) glMetaVars->push_back(new HtmlMetaVar("date", date));
if( subject ) glMetaVars->push_back(new HtmlMetaVar("subject", subject));
-
+
maxPageWidth = 0;
maxPageHeight = 0;
@@ -1272,7 +1270,7 @@ void HtmlOutputDev::endPage() {
pages->conv();
pages->coalesce();
- pages->dump(page, pageNum);
+ pages->dump(page, pageNum, backgroundImages);
// I don't yet know what to do in the case when there are pages of different
// sizes and we want complex output: running ghostscript many times
@@ -1284,6 +1282,10 @@ void HtmlOutputDev::endPage() {
if(!stout && !globalParams->getErrQuiet()) printf("Page-%d\n",(pageNum));
}
+// Appends a copy of img to the list of background image sources that is
+// handed to HtmlPage::dump() from endPage().
+void HtmlOutputDev::addBackgroundImage(const std::string& img) {
+  backgroundImages.emplace_back(img);
+}
+
void HtmlOutputDev::updateFont(GfxState *state) {
pages->updateFont(state);
}
@@ -1309,12 +1311,14 @@ void HtmlOutputDev::drawChar(GfxState *state, double x, double y,
void HtmlOutputDev::drawJpegImage(GfxState *state, Stream *str)
{
- FILE *f1;
+ InMemoryFile ims;
+ FILE *f1 = nullptr;
int c;
// open the image file
- GooString *fName=createImageFileName("jpg");
- if (!(f1 = fopen(fName->c_str(), "wb"))) {
+ GooString *fName = createImageFileName("jpg");
+ f1 = dataUrls ? ims.open("wb") : fopen(fName->c_str(), "wb");
+ if (!f1) {
error(errIO, -1, "Couldn't open image file '{0:t}'", fName);
delete fName;
return;
@@ -1330,9 +1334,11 @@ void HtmlOutputDev::drawJpegImage(GfxState *state, Stream *str)
fclose(f1);
- if (fName) {
- pages->addImage(fName, state);
+ if (dataUrls) {
+ delete fName;
+ fName = new GooString(std::string("data:image/jpeg;base64,") + gbase64Encode(ims.getBuffer()));
}
+ pages->addImage(fName, state);
}
void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int height,
@@ -1340,6 +1346,7 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he
{
#ifdef ENABLE_LIBPNG
FILE *f1;
+ InMemoryFile ims;
if (!colorMap && !isMask) {
error(errInternal, -1, "Can't have color image without a color map");
@@ -1348,7 +1355,8 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he
// open the image file
GooString *fName=createImageFileName("png");
- if (!(f1 = fopen(fName->c_str(), "wb"))) {
+ f1 = dataUrls ? ims.open("wb") : fopen(fName->c_str(), "wb");
+ if (!f1) {
error(errIO, -1, "Couldn't open image file '{0:t}'", fName);
delete fName;
return;
@@ -1453,6 +1461,10 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he
delete writer;
fclose(f1);
+ if (dataUrls) {
+ delete fName;
+ fName = new GooString(std::string("data:image/png;base64,") + gbase64Encode(ims.getBuffer()));
+ }
pages->addImage(fName, state);
#else
return;
@@ -1461,16 +1473,7 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he
GooString *HtmlOutputDev::createImageFileName(const char *ext)
{
- GooString *fName=new GooString(Docname);
- fName->append("-");
- GooString *pgNum= GooString::fromInt(pageNum);
- GooString *imgnum= GooString::fromInt(pages->getNumImages()+1);
-
- fName->append(pgNum)->append("_")->append(imgnum)->append(".")->append(ext);
- delete pgNum;
- delete imgnum;
-
- return fName;
+ return GooString::format("{0:s}-{1:d}_{2:d}.{3:s}", Docname->c_str(), pageNum, pages->getNumImages() + 1, ext);
}
void HtmlOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h
index a6866295..7f09c056 100644
--- a/utils/HtmlOutputDev.h
+++ b/utils/HtmlOutputDev.h
@@ -36,6 +36,7 @@
#include <stdio.h>
#include "goo/GooList.h"
+#include "goo/gbasename.h"
#include "GfxFont.h"
#include "OutputDev.h"
#include "HtmlLinks.h"
@@ -63,7 +64,6 @@ enum UnicodeTextDirection {
textDirTopBottom
};
-
class HtmlString {
public:
@@ -116,7 +116,7 @@ class HtmlPage {
public:
// Constructor.
- HtmlPage(bool rawOrder, const char *imgExtVal);
+ HtmlPage(bool rawOrder);
// Destructor.
~HtmlPage();
@@ -159,7 +159,7 @@ public:
// number of images on the current page
int getNumImages() { return imgList->getLength(); }
- void dump(FILE *f, int pageNum);
+ void dump(FILE *f, int pageNum, const std::vector<std::string>& backgroundImages);
// Clear the page.
void clear();
@@ -179,7 +179,7 @@ private:
void setDocName(const char* fname);
void dumpAsXML(FILE* f,int page);
- void dumpComplex(FILE* f, int page);
+ void dumpComplex(FILE* f, int page, const std::vector<std::string>& backgroundImages);
int dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page);
// marks the position of the fonts that belong to current page (for noframes)
@@ -189,7 +189,6 @@ private:
GooList *imgList;
GooString *DocName;
- GooString *imgExt;
int pageWidth;
int pageHeight;
int firstPage; // used to begin the numeration of pages
@@ -234,7 +233,6 @@ public:
const char *keywords,
const char *subject,
const char *date,
- const char *extension,
bool rawOrder,
int firstPage = 1,
bool outline = 0);
@@ -283,6 +281,10 @@ public:
// End a page.
void endPage() override;
+ // Add a background image to the list of background images,
+ // as this seems to be done outside other processing. img is passed by
+ // const reference and the implementation stores its own copy, so the
+ // caller keeps ownership of the string it passes in.
+ void addBackgroundImage(const std::string& img);
+
//----- update text state
void updateFont(GfxState *state) override;
@@ -345,6 +347,7 @@ private:
GooList *glMetaVars;
Catalog *catalog;
Page *docPage;
+ std::vector<std::string> backgroundImages;
friend class HtmlPage;
};
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index bbd98237..864ece99 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -46,6 +46,8 @@
#include <time.h>
#include "parseargs.h"
#include "goo/GooString.h"
+#include "goo/gbase64.h"
+#include "goo/gbasename.h"
#include "goo/gmem.h"
#include "Object.h"
#include "Stream.h"
@@ -68,6 +70,7 @@
#include "DateInfo.h"
#include "goo/gfile.h"
#include "Win32Console.h"
+#include "InMemoryFile.h"
static int firstPage = 1;
static int lastPage = 0;
@@ -77,6 +80,7 @@ static bool printHelp = false;
bool printHtml = false;
bool complexMode=false;
bool singleHtml=false; // singleHtml
+bool dataUrls = false;
bool ignore=false;
static char extension[5]="png";
static double scale=1.5;
@@ -123,6 +127,10 @@ static const ArgDesc argDesc[] = {
"generate complex document"},
{"-s", argFlag, &singleHtml, 0,
"generate single document that includes all pages"},
+#ifdef HAVE_IN_MEMORY_FILE
+ {"-dataurls", argFlag, &dataUrls, 0,
+ "use data URLs instead of external images in HTML"},
+#endif
{"-i", argFlag, &ignore, 0,
"ignore images"},
{"-noframes", argFlag, &noframes, 0,
@@ -366,7 +374,6 @@ int main(int argc, char *argv[]) {
keywords ? keywords->c_str() : nullptr,
subject ? subject->c_str() : nullptr,
date ? date->c_str() : nullptr,
- extension,
rawOrder,
firstPage,
doOutline);
@@ -387,13 +394,6 @@ int main(int argc, char *argv[]) {
{
delete date;
}
-
- if (htmlOut->isOk())
- {
- doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0,
- true, false, false);
- htmlOut->dumpDocOutline(doc);
- }
if ((complexMode || singleHtml) && !xml && !ignore) {
#ifdef HAVE_SPLASH
@@ -409,6 +409,7 @@ int main(int argc, char *argv[]) {
splashOut->startDoc(doc);
for (int pg = firstPage; pg <= lastPage; ++pg) {
+ InMemoryFile imf;
doc->displayPage(splashOut, pg,
72 * scale, 72 * scale,
0, true, false, false);
@@ -416,10 +417,22 @@ int main(int argc, char *argv[]) {
imgFileName = GooString::format("{0:s}{1:03d}.{2:s}",
htmlFileName->c_str(), pg, extension);
-
- bitmap->writeImgFile(format, imgFileName->c_str(),
- 72 * scale, 72 * scale);
-
+ auto f1 = dataUrls ? imf.open("wb") : fopen(imgFileName->c_str(), "wb");
+ if (!f1) {
+ fprintf(stderr, "Could not open %s\n", imgFileName->c_str());
+ delete imgFileName;
+ continue;
+ }
+ bitmap->writeImgFile(format, f1, 72 * scale, 72 * scale);
+ fclose(f1);
+ if (dataUrls) {
+ htmlOut->addBackgroundImage(
+ std::string((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64,") +
+ gbase64Encode(imf.getBuffer())
+ );
+ } else {
+ htmlOut->addBackgroundImage(gbasename(imgFileName->c_str()));
+ }
delete imgFileName;
}
@@ -434,7 +447,14 @@ int main(int argc, char *argv[]) {
return -1;
#endif
}
-
+
+ if (htmlOut->isOk())
+ {
+ doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0,
+ true, false, false);
+ htmlOut->dumpDocOutline(doc);
+ }
+
delete htmlOut;
exit_status = EXIT_SUCCESS;
commit 44da4d785cffeb5d4bbb1460479add6ce01edea2
Author: Greg Knight <lyngvi at gmail.com>
Date: Sun Feb 10 10:31:36 2019 +0100
Introduce gbase64
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74294ca8..bf187ab4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -314,6 +314,7 @@ configure_file(config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
configure_file(poppler/poppler-config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h)
set(poppler_SRCS
+ goo/gbase64.cc
goo/gbasename.cc
goo/gfile.cc
goo/GooTimer.cc
diff --git a/goo/gbase64.cc b/goo/gbase64.cc
new file mode 100644
index 00000000..e0da77ff
--- /dev/null
+++ b/goo/gbase64.cc
@@ -0,0 +1,50 @@
+//========================================================================
+//
+// gbase64.cc
+//
+// Implementation of a base64 encoder, because another one did not immediately
+// avail itself.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#include "gbase64.h"
+#include <sstream>
+
+// Encodes the three bytes a, b and c as four characters of the standard
+// base64 alphabet. Exactly four chars are stored in output; neither '='
+// padding nor a terminating NUL is written here.
+static void b64encodeTriplet(char output[4], unsigned char a, unsigned char b, unsigned char c)
+{
+  static const char base64table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+  output[0] = base64table[(a >> 2) & 0x3f];                       // upper 6 of first byte
+  output[1] = base64table[((a << 4) & 0x30) | ((b >> 4) & 0x0f)]; // lower 2 of first byte, upper 4 of second byte
+  output[2] = base64table[((b << 2) & 0x3c) | ((c >> 6) & 0x03)]; // lower 4 of second byte, upper 2 of third byte
+  output[3] = base64table[c & 0x3f];                              // lower 6 of third byte
+}
+
+// Returns the base64 encoding of the len bytes starting at input.
+// The result is '='-padded to a multiple of four characters and contains no
+// line breaks. A len of 0 yields an empty string (input is not read then).
+std::string gbase64Encode(const void* input, size_t len)
+{
+  std::string encoded;
+  // Every started group of three input bytes becomes four output characters,
+  // so one up-front reservation avoids repeated growth while appending.
+  encoded.reserve((len / 3 + 1) * 4);
+
+  const unsigned char* bytes = static_cast<const unsigned char*>(input);
+  char quad[4];
+  size_t pos = 0;
+
+  // Full three-byte groups.
+  while (len - pos >= 3) {
+    b64encodeTriplet(quad, bytes[pos], bytes[pos + 1], bytes[pos + 2]);
+    encoded.append(quad, 4);
+    pos += 3;
+  }
+
+  // Trailing one or two bytes: encode with zero fill, then overwrite the
+  // characters that carry no input bits with '=' padding.
+  switch (len - pos) {
+  case 1:
+    b64encodeTriplet(quad, bytes[pos], 0, 0);
+    quad[2] = quad[3] = '=';
+    encoded.append(quad, 4);
+    break;
+  case 2:
+    b64encodeTriplet(quad, bytes[pos], bytes[pos + 1], 0);
+    quad[3] = '=';
+    encoded.append(quad, 4);
+    break;
+  }
+  return encoded;
+}
diff --git a/goo/gbase64.h b/goo/gbase64.h
new file mode 100644
index 00000000..06e2e8b6
--- /dev/null
+++ b/goo/gbase64.h
@@ -0,0 +1,28 @@
+//========================================================================
+//
+// gbase64.h
+//
+// Implementation of a base64 encoder, because another one did not immediately
+// avail itself.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#ifndef GOO_GBASE64_H
+#define GOO_GBASE64_H
+
+#include <string>
+#include <vector>
+
+// Base64-encodes the sz bytes starting at input.
+std::string gbase64Encode(const void* input, size_t sz);
+
+// Convenience overload for a byte vector; an empty vector yields an empty string.
+inline std::string gbase64Encode(const std::vector<char>& input)
+{
+  if (input.empty())
+    return std::string();
+  return gbase64Encode(input.data(), input.size());
+}
+
+// Same as above for vectors of unsigned char.
+inline std::string gbase64Encode(const std::vector<unsigned char>& input)
+{
+  if (input.empty())
+    return std::string();
+  return gbase64Encode(input.data(), input.size());
+}
+
+#endif // ndef GOO_GBASE64_H
commit 2ba81611e9ccdcb49275ee247308bd0dcba3e64d
Author: Greg Knight <lyngvi at gmail.com>
Date: Sun Feb 10 10:28:26 2019 +0100
Introduce gbasename
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d49721c..74294ca8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -314,6 +314,7 @@ configure_file(config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
configure_file(poppler/poppler-config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h)
set(poppler_SRCS
+ goo/gbasename.cc
goo/gfile.cc
goo/GooTimer.cc
goo/GooString.cc
diff --git a/goo/gbasename.cc b/goo/gbasename.cc
new file mode 100644
index 00000000..dd4607b4
--- /dev/null
+++ b/goo/gbasename.cc
@@ -0,0 +1,51 @@
+//========================================================================
+//
+// gbasename.cc
+//
+// Wrapper for libgen's basename() call which returns a std::string.
+// This is a convenience method working around questionable behavior
+// in the copy of basename() provided by libgen.h.
+//
+// According to man 3 basename:
+//
+// Both dirname() and basename() may modify the contents of path, so it
+// may be desirable to pass a copy when calling one of these functions.
+//
+// ...
+//
+// These functions may return pointers to statically allocated memory
+// which may be overwritten by subsequent calls. Alternatively, they
+// may return a pointer to some part of path, so that the string
+// referred to by path should not be modified or freed until the pointer
+// returned by the function is no longer required.
+//
+// Because basename can modify filename (for some reason), we have to
+// duplicate our input into a mutable buffer before we can call it.
+// The return value might be part of this mutable temporary, but not
+// generally the front, so 'char *' cannot be used as our return value.
+// The return value might also be a statically allocated string,
+// rendering basename (and thus gbasename) non-thread-safe. Because
+// we don't know how basename()'s return value is lifecycled, we need
+// to duplicate it again into something whose lifecycle we can predict.
+//
+// This is how a method that should amount to finding the last slash
+// in a string ends up requiring two memory allocations while managing
+// not to be thread-safe. In a way, it's kind of impressive.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#include "gbasename.h"
+#include <libgen.h>
+#include <string.h>
+
+// Returns the final path component of filename, as computed by libgen's
+// basename(), in a std::string owned by the caller.
+// A null filename yields ".", which is what POSIX basename() returns for a
+// null pointer.
+std::string gbasename(const char* filename)
+{
+  // Guard before building a std::string: constructing one from nullptr is
+  // undefined behaviour.
+  if (filename == nullptr)
+    return ".";
+
+  // basename() may modify its argument, so hand it a private mutable copy.
+  // The copy is owned by a std::string, so it is released on every path
+  // (including an allocation failure below) without a manual free().
+  std::string mutabl(filename);
+
+  // The returned pointer may refer into mutabl or to static storage;
+  // copy it out while mutabl is still alive.
+  std::string retu = basename(&mutabl[0]);
+  return retu;
+}
diff --git a/goo/gbasename.h b/goo/gbasename.h
new file mode 100644
index 00000000..3c5e0065
--- /dev/null
+++ b/goo/gbasename.h
@@ -0,0 +1,22 @@
+//========================================================================
+//
+// gbasename.h
+//
+// Wrapper for libgen's basename() call which returns a std::string.
+// This is a convenience method working around questionable behavior
+// in the copy of basename() provided by libgen.h.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#ifndef GBASENAME_H
+#define GBASENAME_H
+
+#include <string>
+
+std::string gbasename(const char* input);
+
+#endif // ndef GBASENAME_H
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index d83319e2..69a93724 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -58,6 +58,7 @@
#include <math.h>
#include <iostream>
#include "goo/GooString.h"
+#include "goo/gbasename.h"
#include "goo/GooList.h"
#include "UnicodeMap.h"
#include "goo/gmem.h"
@@ -116,16 +117,6 @@ extern double wordBreakThreshold;
static bool debug = false;
static GooString *gstr_buff0 = nullptr; // a workspace in which I format strings
-static GooString* basename(GooString* str){
-
- const char *p=str->c_str();
- int len=str->getLength();
- for (int i=len-1;i>=0;i--)
- if (*(p+i)==SLASH)
- return new GooString((p+i+1),len-i-1);
- return new GooString(str);
-}
-
#if 0
static GooString* Dirname(GooString* str){
@@ -860,13 +851,12 @@ int HtmlPage::dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page)
void HtmlPage::dumpComplex(FILE *file, int page){
FILE* pageFile;
- GooString* tmp;
if( firstPage == -1 ) firstPage = page;
if (dumpComplexHeaders(file, pageFile, page)) { error(errIO, -1, "Couldn't write headers."); return; }
- tmp=basename(DocName);
+ const std::string str = gbasename(DocName->c_str());
fputs("<style type=\"text/css\">\n<!--\n",pageFile);
fputs("\tp {margin: 0; padding: 0;}",pageFile);
@@ -894,12 +884,10 @@ void HtmlPage::dumpComplex(FILE *file, int page){
{
fprintf(pageFile,
"<img width=\"%d\" height=\"%d\" src=\"%s%03d.%s\" alt=\"background image\"/>\n",
- pageWidth, pageHeight, tmp->c_str(),
+ pageWidth, pageHeight, str.c_str(),
(page-firstPage+1), imgExt->c_str());
}
- delete tmp;
-
for(HtmlString *tmp1=yxStrings;tmp1;tmp1=tmp1->yxNext){
if (tmp1->htext){
fprintf(pageFile,
@@ -1070,7 +1058,7 @@ void HtmlOutputDev::doFrame(int firstPage){
delete fName;
- fName=basename(Docname);
+ const std::string baseName = gbasename(Docname->c_str());
fputs(DOCTYPE, fContentsFrame);
fputs("\n<html>",fContentsFrame);
fputs("\n<head>",fContentsFrame);
@@ -1080,16 +1068,15 @@ void HtmlOutputDev::doFrame(int firstPage){
dumpMetaVars(fContentsFrame);
fprintf(fContentsFrame, "</head>\n");
fputs("<frameset cols=\"100,*\">\n",fContentsFrame);
- fprintf(fContentsFrame,"<frame name=\"links\" src=\"%s_ind.html\"/>\n",fName->c_str());
+ fprintf(fContentsFrame,"<frame name=\"links\" src=\"%s_ind.html\"/>\n", baseName.c_str());
fputs("<frame name=\"contents\" src=",fContentsFrame);
if (complexMode)
- fprintf(fContentsFrame,"\"%s-%d.html\"",fName->c_str(), firstPage);
+ fprintf(fContentsFrame,"\"%s-%d.html\"", baseName.c_str(), firstPage);
else
- fprintf(fContentsFrame,"\"%ss.html\"",fName->c_str());
+ fprintf(fContentsFrame,"\"%ss.html\"", baseName.c_str());
fputs("/>\n</frameset>\n</html>\n",fContentsFrame);
- delete fName;
delete htmlEncoding;
fclose(fContentsFrame);
}
@@ -1149,9 +1136,9 @@ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char
if (doOutline)
{
- GooString *str = basename(Docname);
- fprintf(fContentsFrame, "<a href=\"%s%s\" target=\"contents\">Outline</a><br/>", str->c_str(), complexMode ? "-outline.html" : "s.html#outline");
- delete str;
+ fprintf(fContentsFrame, "<a href=\"%s%s\" target=\"contents\">Outline</a><br/>",
+ gbasename(Docname->c_str()).c_str(),
+ complexMode ? "-outline.html" : "s.html#outline");
}
}
if (!complexMode)
@@ -1256,24 +1243,22 @@ void HtmlOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) {
#endif
this->pageNum = pageNum;
- GooString *str=basename(Docname);
+ const std::string str = gbasename(Docname->c_str());
pages->clear();
if(!noframes)
{
if (fContentsFrame)
{
if (complexMode)
- fprintf(fContentsFrame,"<a href=\"%s-%d.html\"",str->c_str(),pageNum);
+ fprintf(fContentsFrame,"<a href=\"%s-%d.html\"", str.c_str(), pageNum);
else
- fprintf(fContentsFrame,"<a href=\"%ss.html#%d\"",str->c_str(),pageNum);
+ fprintf(fContentsFrame,"<a href=\"%ss.html#%d\"", str.c_str(), pageNum);
fprintf(fContentsFrame," target=\"contents\" >Page %d</a><br/>\n",pageNum);
}
}
pages->pageWidth=static_cast<int>(state->getPageWidth());
pages->pageHeight=static_cast<int>(state->getPageHeight());
-
- delete str;
}
@@ -1561,8 +1546,8 @@ GooString* HtmlOutputDev::getLinkDest(AnnotLink *link){
switch(link->getAction()->getKind())
{
case actionGoTo:
- {
- GooString* file=basename(Docname);
+ {
+ GooString* file = new GooString(gbasename(Docname->c_str()));
int page=1;
LinkGoTo *ha=(LinkGoTo *)link->getAction();
LinkDest *dest=nullptr;
@@ -1781,7 +1766,7 @@ bool HtmlOutputDev::newHtmlOutlineLevel(FILE *output, const GooList *outlines, i
frames file-4.html files.html#4
noframes file.html#4 file.html#4
*/
- linkName=basename(Docname);
+ linkName = new GooString(gbasename(Docname->c_str()));
GooString *str=GooString::fromInt(page);
if (noframes) {
linkName->append(".html#");
diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h
index 30a7f1d3..a6866295 100644
--- a/utils/HtmlOutputDev.h
+++ b/utils/HtmlOutputDev.h
@@ -44,13 +44,6 @@
#include "Catalog.h"
#include "UnicodeMap.h"
-
-#ifdef _WIN32
-# define SLASH '\\'
-#else
-# define SLASH '/'
-#endif
-
#define xoutRound(x) ((int)(x + 0.5))
#define DOCTYPE "<!DOCTYPE html>"
diff --git a/utils/pdfsig.cc b/utils/pdfsig.cc
index 64299690..3a432b36 100644
--- a/utils/pdfsig.cc
+++ b/utils/pdfsig.cc
@@ -24,10 +24,10 @@
#include <time.h>
#include <hasht.h>
#include <fstream>
-#include <libgen.h>
#include "parseargs.h"
#include "Object.h"
#include "Array.h"
+#include "goo/gbasename.h"
#include "Page.h"
#include "PDFDoc.h"
#include "PDFDocFactory.h"
@@ -108,9 +108,7 @@ static void dumpSignature(int sig_num, int sigCount, FormWidgetSignature *sig_wi
// since { is the magic character to replace things we need to put it twice where
// we don't want it to be replaced
GooString *format = GooString::format("{{0:s}}.sig{{1:{0:d}d}}", sigCountLength);
- char *filenameCopy = strdup(filename);
- GooString *path = GooString::format(format->c_str(), basename(filenameCopy), sig_num);
- free(filenameCopy);
+ GooString *path = GooString::format(format->c_str(), gbasename(filename).c_str(), sig_num);
printf("Signature #%d (%u bytes) => %s\n", sig_num, signature->getLength(), path->c_str());
std::ofstream outfile(path->c_str(), std::ofstream::binary);
outfile.write(signature->c_str(), signature->getLength());
commit 7f4da59665969f624c18a1ba3e1f1ac1ca3478b1
Author: Greg Knight <lyngvi at gmail.com>
Date: Fri Nov 23 19:37:37 2018 -0500
pdftohtml data urls: adding InMemoryFile utility class
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
index 34d96475..3516479e 100644
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@@ -121,6 +121,7 @@ install(FILES pdftotext.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
# pdftohtml
set(pdftohtml_SOURCES ${common_srcs}
+ InMemoryFile.cc
pdftohtml.cc
HtmlFonts.cc
HtmlLinks.cc
diff --git a/utils/InMemoryFile.cc b/utils/InMemoryFile.cc
new file mode 100644
index 00000000..d4ed0f48
--- /dev/null
+++ b/utils/InMemoryFile.cc
@@ -0,0 +1,75 @@
+//========================================================================
+//
+// InMemoryFile.cc
+//
+// Represents a file in-memory with GNU's stdio wrappers.
+// NOTE as of this writing, open() depends on the glibc 'fopencookie'
+// extension and is not supported on other platforms. The
+// HAVE_IN_MEMORY_FILE macro is intended to reflect whether this class is
+// usable.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#include "InMemoryFile.h"
+
+#include <string.h>
+#include <sstream>
+
+// Starts with the I/O position at offset 0 and no FILE handle attached;
+// the data buffer is default-constructed (empty).
+InMemoryFile::InMemoryFile()
+  : iohead{0}
+  , fptr{nullptr}
+{
+}
+
+// Read hook installed by open(): copies up to sz bytes from the current
+// position of the buffer into buf and advances the position.
+// Returns the number of bytes copied, which is 0 once the position has
+// reached the end of the buffer.
+ssize_t InMemoryFile::_read(char* buf, size_t sz)
+{
+  // Guard the subtraction so a position at (or past) the end reads as empty.
+  const size_t available = iohead < data.size() ? data.size() - iohead : 0;
+  const size_t toRead = available < sz ? available : sz;
+  if (toRead == 0)
+    return 0;  // do not form &data[size()], operator[] is out of range there
+  memcpy(buf, data.data() + iohead, toRead);
+  iohead += toRead;
+  return static_cast<ssize_t>(toRead);
+}
+
+// Write hook installed by open(): copies sz bytes from buf into the buffer
+// at the current position, growing the buffer when the write extends past
+// its end, and advances the position. Returns the number of bytes stored.
+ssize_t InMemoryFile::_write(const char* buf, size_t sz)
+{
+  if (sz == 0)
+    return 0;  // nothing to copy; also avoids indexing a still-empty buffer
+  if (iohead + sz > data.size())
+    data.resize(iohead + sz);
+  memcpy(data.data() + iohead, buf, sz);
+  iohead += sz;
+  return static_cast<ssize_t>(sz);
+}
+
+// Seek hook installed by open(): moves the I/O position.
+// On entry *offset is the requested displacement; on success it is replaced
+// by the resulting absolute position, which is clamped to [0, data.size()].
+// Returns 0 on success, -1 for an unknown whence value.
+int InMemoryFile::_seek(off64_t* offset, int whence)
+{
+  // Work in the signed offset type so negative displacements do not depend
+  // on size_t wraparound.
+  const off64_t size = static_cast<off64_t>(data.size());
+  off64_t target = 0;
+  switch (whence) {
+  case SEEK_SET: target = *offset; break;
+  case SEEK_CUR: target = static_cast<off64_t>(iohead) + *offset; break;
+  // SEEK_END is relative to the end of the data, not to the current position.
+  case SEEK_END: target = size + *offset; break;
+  default: return -1;
+  }
+  if (target < 0)
+    target = 0;
+  else if (target > size)
+    target = size;
+  *offset = target;
+  iohead = static_cast<size_t>(target);
+  return 0;
+}
+
+// Creates a stdio FILE whose read/write/seek/close operations are routed to
+// this object through fopencookie(). Returns nullptr when a handle from a
+// previous open() is still open, or when fopencookie support was not
+// compiled in. Closing the returned FILE resets the stored handle.
+FILE* InMemoryFile::open(const char* mode)
+{
+#if HAVE_IN_MEMORY_FILE_FOPENCOOKIE
+  if (fptr != nullptr) {
+    fprintf(stderr, "InMemoryFile: BUG: Why is this opened more than once?");
+    return nullptr; // maybe there's some legit reason for it, whoever comes up with one can remove this line
+  }
+  // The cookie passed to each hook is the InMemoryFile given to fopencookie below.
+  static cookie_io_functions_t methods = {
+    /* .read = */ [](void* cookie, char* buf, size_t sz) { return static_cast<InMemoryFile*>(cookie)->_read(buf, sz); },
+    /* .write = */ [](void* cookie, const char* buf, size_t sz) { return static_cast<InMemoryFile*>(cookie)->_write(buf, sz); },
+    /* .seek = */ [](void* cookie, off64_t* offset, int whence) { return static_cast<InMemoryFile*>(cookie)->_seek(offset, whence); },
+    /* .close = */ [](void* cookie) { static_cast<InMemoryFile*>(cookie)->fptr = nullptr; return 0; },
+  };
+  fptr = fopencookie(this, mode, methods);
+  return fptr;
+#else
+  fprintf (stderr, "If you can read this, your platform does not support the features necessary to achieve your goals.");
+  return nullptr;
+#endif
+}
diff --git a/utils/InMemoryFile.h b/utils/InMemoryFile.h
new file mode 100644
index 00000000..6af7d503
--- /dev/null
+++ b/utils/InMemoryFile.h
@@ -0,0 +1,51 @@
+//========================================================================
+//
+// InMemoryFile.h
+//
+// Represents a file in-memory with GNU's stdio wrappers.
+// NOTE as of this writing, open() depends on the glibc 'fopencookie'
+// extension and is not supported on other platforms. The
+// HAVE_IN_MEMORY_FILE macro is intended to reflect whether this class is
+// usable.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#ifndef IN_MEMORY_FILE_H
+#define IN_MEMORY_FILE_H
+
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#if defined(__USE_GNU) && !defined(__ANDROID_API__)
+# define HAVE_IN_MEMORY_FILE (1)
+# define HAVE_IN_MEMORY_FILE_FOPENCOOKIE (1) // used internally
+#endif
+
+// Byte buffer held in memory that can be exposed as a stdio FILE via open().
+class InMemoryFile {
+public:
+  InMemoryFile();
+
+  /* Returns a file handle for this file. This is scoped to this object
+   * and must be fclosed() by the caller before destruction. */
+  FILE* open(const char* mode);
+
+  // Read-only access to the bytes currently held by this object.
+  const std::vector<char>& getBuffer() const
+  {
+    return data;
+  }
+
+private:
+  // Hooks invoked by the FILE returned from open().
+  ssize_t _read(char* buf, size_t sz);
+  ssize_t _write(const char* buf, size_t sz);
+  int _seek(off64_t* offset, int whence);
+
+  size_t iohead;          // current read/write position within data
+  std::vector<char> data; // the file contents
+  FILE *fptr;             // handle handed out by open(); nullptr while not open
+};
+
+#endif // IN_MEMORY_FILE_H
More information about the poppler
mailing list