[poppler] 6 commits - CMakeLists.txt goo/gbase64.cc goo/gbase64.h goo/gbasename.cc goo/gbasename.h utils/CMakeLists.txt utils/HtmlOutputDev.cc utils/HtmlOutputDev.h utils/InMemoryFile.cc utils/InMemoryFile.h utils/pdfsig.cc utils/pdftohtml.1 utils/pdftohtml.cc

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Sun Feb 10 14:49:54 UTC 2019


 CMakeLists.txt         |    2 
 goo/gbase64.cc         |   50 ++++++++++++++++++++++
 goo/gbase64.h          |   28 ++++++++++++
 goo/gbasename.cc       |   51 ++++++++++++++++++++++
 goo/gbasename.h        |   22 +++++++++
 utils/CMakeLists.txt   |    1 
 utils/HtmlOutputDev.cc |  110 +++++++++++++++++++++----------------------------
 utils/HtmlOutputDev.h  |   22 ++++-----
 utils/InMemoryFile.cc  |   75 +++++++++++++++++++++++++++++++++
 utils/InMemoryFile.h   |   51 ++++++++++++++++++++++
 utils/pdfsig.cc        |    6 --
 utils/pdftohtml.1      |    3 +
 utils/pdftohtml.cc     |   49 +++++++++++++++------
 13 files changed, 377 insertions(+), 93 deletions(-)

New commits:
commit 5f6ff67b0e1dc075d737fc840642c292329dcd08
Author: Albert Astals Cid <aacid at kde.org>
Date:   Sun Feb 10 15:32:26 2019 +0100

    pdftohtml: Add -dataurls to man page

diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1
index 5d711ba9..ea386ffd 100644
--- a/utils/pdftohtml.1
+++ b/utils/pdftohtml.1
@@ -43,6 +43,9 @@ generate complex output
 .B \-s
 generate single HTML that includes all pages
 .TP
+.B \-dataurls
+use data URLs instead of external images in HTML. Not available on all platforms
+.TP
 .B \-i
 ignore images
 .TP
commit 7b8dbc0a4dc8e0738658b8e4fe7c44adad15af24
Author: Greg Knight <lyngvi at gmail.com>
Date:   Fri Nov 23 22:30:12 2018 -0500

    pdftohtml: singleHtml and stout are not mutually exclusive. With -dataurls, combining them is actually quite reasonable.

diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index 864ece99..6218ff3c 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -318,7 +318,7 @@ int main(int argc, char *argv[]) {
    if (scale>3.0) scale=3.0;
    if (scale<0.5) scale=0.5;
    
-   if (complexMode || singleHtml) {
+   if (complexMode) {
      //noframes=false;
      stout=false;
    } 
@@ -326,7 +326,6 @@ int main(int argc, char *argv[]) {
    if (stout) {
      noframes=true;
      complexMode=false;
-     singleHtml=false;
    }
 
    if (xml)
commit 91ab53fa635e9ea964f10e9a6681d04d7185c732
Author: Greg Knight <lyngvi at gmail.com>
Date:   Fri Nov 23 19:53:38 2018 -0500

    pdftohtml: add support for dataUrls argument
    
    eliminate the 'extension' field used to regenerate background images; replace with a list of background images

diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 69a93724..1b078003 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -60,6 +60,8 @@
 #include "goo/GooString.h"
 #include "goo/gbasename.h"
 #include "goo/GooList.h"
+#include "goo/gbase64.h"
+#include "goo/gbasename.h"
 #include "UnicodeMap.h"
 #include "goo/gmem.h"
 #include "Error.h"
@@ -71,6 +73,7 @@
 #include "HtmlOutputDev.h"
 #include "HtmlFonts.h"
 #include "HtmlUtils.h"
+#include "InMemoryFile.h"
 #include "Outline.h"
 #include "PDFDoc.h"
 
@@ -102,6 +105,7 @@ static inline bool IS_CLOSER(float x, float y, float z) { return fabs((x)-(y)) <
 
 extern bool complexMode;
 extern bool singleHtml;
+extern bool dataUrls;
 extern bool ignore;
 extern bool printCommands;
 extern bool printHtml;
@@ -267,7 +271,7 @@ void HtmlString::endString()
 // HtmlPage
 //------------------------------------------------------------------------
 
-HtmlPage::HtmlPage(bool rawOrder, const char *imgExtVal) {
+HtmlPage::HtmlPage(bool rawOrder) {
   this->rawOrder = rawOrder;
   curStr = nullptr;
   yxStrings = nullptr;
@@ -281,7 +285,6 @@ HtmlPage::HtmlPage(bool rawOrder, const char *imgExtVal) {
   fontsPageMarker = 0;
   DocName=nullptr;
   firstPage = -1;
-  imgExt = new GooString(imgExtVal);
 }
 
 HtmlPage::~HtmlPage() {
@@ -289,7 +292,6 @@ HtmlPage::~HtmlPage() {
   delete DocName;
   delete fonts;
   delete links;
-  delete imgExt;
   deleteGooList<HtmlImage>(imgList);
 }
 
@@ -849,14 +851,12 @@ int HtmlPage::dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page)
   return 0;
 }
 
-void HtmlPage::dumpComplex(FILE *file, int page){
+void HtmlPage::dumpComplex(FILE *file, int page, const std::vector<std::string>& backgroundImages) {
   FILE* pageFile;
 
   if( firstPage == -1 ) firstPage = page; 
   
   if (dumpComplexHeaders(file, pageFile, page)) { error(errIO, -1, "Couldn't write headers."); return; }
-
-  const std::string str = gbasename(DocName->c_str());
    
   fputs("<style type=\"text/css\">\n<!--\n",pageFile);
   fputs("\tp {margin: 0; padding: 0;}",pageFile);
@@ -880,12 +880,11 @@ void HtmlPage::dumpComplex(FILE *file, int page){
   fprintf(pageFile,"<div id=\"page%d-div\" style=\"position:relative;width:%dpx;height:%dpx;\">\n",
       page, pageWidth, pageHeight);
 
-  if( !ignore ) 
+  if(!ignore && (size_t) (page - firstPage) < backgroundImages.size())
   {
     fprintf(pageFile,
-	    "<img width=\"%d\" height=\"%d\" src=\"%s%03d.%s\" alt=\"background image\"/>\n",
-	    pageWidth, pageHeight, str.c_str(),
-		(page-firstPage+1), imgExt->c_str());
+      "<img width=\"%d\" height=\"%d\" src=\"%s\" alt=\"background image\"/>\n",
+      pageWidth, pageHeight, backgroundImages[page - firstPage].c_str());
   }
   
   for(HtmlString *tmp1=yxStrings;tmp1;tmp1=tmp1->yxNext){
@@ -915,12 +914,12 @@ void HtmlPage::dumpComplex(FILE *file, int page){
 }
 
 
-void HtmlPage::dump(FILE *f, int pageNum) 
+void HtmlPage::dump(FILE *f, int pageNum, const std::vector<std::string>& backgroundImages)
 {
   if (complexMode || singleHtml)
   {
     if (xml) dumpAsXML(f, pageNum);
-    if (!xml) dumpComplex(f, pageNum);  
+    if (!xml) dumpComplex(f, pageNum, backgroundImages);
   }
   else
   {
@@ -1083,7 +1082,6 @@ void HtmlOutputDev::doFrame(int firstPage){
 
 HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char *title,
 	const char *author, const char *keywords, const char *subject, const char *date,
-	const char *extension,
 	bool rawOrder, int firstPage, bool outline) 
 {
   catalog = catalogA;
@@ -1099,7 +1097,7 @@ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char
   //pageNum=firstPage;
   // open file
   needClose = false;
-  pages = new HtmlPage(rawOrder, extension);
+  pages = new HtmlPage(rawOrder);
   
   glMetaVars = new GooList();
   glMetaVars->push_back(new HtmlMetaVar("generator", "pdftohtml 0.36"));
@@ -1107,7 +1105,7 @@ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char
   if( keywords ) glMetaVars->push_back(new HtmlMetaVar("keywords", keywords));
   if( date ) glMetaVars->push_back(new HtmlMetaVar("date", date));
   if( subject ) glMetaVars->push_back(new HtmlMetaVar("subject", subject));
- 
+
   maxPageWidth = 0;
   maxPageHeight = 0;
 
@@ -1272,7 +1270,7 @@ void HtmlOutputDev::endPage() {
 
   pages->conv();
   pages->coalesce();
-  pages->dump(page, pageNum);
+  pages->dump(page, pageNum, backgroundImages);
   
   // I don't yet know what to do in the case when there are pages of different
   // sizes and we want complex output: running ghostscript many times 
@@ -1284,6 +1282,10 @@ void HtmlOutputDev::endPage() {
   if(!stout && !globalParams->getErrQuiet()) printf("Page-%d\n",(pageNum));
 }
 
+void HtmlOutputDev::addBackgroundImage(const std::string& img) {
+  backgroundImages.push_back(img);
+}
+
 void HtmlOutputDev::updateFont(GfxState *state) {
   pages->updateFont(state);
 }
@@ -1309,12 +1311,14 @@ void HtmlOutputDev::drawChar(GfxState *state, double x, double y,
 
 void HtmlOutputDev::drawJpegImage(GfxState *state, Stream *str)
 {
-  FILE *f1;
+  InMemoryFile ims;
+  FILE *f1 = nullptr;
   int c;
 
   // open the image file
-  GooString *fName=createImageFileName("jpg");
-  if (!(f1 = fopen(fName->c_str(), "wb"))) {
+  GooString *fName = createImageFileName("jpg");
+  f1 = dataUrls ? ims.open("wb") : fopen(fName->c_str(), "wb");
+  if (!f1) {
     error(errIO, -1, "Couldn't open image file '{0:t}'", fName);
     delete fName;
     return;
@@ -1330,9 +1334,11 @@ void HtmlOutputDev::drawJpegImage(GfxState *state, Stream *str)
 
   fclose(f1);
 
-  if (fName) {
-      pages->addImage(fName, state);
+  if (dataUrls) {
+    delete fName;
+    fName = new GooString(std::string("data:image/jpeg;base64,") + gbase64Encode(ims.getBuffer()));
   }
+  pages->addImage(fName, state);
 }
 
 void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int height,
@@ -1340,6 +1346,7 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he
 {
 #ifdef ENABLE_LIBPNG
   FILE *f1;
+  InMemoryFile ims;
 
   if (!colorMap && !isMask) {
     error(errInternal, -1, "Can't have color image without a color map");
@@ -1348,7 +1355,8 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he
 
   // open the image file
   GooString *fName=createImageFileName("png");
-  if (!(f1 = fopen(fName->c_str(), "wb"))) {
+  f1 = dataUrls ? ims.open("wb") : fopen(fName->c_str(), "wb");
+  if (!f1) {
     error(errIO, -1, "Couldn't open image file '{0:t}'", fName);
     delete fName;
     return;
@@ -1453,6 +1461,10 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he
   delete writer;
   fclose(f1);
 
+  if (dataUrls) {
+    delete fName;
+    fName = new GooString(std::string("data:image/png;base64,") + gbase64Encode(ims.getBuffer()));
+  }
   pages->addImage(fName, state);
 #else
   return;
@@ -1461,16 +1473,7 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he
 
 GooString *HtmlOutputDev::createImageFileName(const char *ext)
 {
-  GooString *fName=new GooString(Docname);
-  fName->append("-");
-  GooString *pgNum= GooString::fromInt(pageNum);
-  GooString *imgnum= GooString::fromInt(pages->getNumImages()+1);
-
-  fName->append(pgNum)->append("_")->append(imgnum)->append(".")->append(ext);
-  delete pgNum;
-  delete imgnum;
-
-  return fName;
+  return GooString::format("{0:s}-{1:d}_{2:d}.{3:s}", Docname->c_str(), pageNum, pages->getNumImages() + 1, ext);
 }
 
 void HtmlOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h
index a6866295..7f09c056 100644
--- a/utils/HtmlOutputDev.h
+++ b/utils/HtmlOutputDev.h
@@ -36,6 +36,7 @@
 
 #include <stdio.h>
 #include "goo/GooList.h"
+#include "goo/gbasename.h"
 #include "GfxFont.h"
 #include "OutputDev.h"
 #include "HtmlLinks.h"
@@ -63,7 +64,6 @@ enum UnicodeTextDirection {
   textDirTopBottom
 };
 
-
 class HtmlString {
 public:
 
@@ -116,7 +116,7 @@ class HtmlPage {
 public:
 
   // Constructor.
-  HtmlPage(bool rawOrder, const char *imgExtVal);
+  HtmlPage(bool rawOrder);
 
   // Destructor.
   ~HtmlPage();
@@ -159,7 +159,7 @@ public:
   // number of images on the current page
   int  getNumImages() { return imgList->getLength(); }
 
-  void dump(FILE *f, int pageNum);
+  void dump(FILE *f, int pageNum, const std::vector<std::string>& backgroundImages);
 
   // Clear the page.
   void clear();
@@ -179,7 +179,7 @@ private:
   
   void setDocName(const char* fname);
   void dumpAsXML(FILE* f,int page);
-  void dumpComplex(FILE* f, int page);
+  void dumpComplex(FILE* f, int page, const std::vector<std::string>& backgroundImages);
   int dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page);
 
   // marks the position of the fonts that belong to current page (for noframes)
@@ -189,7 +189,6 @@ private:
   GooList   *imgList;
   
   GooString *DocName;
-  GooString *imgExt;
   int pageWidth;
   int pageHeight;
   int firstPage;                // used to begin the numeration of pages
@@ -234,7 +233,6 @@ public:
 	  const char *keywords,
 	  const char *subject,
 	  const char *date,
-	  const char *extension,
 	  bool rawOrder,
 	  int firstPage = 1,
 	  bool outline = 0);
@@ -283,6 +281,10 @@ public:
   // End a page.
   void endPage() override;
 
+  // add a background image to the list of background images,
+  // as this seems to be done outside other processing. img is copied into the list; the caller keeps ownership.
+  void addBackgroundImage(const std::string& img);
+
   //----- update text state
   void updateFont(GfxState *state) override;
 
@@ -345,6 +347,7 @@ private:
   GooList *glMetaVars;
   Catalog *catalog;
   Page *docPage;
+  std::vector<std::string> backgroundImages;
   friend class HtmlPage;
 };
 
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index bbd98237..864ece99 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -46,6 +46,8 @@
 #include <time.h>
 #include "parseargs.h"
 #include "goo/GooString.h"
+#include "goo/gbase64.h"
+#include "goo/gbasename.h"
 #include "goo/gmem.h"
 #include "Object.h"
 #include "Stream.h"
@@ -68,6 +70,7 @@
 #include "DateInfo.h"
 #include "goo/gfile.h"
 #include "Win32Console.h"
+#include "InMemoryFile.h"
 
 static int firstPage = 1;
 static int lastPage = 0;
@@ -77,6 +80,7 @@ static bool printHelp = false;
 bool printHtml = false;
 bool complexMode=false;
 bool singleHtml=false; // singleHtml
+bool dataUrls = false;
 bool ignore=false;
 static char extension[5]="png";
 static double scale=1.5;
@@ -123,6 +127,10 @@ static const ArgDesc argDesc[] = {
    "generate complex document"},
   {"-s",      argFlag,     &singleHtml,          0,
    "generate single document that includes all pages"},
+#ifdef HAVE_IN_MEMORY_FILE
+  {"-dataurls", argFlag,   &dataUrls,      0,
+   "use data URLs instead of external images in HTML"},
+#endif
   {"-i",      argFlag,     &ignore,        0,
    "ignore images"},
   {"-noframes", argFlag,   &noframes,      0,
@@ -366,7 +374,6 @@ int main(int argc, char *argv[]) {
 	  keywords ? keywords->c_str() : nullptr, 
           subject ? subject->c_str() : nullptr, 
 	  date ? date->c_str() : nullptr,
-	  extension,
 	  rawOrder, 
 	  firstPage,
 	  doOutline);
@@ -387,13 +394,6 @@ int main(int argc, char *argv[]) {
   {
       delete date;
   }
-
-  if (htmlOut->isOk())
-  {
-    doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0,
-		      true, false, false);
-    htmlOut->dumpDocOutline(doc);
-  }
   
   if ((complexMode || singleHtml) && !xml && !ignore) {
 #ifdef HAVE_SPLASH
@@ -409,6 +409,7 @@ int main(int argc, char *argv[]) {
     splashOut->startDoc(doc);
 
     for (int pg = firstPage; pg <= lastPage; ++pg) {
+      InMemoryFile imf;
       doc->displayPage(splashOut, pg,
                        72 * scale, 72 * scale,
                        0, true, false, false);
@@ -416,10 +417,22 @@ int main(int argc, char *argv[]) {
 
       imgFileName = GooString::format("{0:s}{1:03d}.{2:s}", 
           htmlFileName->c_str(), pg, extension);
-
-      bitmap->writeImgFile(format, imgFileName->c_str(),
-                           72 * scale, 72 * scale);
-
+      auto f1 = dataUrls ? imf.open("wb") : fopen(imgFileName->c_str(), "wb");
+      if (!f1) {
+        fprintf(stderr, "Could not open %s\n", imgFileName->c_str());
+        delete imgFileName;
+        continue;
+      }
+      bitmap->writeImgFile(format, f1, 72 * scale, 72 * scale);
+      fclose(f1);
+      if (dataUrls) {
+        htmlOut->addBackgroundImage(
+          std::string((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64,") +
+          gbase64Encode(imf.getBuffer())
+        );
+      } else {
+        htmlOut->addBackgroundImage(gbasename(imgFileName->c_str()));
+      }
       delete imgFileName;
     }
 
@@ -434,7 +447,14 @@ int main(int argc, char *argv[]) {
     return -1;
 #endif
   }
-  
+
+  if (htmlOut->isOk())
+  {
+    doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0,
+		      true, false, false);
+    htmlOut->dumpDocOutline(doc);
+  }
+
   delete htmlOut;
 
   exit_status = EXIT_SUCCESS;
commit 44da4d785cffeb5d4bbb1460479add6ce01edea2
Author: Greg Knight <lyngvi at gmail.com>
Date:   Sun Feb 10 10:31:36 2019 +0100

    Introduce gbase64

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74294ca8..bf187ab4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -314,6 +314,7 @@ configure_file(config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
 configure_file(poppler/poppler-config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h)
 
 set(poppler_SRCS
+  goo/gbase64.cc
   goo/gbasename.cc
   goo/gfile.cc
   goo/GooTimer.cc
diff --git a/goo/gbase64.cc b/goo/gbase64.cc
new file mode 100644
index 00000000..e0da77ff
--- /dev/null
+++ b/goo/gbase64.cc
@@ -0,0 +1,50 @@
+//========================================================================
+//
+// gbase64.cc
+//
+// Implementation of a base64 encoder, because another one did not immediately
+// avail itself.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#include "gbase64.h"
+#include <sstream>
+
+static void b64encodeTriplet(char output[4], unsigned char a, unsigned char b, unsigned char c)
+{
+    static const char* base64table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    output[0] = base64table[((a >> 2) & 0x3f) ]; // upper 6 of first byte
+    output[1] = base64table[((a << 4) & 0x30) | ((b >> 4) & 0x0f)]; // lower 2 of first byte, upper 4 of second byte
+    output[2] = base64table[((b << 2) & 0x3c) | ((c >> 6) & 0x03)]; // lower 4 of second byte, upper 2 of third byte
+    output[3] = base64table[((c     ) & 0x3f)]; // lower 6 of third byte
+}
+
+std::string gbase64Encode(const void* input, size_t len)
+{
+    char quad[4];
+    size_t pos = 0;
+    std::stringstream buf;
+    auto bytes = static_cast<const unsigned char*>(input);
+    for ( ; pos + 3 <= len; pos += 3) {
+        b64encodeTriplet(quad, bytes[0], bytes[1], bytes[2]);
+        buf.write(&quad[0], 4);
+        bytes += 3;
+    }
+    switch (len - pos) {
+        case 1:
+            b64encodeTriplet(quad, bytes[0], 0, 0);
+            quad[2] = quad[3] = '=';
+            buf.write(&quad[0], 4);
+            break;
+        case 2:
+            b64encodeTriplet(quad, bytes[0], bytes[1], 0);
+            quad[3] = '=';
+            buf.write(&quad[0], 4);
+            break;
+    }
+    return buf.str();
+}
diff --git a/goo/gbase64.h b/goo/gbase64.h
new file mode 100644
index 00000000..06e2e8b6
--- /dev/null
+++ b/goo/gbase64.h
@@ -0,0 +1,28 @@
+//========================================================================
+//
+// gbase64.h
+//
+// Implementation of a base64 encoder, because another one did not immediately
+// avail itself.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#ifndef GOO_GBASE64_H
+#define GOO_GBASE64_H
+
+#include <string>
+#include <vector>
+
+std::string gbase64Encode(const void* input, size_t sz);
+
+inline std::string gbase64Encode(const std::vector<char>& input)
+    { return input.empty() ? std::string() : gbase64Encode(&input[0], input.size()); }
+
+inline std::string gbase64Encode(const std::vector<unsigned char>& input)
+    { return input.empty() ? std::string() : gbase64Encode(&input[0], input.size()); }
+
+#endif // ndef GOO_GBASE64_H
commit 2ba81611e9ccdcb49275ee247308bd0dcba3e64d
Author: Greg Knight <lyngvi at gmail.com>
Date:   Sun Feb 10 10:28:26 2019 +0100

    Introduce gbasename

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d49721c..74294ca8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -314,6 +314,7 @@ configure_file(config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
 configure_file(poppler/poppler-config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h)
 
 set(poppler_SRCS
+  goo/gbasename.cc
   goo/gfile.cc
   goo/GooTimer.cc
   goo/GooString.cc
diff --git a/goo/gbasename.cc b/goo/gbasename.cc
new file mode 100644
index 00000000..dd4607b4
--- /dev/null
+++ b/goo/gbasename.cc
@@ -0,0 +1,51 @@
+//========================================================================
+//
+// gbasename.cc
+//
+// Wrapper for libgen's basename() call which returns a std::string.
+// This is a convenience method working around questionable behavior
+// in the copy of basename() provided by libgen.h.
+//
+// According to man 3 basename:
+//
+//    Both dirname() and basename() may modify the contents of path, so it
+//    may be desirable to pass a copy when calling one of these functions.
+//
+//    ...
+//
+//    These functions may return pointers to statically allocated memory
+//    which may be overwritten by subsequent calls.  Alternatively, they
+//    may return a pointer to some part of path, so that the string
+//    referred to by path should not be modified or freed until the pointer
+//    returned by the function is no longer required.
+//
+// Because basename can modify filename (for some reason), we have to
+// duplicate our input into a mutable buffer before we can call it.
+// The return value might be part of this mutable temporary, but not
+// generally the front, so 'char *' cannot be used as our return value.
+// The return value might also be a statically allocated string,
+// rendering basename (and thus gbasename) non-thread-safe. Because
+// we don't know how basename()'s return value is lifecycled, we need
+// to duplicate it again into something whose lifecycle we can predict.
+//
+// This is how a method that should amount to finding the last slash
+// in a string ends up requiring two memory allocations while managing
+// not to be thread-safe. In a way, it's kind of impressive.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#include "gbasename.h"
+#include <libgen.h>
+#include <string.h>
+
+std::string gbasename(const char* filename)
+{
+  char* mutabl = strdup(filename);
+  std::string retu = basename(mutabl);
+  free(mutabl);
+  return retu;
+}
diff --git a/goo/gbasename.h b/goo/gbasename.h
new file mode 100644
index 00000000..3c5e0065
--- /dev/null
+++ b/goo/gbasename.h
@@ -0,0 +1,22 @@
+//========================================================================
+//
+// gbasename.h
+//
+// Wrapper for libgen's basename() call which returns a std::string.
+// This is a convenience method working around questionable behavior
+// in the copy of basename() provided by libgen.h.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#ifndef GBASENAME_H
+#define GBASENAME_H
+
+#include <string>
+
+std::string gbasename(const char* input);
+
+#endif // ndef GBASENAME_H
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index d83319e2..69a93724 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -58,6 +58,7 @@
 #include <math.h>
 #include <iostream>
 #include "goo/GooString.h"
+#include "goo/gbasename.h"
 #include "goo/GooList.h"
 #include "UnicodeMap.h"
 #include "goo/gmem.h"
@@ -116,16 +117,6 @@ extern double wordBreakThreshold;
 static bool debug = false;
 static GooString *gstr_buff0 = nullptr; // a workspace in which I format strings
 
-static GooString* basename(GooString* str){
-  
-  const char *p=str->c_str();
-  int len=str->getLength();
-  for (int i=len-1;i>=0;i--)
-    if (*(p+i)==SLASH) 
-      return new GooString((p+i+1),len-i-1);
-  return new GooString(str);
-}
-
 #if 0
 static GooString* Dirname(GooString* str){
   
@@ -860,13 +851,12 @@ int HtmlPage::dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page)
 
 void HtmlPage::dumpComplex(FILE *file, int page){
   FILE* pageFile;
-  GooString* tmp;
 
   if( firstPage == -1 ) firstPage = page; 
   
   if (dumpComplexHeaders(file, pageFile, page)) { error(errIO, -1, "Couldn't write headers."); return; }
 
-  tmp=basename(DocName);
+  const std::string str = gbasename(DocName->c_str());
    
   fputs("<style type=\"text/css\">\n<!--\n",pageFile);
   fputs("\tp {margin: 0; padding: 0;}",pageFile);
@@ -894,12 +884,10 @@ void HtmlPage::dumpComplex(FILE *file, int page){
   {
     fprintf(pageFile,
 	    "<img width=\"%d\" height=\"%d\" src=\"%s%03d.%s\" alt=\"background image\"/>\n",
-	    pageWidth, pageHeight, tmp->c_str(), 
+	    pageWidth, pageHeight, str.c_str(),
 		(page-firstPage+1), imgExt->c_str());
   }
   
-  delete tmp;
-  
   for(HtmlString *tmp1=yxStrings;tmp1;tmp1=tmp1->yxNext){
     if (tmp1->htext){
       fprintf(pageFile,
@@ -1070,7 +1058,7 @@ void HtmlOutputDev::doFrame(int firstPage){
   
   delete fName;
     
-  fName=basename(Docname);
+  const std::string baseName = gbasename(Docname->c_str());
   fputs(DOCTYPE, fContentsFrame);
   fputs("\n<html>",fContentsFrame);
   fputs("\n<head>",fContentsFrame);
@@ -1080,16 +1068,15 @@ void HtmlOutputDev::doFrame(int firstPage){
   dumpMetaVars(fContentsFrame);
   fprintf(fContentsFrame, "</head>\n");
   fputs("<frameset cols=\"100,*\">\n",fContentsFrame);
-  fprintf(fContentsFrame,"<frame name=\"links\" src=\"%s_ind.html\"/>\n",fName->c_str());
+  fprintf(fContentsFrame,"<frame name=\"links\" src=\"%s_ind.html\"/>\n", baseName.c_str());
   fputs("<frame name=\"contents\" src=",fContentsFrame); 
   if (complexMode) 
-      fprintf(fContentsFrame,"\"%s-%d.html\"",fName->c_str(), firstPage);
+      fprintf(fContentsFrame,"\"%s-%d.html\"", baseName.c_str(), firstPage);
   else
-      fprintf(fContentsFrame,"\"%ss.html\"",fName->c_str());
+      fprintf(fContentsFrame,"\"%ss.html\"", baseName.c_str());
   
   fputs("/>\n</frameset>\n</html>\n",fContentsFrame);
  
-  delete fName;
   delete htmlEncoding;
   fclose(fContentsFrame);  
 }
@@ -1149,9 +1136,9 @@ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char
 
          if (doOutline)
          {
-             GooString *str = basename(Docname);
-             fprintf(fContentsFrame, "<a href=\"%s%s\" target=\"contents\">Outline</a><br/>", str->c_str(), complexMode ? "-outline.html" : "s.html#outline");
-             delete str;
+             fprintf(fContentsFrame, "<a href=\"%s%s\" target=\"contents\">Outline</a><br/>",
+                 gbasename(Docname->c_str()).c_str(),
+                 complexMode ? "-outline.html" : "s.html#outline");
          }
      }
 	if (!complexMode)
@@ -1256,24 +1243,22 @@ void HtmlOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) {
 #endif
 
   this->pageNum = pageNum;
-  GooString *str=basename(Docname);
+  const std::string str = gbasename(Docname->c_str());
   pages->clear(); 
   if(!noframes)
   {
     if (fContentsFrame)
 	{
       if (complexMode)
-		fprintf(fContentsFrame,"<a href=\"%s-%d.html\"",str->c_str(),pageNum);
+		fprintf(fContentsFrame,"<a href=\"%s-%d.html\"", str.c_str(), pageNum);
       else 
-		fprintf(fContentsFrame,"<a href=\"%ss.html#%d\"",str->c_str(),pageNum);
+		fprintf(fContentsFrame,"<a href=\"%ss.html#%d\"", str.c_str(), pageNum);
       fprintf(fContentsFrame," target=\"contents\" >Page %d</a><br/>\n",pageNum);
     }
   }
 
   pages->pageWidth=static_cast<int>(state->getPageWidth());
   pages->pageHeight=static_cast<int>(state->getPageHeight());
-
-  delete str;
 } 
 
 
@@ -1561,8 +1546,8 @@ GooString* HtmlOutputDev::getLinkDest(AnnotLink *link){
   switch(link->getAction()->getKind()) 
   {
       case actionGoTo:
-	  { 
-	  GooString* file=basename(Docname);
+	  {
+	  GooString* file = new GooString(gbasename(Docname->c_str()));
 	  int page=1;
 	  LinkGoTo *ha=(LinkGoTo *)link->getAction();
 	  LinkDest *dest=nullptr;
@@ -1781,7 +1766,7 @@ bool HtmlOutputDev::newHtmlOutlineLevel(FILE *output, const GooList *outlines, i
 				frames		file-4.html	files.html#4
 				noframes	file.html#4	file.html#4
 				*/
-				linkName=basename(Docname);
+				linkName = new GooString(gbasename(Docname->c_str()));
 				GooString *str=GooString::fromInt(page);
 				if (noframes) {
 					linkName->append(".html#");
diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h
index 30a7f1d3..a6866295 100644
--- a/utils/HtmlOutputDev.h
+++ b/utils/HtmlOutputDev.h
@@ -44,13 +44,6 @@
 #include "Catalog.h"
 #include "UnicodeMap.h"
 
-
-#ifdef _WIN32
-#  define SLASH '\\'
-#else
-#  define SLASH '/'
-#endif
-
 #define xoutRound(x) ((int)(x + 0.5))
 
 #define DOCTYPE "<!DOCTYPE html>"
diff --git a/utils/pdfsig.cc b/utils/pdfsig.cc
index 64299690..3a432b36 100644
--- a/utils/pdfsig.cc
+++ b/utils/pdfsig.cc
@@ -24,10 +24,10 @@
 #include <time.h>
 #include <hasht.h>
 #include <fstream>
-#include <libgen.h>
 #include "parseargs.h"
 #include "Object.h"
 #include "Array.h"
+#include "goo/gbasename.h"
 #include "Page.h"
 #include "PDFDoc.h"
 #include "PDFDocFactory.h"
@@ -108,9 +108,7 @@ static void dumpSignature(int sig_num, int sigCount, FormWidgetSignature *sig_wi
     // since { is the magic character to replace things we need to put it twice where
     // we don't want it to be replaced
     GooString *format = GooString::format("{{0:s}}.sig{{1:{0:d}d}}", sigCountLength);
-    char *filenameCopy = strdup(filename);
-    GooString *path = GooString::format(format->c_str(), basename(filenameCopy), sig_num);
-    free(filenameCopy);
+    GooString *path = GooString::format(format->c_str(), gbasename(filename).c_str(), sig_num);
     printf("Signature #%d (%u bytes) => %s\n", sig_num, signature->getLength(), path->c_str());
     std::ofstream outfile(path->c_str(), std::ofstream::binary);
     outfile.write(signature->c_str(), signature->getLength());
commit 7f4da59665969f624c18a1ba3e1f1ac1ca3478b1
Author: Greg Knight <lyngvi at gmail.com>
Date:   Fri Nov 23 19:37:37 2018 -0500

    pdftohtml data urls: adding InMemoryFile utility class

diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
index 34d96475..3516479e 100644
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@@ -121,6 +121,7 @@ install(FILES pdftotext.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
 
 # pdftohtml
 set(pdftohtml_SOURCES ${common_srcs}
+  InMemoryFile.cc
   pdftohtml.cc
   HtmlFonts.cc
   HtmlLinks.cc
diff --git a/utils/InMemoryFile.cc b/utils/InMemoryFile.cc
new file mode 100644
index 00000000..d4ed0f48
--- /dev/null
+++ b/utils/InMemoryFile.cc
@@ -0,0 +1,75 @@
+//========================================================================
+//
+// InMemoryFile.cc
+//
+// Represents a file in-memory with GNU's stdio wrappers.
+// NOTE as of this writing, open() depends on the glibc 'fopencookie'
+// extension and is not supported on other platforms. The
+// HAVE_IN_MEMORY_FILE macro is intended to reflect whether this class is
+// usable.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#include "InMemoryFile.h"
+
+#include <string.h>
+#include <sstream>
+
+InMemoryFile::InMemoryFile()
+    : iohead(0)
+    , fptr(nullptr)
+{
+}
+
+ssize_t InMemoryFile::_read(char* buf, size_t sz)
+{
+    auto toRead = std::min<size_t>(data.size() - iohead, sz);
+    memcpy(&buf[0], &data[iohead], toRead);
+    iohead += toRead;
+    return toRead;
+}
+
+ssize_t InMemoryFile::_write(const char* buf, size_t sz)
+{
+    if (iohead + sz > data.size())
+        data.resize(iohead + sz);
+    memcpy(&data[iohead], buf, sz);
+    iohead += sz;
+    return sz;        
+}
+
+int InMemoryFile::_seek(off64_t* offset, int whence) // seek callback handed to fopencookie() in open()
+{
+    switch (whence) { // an unrecognised whence leaves iohead where it is
+        case SEEK_SET: iohead  = (*offset); break;
+        case SEEK_CUR: iohead += (*offset); break;
+        case SEEK_END: iohead  = static_cast<size_t>(static_cast<off64_t>(data.size()) + (*offset)); break; // relative to the end of the data, not to the current position
+    }
+    (*offset) = std::min<off64_t>(std::max<off64_t>(iohead, 0l), data.size()); // clamp into [0, data.size()] and report the resulting position back through *offset
+    iohead = static_cast<size_t>(*offset);
+    return 0; // always reports success
+}
+
+FILE* InMemoryFile::open(const char* mode)
+{
+#if HAVE_IN_MEMORY_FILE_FOPENCOOKIE
+    if (fptr != nullptr) {
+        fprintf(stderr, "InMemoryFile: BUG: Why is this opened more than once?");
+        return nullptr; // maybe there's some legit reason for it, whoever comes up with one can remove this line
+    }
+    static cookie_io_functions_t methods = {
+        /* .read = */ [](void* self, char* buf, size_t sz) { return ((InMemoryFile*)self)->_read(buf, sz); },
+        /* .write = */ [](void* self, const char* buf, size_t sz) { return ((InMemoryFile*)self)->_write(buf, sz); },
+        /* .seek = */ [](void* self, off64_t* offset, int whence) { return ((InMemoryFile*)self)->_seek(offset, whence); },
+        /* .close = */ [](void* self) { ((InMemoryFile*)self)->fptr = nullptr; return 0; },
+    };
+    return fptr = fopencookie(this, mode, methods);
+#else
+    fprintf (stderr, "If you can read this, your platform does not support the features necessary to achieve your goals.");
+    return nullptr;
+#endif
+}
diff --git a/utils/InMemoryFile.h b/utils/InMemoryFile.h
new file mode 100644
index 00000000..6af7d503
--- /dev/null
+++ b/utils/InMemoryFile.h
@@ -0,0 +1,51 @@
+//========================================================================
+//
+// InMemoryFile.h
+//
+// Represents a file in-memory with GNU's stdio wrappers.
+// NOTE as of this writing, open() depends on the glibc 'fopencookie'
+// extension and is not supported on other platforms. The
+// HAVE_IN_MEMORY_FILE macro is intended to reflect whether this class is
+// usable.
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2018 Greg Knight <lyngvi at gmail.com>
+//
+//========================================================================
+
+#ifndef IN_MEMORY_FILE_H
+#define IN_MEMORY_FILE_H
+
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#if defined(__USE_GNU) && !defined(__ANDROID_API__)
+#  define HAVE_IN_MEMORY_FILE (1)
+#  define HAVE_IN_MEMORY_FILE_FOPENCOOKIE (1) // used internally
+#endif
+
+class InMemoryFile {
+private:
+    size_t iohead;
+    std::vector<char> data;
+    FILE *fptr;
+
+    ssize_t _read(char* buf, size_t sz);
+    ssize_t _write(const char* buf, size_t sz);
+    int _seek(off64_t* offset, int whence);
+
+public:
+    InMemoryFile();
+
+public:
+    /* Returns a file handle for this file. This is scoped to this object
+     * and must be closed with fclose() by the caller before destruction. */
+    FILE* open(const char* mode);
+
+    const std::vector<char>& getBuffer() const
+        { return data; }
+};
+
+#endif // IN_MEMORY_FILE_H


More information about the poppler mailing list