[poppler] utils/HtmlFonts.h utils/pdftohtml.cc

Mon Jun 20 15:27:13 PDT 2011

 utils/HtmlFonts.h  |    3 ++-
 utils/pdftohtml.cc |   41 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 40 insertions(+), 4 deletions(-)

New commits:
commit c5601bde9d8f3f56e558a6f63e563c9d337810eb
Author: Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
Date:   Mon Jun 20 23:25:43 2011 +0100

    Fix encoding of PDF document metadata in output of pdftohtml
    
    pdftohtml simply copies the PDF document title into the <title> HTML
    tag, which fails when the title is UCS-2 encoded, or if it contains
    characters which are in pdfDocEncoding (an ISO 8859-1 superset), but not
    in ISO 8859-1.  This patch fixes the problem by decoding UCS-2 or
    pdfDocEncoding into Unicode, then encoding this in the desired output
    encoding.  HTML escaping wasn't being done either, so I have used the
    existing function HtmlFont::HtmlFilter to perform both HTML escaping
    and character set encoding. This static method had to be made public
    to call it from pdftohtml. See bug #37900.

diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h
index a0ca78a..2cdea4b 100644
--- a/utils/HtmlFonts.h
+++ b/utils/HtmlFonts.h
@@ -19,6 +19,7 @@
 //
 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
 // Copyright (C) 2010 Albert Astals Cid <aacid at kde.org>
+// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -65,7 +66,6 @@ class HtmlFont{
    static GooString *DefaultFont;
    GooString *FontName;
    HtmlFontColor color;
-   static GooString* HtmlFilter(Unicode* u, int uLen); //char* s);
 public:  
 
    HtmlFont(){FontName=NULL;};
@@ -84,6 +84,7 @@ public:
    GooString* getFontName();
    static GooString* getDefaultFont();
    static void setDefaultFont(GooString* defaultFont);
+   static GooString* HtmlFilter(Unicode* u, int uLen); //char* s);
    GBool isEqual(const HtmlFont& x) const;
    GBool isEqualIgnoreBold(const HtmlFont& x) const;
    static GooString* simple(HtmlFont *font, Unicode *content, int uLen);
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index b46bf1b..fa00ae1 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -18,6 +18,7 @@
 // Copyright (C) 2010 Mike Slegeir <tehpola at yahoo.com>
 // Copyright (C) 2010 Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
+// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -53,6 +54,7 @@
 #endif
 #include "PSOutputDev.h"
 #include "GlobalParams.h"
+#include "PDFDocEncoding.h"
 #include "Error.h"
 #include "DateInfo.h"
 #include "goo/gfile.h"
@@ -511,13 +513,46 @@ int main(int argc, char *argv[]) {
 
 static GooString* getInfoString(Dict *infoDict, char *key) {
   Object obj;
-  GooString *s1 = NULL;
+  // Raw value as read from PDF (may be in pdfDocEncoding or UCS2)
+  GooString *rawString;
+  // Value converted to unicode
+  Unicode *unicodeString;
+  int unicodeLength;
+  // Value HTML escaped and converted to desired encoding
+  GooString *encodedString = NULL;
+  // Is rawString UCS2 (as opposed to pdfDocEncoding)
+  GBool isUnicode;
 
   if (infoDict->lookup(key, &obj)->isString()) {
-    s1 = new GooString(obj.getString());
+    rawString = obj.getString();
+
+    // Convert rawString to unicode
+    encodedString = new GooString();
+    if (rawString->hasUnicodeMarker()) {
+      isUnicode = gTrue;
+      unicodeLength = (obj.getString()->getLength() - 2) / 2;
+    } else {
+      isUnicode = gFalse;
+      unicodeLength = obj.getString()->getLength();
+    }
+    unicodeString = new Unicode[unicodeLength];
+
+    for (int i=0; i<unicodeLength; i++) {
+      if (isUnicode) {
+        unicodeString[i] = ((rawString->getChar((i+1)*2) & 0xff) << 8) |
+          (rawString->getChar(((i+1)*2)+1) & 0xff);
+      } else {
+        unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff];
+      }
+    }
+
+    // HTML escape and encode unicode
+    encodedString = HtmlFont::HtmlFilter(unicodeString, unicodeLength);
+    delete[] unicodeString;
   }
+
   obj.free();
-  return s1;
+  return encodedString;
 }
 
 static GooString* getInfoDate(Dict *infoDict, char *key) {