[poppler] utils/HtmlFonts.h utils/pdftohtml.cc
Albert Astals Cid
aacid at kemper.freedesktop.org
Mon Jun 20 15:27:13 PDT 2011
utils/HtmlFonts.h | 3 ++-
utils/pdftohtml.cc | 41 ++++++++++++++++++++++++++++++++++++++---
2 files changed, 40 insertions(+), 4 deletions(-)
New commits:
commit c5601bde9d8f3f56e558a6f63e563c9d337810eb
Author: Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
Date: Mon Jun 20 23:25:43 2011 +0100
Fix encoding of PDF document metadata in output of pdftohtml
pdftohtml simply copies the PDF document title into the <title> HTML
tag, which fails when the title is UCS-2 encoded, or if it contains
characters which are in pdfDocEncoding (an ISO 8859-1 superset), but not
in ISO 8859-1. This patch fixes the problem by decoding UCS-2 or
pdfDocEncoding into Unicode, then encoding this in the desired output
encoding. HTML escaping wasn't being done either, so I have used the
existing function HtmlFont::HtmlFilter to perform both HTML escaping
and character set encoding. This static method had to be made public
to call it from pdftohtml. See bug #37900.
diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h
index a0ca78a..2cdea4b 100644
--- a/utils/HtmlFonts.h
+++ b/utils/HtmlFonts.h
@@ -19,6 +19,7 @@
//
// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
// Copyright (C) 2010 Albert Astals Cid <aacid at kde.org>
+// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
@@ -65,7 +66,6 @@ class HtmlFont{
static GooString *DefaultFont;
GooString *FontName;
HtmlFontColor color;
- static GooString* HtmlFilter(Unicode* u, int uLen); //char* s);
public:
HtmlFont(){FontName=NULL;};
@@ -84,6 +84,7 @@ public:
GooString* getFontName();
static GooString* getDefaultFont();
static void setDefaultFont(GooString* defaultFont);
+ static GooString* HtmlFilter(Unicode* u, int uLen); //char* s);
GBool isEqual(const HtmlFont& x) const;
GBool isEqualIgnoreBold(const HtmlFont& x) const;
static GooString* simple(HtmlFont *font, Unicode *content, int uLen);
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index b46bf1b..fa00ae1 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -18,6 +18,7 @@
// Copyright (C) 2010 Mike Slegeir <tehpola at yahoo.com>
// Copyright (C) 2010 Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
+// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
@@ -53,6 +54,7 @@
#endif
#include "PSOutputDev.h"
#include "GlobalParams.h"
+#include "PDFDocEncoding.h"
#include "Error.h"
#include "DateInfo.h"
#include "goo/gfile.h"
@@ -511,13 +513,46 @@ int main(int argc, char *argv[]) {
static GooString* getInfoString(Dict *infoDict, char *key) {
Object obj;
- GooString *s1 = NULL;
+ // Raw value as read from PDF (may be in pdfDocEncoding or UCS2)
+ GooString *rawString;
+ // Value converted to unicode
+ Unicode *unicodeString;
+ int unicodeLength;
+ // Value HTML escaped and converted to desired encoding
+ GooString *encodedString = NULL;
+ // Is rawString UCS2 (as opposed to pdfDocEncoding)
+ GBool isUnicode;
if (infoDict->lookup(key, &obj)->isString()) {
- s1 = new GooString(obj.getString());
+ rawString = obj.getString();
+
+ // Convert rawString to unicode
+ encodedString = new GooString();
+ if (rawString->hasUnicodeMarker()) {
+ isUnicode = gTrue;
+ unicodeLength = (obj.getString()->getLength() - 2) / 2;
+ } else {
+ isUnicode = gFalse;
+ unicodeLength = obj.getString()->getLength();
+ }
+ unicodeString = new Unicode[unicodeLength];
+
+ for (int i=0; i<unicodeLength; i++) {
+ if (isUnicode) {
+ unicodeString[i] = ((rawString->getChar((i+1)*2) & 0xff) << 8) |
+ (rawString->getChar(((i+1)*2)+1) & 0xff);
+ } else {
+ unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff];
+ }
+ }
+
+ // HTML escape and encode unicode
+ encodedString = HtmlFont::HtmlFilter(unicodeString, unicodeLength);
+ delete[] unicodeString;
}
+
obj.free();
- return s1;
+ return encodedString;
}
static GooString* getInfoDate(Dict *infoDict, char *key) {
More information about the poppler
mailing list