[poppler] utils/pdftotext.cc

Sat Jun 4 12:23:25 PDT 2011

utils/pdftotext.cc |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

New commits:
commit 7b123bf2b11ac81f24a966186a06de739d3c8f02
Author: Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
Date:   Sat Jun 4 20:22:52 2011 +0100

    Fix pdftotext -htmlmeta to correctly output U+2019 in PDF metadata
    
    In PDF documents, right single quotation mark (U+2019) may be encoded as
    0x90 because PDFDocEncoding uses some of the reserved characters in
    ISO 8859-1. However, pdftotext -htmlmeta assumes that characters are either
    UCS-2 or ISO 8859-1. Thus, when a right single quotation mark is encoded as
    0x90, it is output as Unicode U+0090 (which is a control character). pdfinfo
    does the right thing by first converting from PDFDocEncoding to Unicode
    with pdfDocEncoding[], before encoding it in the desired character set.
    This patch applies the same logic to pdftotext. pdftohtml is broken in the
    same way, but this patch does not attempt to fix it.
    
    Bug 37900

diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 5d1cfb5..eae4058 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -50,6 +50,7 @@
 #include "TextOutputDev.h"
 #include "CharTypes.h"
 #include "UnicodeMap.h"
+#include "PDFDocEncoding.h"
 #include "Error.h"
 #include <string>
 
@@ -452,7 +453,7 @@ static void printInfoString(FILE *f, Dict *infoDict, char *key,
 	    (s1->getChar(i+1) & 0xff);
 	i += 2;
       } else {
-	u = s1->getChar(i) & 0xff;
+	u = pdfDocEncoding[s1->getChar(i) & 0xff];
 	++i;
       }
       n = uMap->mapUnicode(u, buf, sizeof(buf));