[poppler] poppler/PDFDocEncoding.cc

Carlos Garcia Campos carlosgc at kemper.freedesktop.org
Thu Feb 14 03:52:45 PST 2008


 poppler/PDFDocEncoding.cc |   23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

New commits:
commit a69bd442e52f4495f8d6bfd3bb58b3ebd1be1a63
Author: Michael Vrable <mvrable at cs.ucsd.edu>
Date:   Thu Feb 14 12:52:22 2008 +0100

    Provide Unicode mappings for some control characters in PDFDocEncoding.
    
    Though they do not represent glyphs, values such as carriage return can be
    found in text strings in PDFDocEncoding.  Provide mappings for these bytes
    to Unicode.
    
    Additionally, map unknown characters to U+FFFD instead of U+0000, so that
    unknown characters do not result in nulls (which can truncate strings
    early, particularly if the string is later re-encoded into null-terminated
    UTF-8).

diff --git a/poppler/PDFDocEncoding.cc b/poppler/PDFDocEncoding.cc
index 89dc382..1d3ea74 100644
--- a/poppler/PDFDocEncoding.cc
+++ b/poppler/PDFDocEncoding.cc
@@ -8,10 +8,21 @@
 
 #include "PDFDocEncoding.h"
 
+// Mapping of PDFDocEncoding (used to represent text values such as document
+// metadata or annotation text) to Unicode codepoints.  Not all 8-bit values in
+// PDFDocEncoding are defined; undefined bytes are mapped to U+FFFD (Unicode
+// replacement character).
+//
+// PDFDocEncoding is only directly defined for printable characters, but some
+// control characters such as carriage return still appear in text strings.
+// We map the standard whitespace control characters (tab, line feed, form
+// feed, carriage return) to the corresponding Unicode values.  Other control
+// characters are left undefined, except that byte 0x00 still maps to U+0000.
+
 Unicode pdfDocEncoding[256] = {
-  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 00
-  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 10
+  0x0000, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // 00
+  0xfffd, 0x0009, 0x000a, 0xfffd, 0x000c, 0x000d, 0xfffd, 0xfffd,
+  0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // 10
   0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
   0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20
   0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
@@ -24,13 +35,13 @@ Unicode pdfDocEncoding[256] = {
   0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60
   0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
   0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70
-  0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
+  0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0xfffd,
   0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, // 80
   0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
   0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, // 90
-  0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
+  0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0xfffd,
   0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
-  0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
+  0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0xfffd, 0x00ae, 0x00af,
   0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
   0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
   0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0


More information about the poppler mailing list