[poppler] poppler/PDFDocEncoding.cc
Carlos Garcia Campos
carlosgc at kemper.freedesktop.org
Thu Feb 14 03:52:45 PST 2008
poppler/PDFDocEncoding.cc | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
New commits:
commit a69bd442e52f4495f8d6bfd3bb58b3ebd1be1a63
Author: Michael Vrable <mvrable at cs.ucsd.edu>
Date: Thu Feb 14 12:52:22 2008 +0100
Provide Unicode mappings for some control characters in PDFDocEncoding.
Though they do not represent glyphs, values such as carriage return can be
found in text strings in PDFDocEncoding. Provide mappings for these bytes
to Unicode.
Additionally, map unknown characters to U+FFFD instead of U+0000, so that
unknown characters do not result in nulls (which can truncate strings
early, particularly if the string is later re-encoded into null-terminated
UTF-8).
diff --git a/poppler/PDFDocEncoding.cc b/poppler/PDFDocEncoding.cc
index 89dc382..1d3ea74 100644
--- a/poppler/PDFDocEncoding.cc
+++ b/poppler/PDFDocEncoding.cc
@@ -8,10 +8,21 @@
#include "PDFDocEncoding.h"
+// Mapping of PDFDocEncoding (used to represent text values such as document
+// metadata or annotation text) to Unicode codepoints. Not all 8-bit values in
+// PDFDocEncoding are defined; undefined bytes are mapped to U+FFFD (Unicode
+// replacement character).
+//
+// PDFDocEncoding is only directly defined for printable characters, but some
+// control characters such as carriage return will still be used. We define
+// mappings of the standard whitespace control characters (tab, line feed,
+// form feed, carriage return) to the corresponding Unicode values. Byte 0x00
+// stays U+0000; all other control characters are left undefined.
+
Unicode pdfDocEncoding[256] = {
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 00
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 10
+ 0x0000, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // 00
+ 0xfffd, 0x0009, 0x000a, 0xfffd, 0x000c, 0x000d, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // 10
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
@@ -24,13 +35,13 @@ Unicode pdfDocEncoding[256] = {
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70
- 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
+ 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0xfffd,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, // 80
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, // 90
- 0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
+ 0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0xfffd,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
- 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
+ 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0xfffd, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
More information about the poppler mailing list