[poppler] poppler/GfxFont.cc
Albert Astals Cid
aacid at kemper.freedesktop.org
Tue Dec 18 11:20:24 PST 2007
poppler/GfxFont.cc | 183 ++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 154 insertions(+), 29 deletions(-)
New commits:
commit bb49e1e3909fc7392c197dc67d9b7f5312fd0dad
Author: Ed Catmur <ed at catmur.co.uk>
Date: Tue Dec 18 20:20:11 2007 +0100
Implement Adobe Glyph Naming convention
diff --git a/poppler/GfxFont.cc b/poppler/GfxFont.cc
index dfd51ce..9a573fe 100644
--- a/poppler/GfxFont.cc
+++ b/poppler/GfxFont.cc
@@ -96,6 +96,10 @@ static StdFontMapEntry stdFontMap[] = {
{ "TimesNewRomanPSMT,Italic", "Times-Italic" }
};
+static int parseCharName(char *charName, Unicode *uBuf, int uLen, // glyph name in; Unicode values out (uBuf holds uLen entries); returns count written
+ GBool names, GBool ligatures, // names: try the glyph-name table lookup; ligatures: split the name at '_'
+ GBool numeric, GBool hex, GBool variants); // numeric/hex: try numeric-reference forms; variants: drop a '.suffix' first
+
//------------------------------------------------------------------------
// GfxFont
//------------------------------------------------------------------------
@@ -787,35 +791,24 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, char *tagA, Ref idA, GooString *nameA,
}
}
- // pass 2: try to fill in the missing chars, looking for names of
- // the form 'Axx', 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B'
- // are any letters, 'xx' is two hex digits, and 'nn' is 2-4
- // decimal digits
- if (missing && globalParams->getMapNumericCharNames()) {
+ // construct the char code -> Unicode mapping object
+ ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
+
+ // pass 2: try to fill in the missing chars, looking for ligatures, numeric
+ // references and variants
+ if (missing) {
for (code = 0; code < 256; ++code) {
if ((charName = enc[code]) && !toUnicode[code] &&
strcmp(charName, ".notdef")) {
- n = strlen(charName);
- code2 = -1;
- if (hex && n == 3 && isalpha(charName[0]) &&
- isxdigit(charName[1]) && isxdigit(charName[2])) {
- sscanf(charName+1, "%x", &code2);
- } else if (hex && n == 2 &&
- isxdigit(charName[0]) && isxdigit(charName[1])) {
- sscanf(charName, "%x", &code2);
- } else if (!hex && n >= 2 && n <= 4 &&
- isdigit(charName[0]) && isdigit(charName[1])) {
- code2 = atoi(charName);
- } else if (n >= 3 && n <= 5 &&
- isdigit(charName[1]) && isdigit(charName[2])) {
- code2 = atoi(charName+1);
- } else if (n >= 4 && n <= 6 &&
- isdigit(charName[2]) && isdigit(charName[3])) {
- code2 = atoi(charName+2);
- }
- if (code2 >= 0 && code2 <= 0xff) {
- toUnicode[code] = (Unicode)code2;
- }
+ if ((n = parseCharName(charName, uBuf, sizeof(uBuf)/sizeof(*uBuf),
+ gFalse, // don't check simple names (pass 1)
+ gTrue, // do check ligatures
+ globalParams->getMapNumericCharNames(),
+ hex,
+ gTrue))) // do check variants
+ ctu->setMapping((CharCode)code, uBuf, n);
+ else
+ error(-1, "Could not parse charref for nameToUnicode: %s", charName);
}
}
@@ -829,9 +822,6 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, char *tagA, Ref idA, GooString *nameA,
}
}
- // construct the char code -> Unicode mapping object
- ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
-
// merge in a ToUnicode CMap, if there is one -- this overwrites
// existing entries in ctu, i.e., the ToUnicode CMap takes
// precedence, but the other encoding info is allowed to fill in any
@@ -961,6 +951,141 @@ Gfx8BitFont::~Gfx8BitFont() {
}
}
+// Map a glyph name to the sequence of Unicode values it stands for.
+//
+// This function is in part a derived work of the Adobe Glyph Mapping
+// Convention: http://www.adobe.com/devnet/opentype/archives/glyph.html
+// Algorithmic comments are excerpted from that document to aid
+// maintainability.
+//
+//   charName  - NUL-terminated glyph name; it is not modified.
+//   uBuf      - output buffer that receives the Unicode values.
+//   uLen      - capacity of uBuf, in elements.
+//   names     - look the component up with globalParams->mapNameToUnicode().
+//   ligatures - split the name at '_' and map every component separately.
+//   numeric   - additionally try the non-standard numeric forms ('Axx',
+//               'xx', 'Ann', 'ABnn', 'nn') after the convention's own steps.
+//   hex       - with numeric: accept the two-hex-digit forms 'Axx' and 'xx';
+//               when false the bare decimal form 'nn' is accepted instead.
+//   variants  - strip everything from the first '.' before mapping.
+//
+// Returns the number of values stored in uBuf; 0 means the name could not be
+// mapped (the "empty string" of step 3.5).
+static int parseCharName(char *charName, Unicode *uBuf, int uLen,
+                         GBool names, GBool ligatures,
+                         GBool numeric, GBool hex, GBool variants)
+{
+  if (uLen <= 0) {
+    error(-1, "Zero-length output buffer (recursion overflow?) in "
+          "nameToUnicode: %s", charName);
+    return 0;
+  }
+
+  // Step 1: drop all the characters from the glyph name starting with the
+  // first occurrence of a period (U+002E FULL STOP), if any.
+  if (variants) {
+    char *var_part = strchr(charName, '.');
+    if (var_part == charName) {
+      return 0; // .notdef or similar
+    } else if (var_part != NULL) {
+      // parse names of the form 7.oldstyle, P.swash, s.sc, etc.
+      // The prefix is duplicated with copyString() and cut at the period, so
+      // it is released with gfree() exactly like the ligature copy below;
+      // this also avoids the non-standard strndup() and its unchecked result.
+      char *main_part = copyString(charName);
+      main_part[var_part - charName] = '\0';
+      GBool namesRecurse = gTrue, variantsRecurse = gFalse;
+      int n = parseCharName(main_part, uBuf, uLen, namesRecurse, ligatures,
+                            numeric, hex, variantsRecurse);
+      gfree(main_part);
+      return n;
+    }
+  }
+
+  // Step 2: split the remaining string into a sequence of components, using
+  // underscore (U+005F LOW LINE) as the delimiter.
+  if (ligatures && strchr(charName, '_')) {
+    // parse names of the form A_a (e.g. f_i, T_h, l_quotesingle)
+    char *lig_copy = copyString(charName);
+    char *lig_part = lig_copy;
+    int n = 0;
+    while (lig_part != NULL && n < uLen) {
+      char *lig_end = strchr(lig_part, '_');
+      if (lig_end != NULL) {
+        *lig_end = '\0';
+      }
+      if (lig_part[0] != '\0') {
+        GBool namesRecurse = gTrue, ligaturesRecurse = gFalse;
+        int m = parseCharName(lig_part, uBuf + n, uLen - n, namesRecurse,
+                              ligaturesRecurse, numeric, hex, variants);
+        if (m > 0) {
+          n += m;
+        } else {
+          error(-1, "Could not parse ligature component in charref for "
+                "nameToUnicode: %s", charName);
+        }
+      }
+      // Only step past the delimiter when one was found; the last component
+      // ends the walk without forming a pointer from NULL.
+      lig_part = (lig_end != NULL) ? lig_end + 1 : NULL;
+    }
+    gfree(lig_copy);
+    return n;
+  }
+
+  // Step 3: map each component to a character string according to the
+  // procedure below, and concatenate those strings; the result is the
+  // character string to which the glyph name is mapped.
+  // 3.1. if the font is Zapf Dingbats (PostScript FontName ZapfDingbats), and
+  // the component is in the ZapfDingbats list, then map it to the
+  // corresponding character in that list.
+  // 3.2. otherwise, if the component is in the Adobe Glyph List, then map it
+  // to the corresponding character in that list.
+  if (names && (uBuf[0] = globalParams->mapNameToUnicode(charName))) {
+    return 1;
+  }
+
+  size_t n = strlen(charName);
+  // The <ctype.h> classifiers require a value representable as unsigned char;
+  // glyph names come from the document and may contain bytes >= 0x80.
+  const unsigned char *s = (const unsigned char *)charName;
+
+  // Steps 3.3 and 3.4 belong to the convention itself, so they run regardless
+  // of 'numeric' and before the non-standard heuristics; otherwise a name
+  // such as "u0041" would be taken by the decimal 'Ann' rule below.
+  // Lowercase hex digits are tolerated although the convention asks for
+  // uppercase ones.
+
+  // 3.3. otherwise, if the component is of the form "uni" (U+0075 U+006E
+  // U+0069) followed by a sequence of uppercase hexadecimal digits (0 .. 9,
+  // A .. F, i.e. U+0030 .. U+0039, U+0041 .. U+0046), the length of that
+  // sequence is a multiple of four, and each group of four digits represents
+  // a number in the set {0x0000 .. 0xD7FF, 0xE000 .. 0xFFFF}, then interpret
+  // each such number as a Unicode scalar value and map the component to the
+  // string made of those scalar values. Note that the range and digit length
+  // restrictions mean that the "uni" prefix can be used only with Unicode
+  // values from the Basic Multilingual Plane (BMP).
+  if (n >= 7 && (n % 4) == 3 && !strncmp(charName, "uni", 3)) {
+    int i = 0;
+    for (size_t m = 3; i < uLen && m < n; m += 4) {
+      if (isxdigit(s[m]) && isxdigit(s[m + 1]) &&
+          isxdigit(s[m + 2]) && isxdigit(s[m + 3])) {
+        unsigned int u = 0;
+        sscanf(charName + m, "%4x", &u);
+        if (u <= 0xD7FF || (0xE000 <= u && u <= 0xFFFF)) {
+          uBuf[i++] = (Unicode)u;
+        }
+      }
+    }
+    return i;
+  }
+
+  // 3.4. otherwise, if the component is of the form "u" (U+0075) followed by
+  // a sequence of four to six uppercase hexadecimal digits {0 .. 9, A .. F}
+  // (U+0030 .. U+0039, U+0041 .. U+0046), and those digits represent a
+  // number in {0x0000 .. 0xD7FF, 0xE000 .. 0x10FFFF}, then interpret this
+  // number as a Unicode scalar value and map the component to the string
+  // made of this scalar value.
+  if (n >= 5 && n <= 7 && s[0] == 'u' && isxdigit(s[1]) &&
+      isxdigit(s[2]) && isxdigit(s[3]) && isxdigit(s[4])
+      && (n <= 5 || isxdigit(s[5]))
+      && (n <= 6 || isxdigit(s[6]))) {
+    unsigned int u = 0;
+    sscanf(charName + 1, "%x", &u);
+    if (u <= 0xD7FF || (0xE000 <= u && u <= 0x10FFFF)) {
+      uBuf[0] = (Unicode)u;
+      return 1;
+    }
+  }
+
+  if (numeric) {
+    // Not in Adobe Glyph Mapping convention: look for names of the form 'Axx',
+    // 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B' are any letters, 'xx' is
+    // two hex digits, and 'nn' is 2-4 decimal digits
+    unsigned int u = 0;
+    if (hex && n == 3 && isalpha(s[0]) &&
+        isxdigit(s[1]) && isxdigit(s[2])) {
+      sscanf(charName + 1, "%x", &u);
+      uBuf[0] = (Unicode)u;
+      return 1;
+    } else if (hex && n == 2 &&
+               isxdigit(s[0]) && isxdigit(s[1])) {
+      sscanf(charName, "%x", &u);
+      uBuf[0] = (Unicode)u;
+      return 1;
+    } else if (!hex && n >= 2 && n <= 4 &&
+               isdigit(s[0]) && isdigit(s[1])) {
+      uBuf[0] = (Unicode)atoi(charName);
+      return 1;
+    } else if (n >= 3 && n <= 5 &&
+               isdigit(s[1]) && isdigit(s[2])) {
+      uBuf[0] = (Unicode)atoi(charName + 1);
+      return 1;
+    } else if (n >= 4 && n <= 6 &&
+               isdigit(s[2]) && isdigit(s[3])) {
+      uBuf[0] = (Unicode)atoi(charName + 2);
+      return 1;
+    }
+  }
+
+  // 3.5. otherwise, map the component to the empty string
+  return 0;
+}
+
int Gfx8BitFont::getNextChar(char *s, int len, CharCode *code,
Unicode *u, int uSize, int *uLen,
double *dx, double *dy, double *ox, double *oy) {
More information about the poppler
mailing list