[poppler] poppler/GfxFont.cc

Tue Dec 18 11:20:24 PST 2007

poppler/GfxFont.cc |  183 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 154 insertions(+), 29 deletions(-)

New commits:
commit bb49e1e3909fc7392c197dc67d9b7f5312fd0dad
Author: Ed Catmur <ed at catmur.co.uk>
Date:   Tue Dec 18 20:20:11 2007 +0100

    Implement Adobe Glyph Naming convention

diff --git a/poppler/GfxFont.cc b/poppler/GfxFont.cc
index dfd51ce..9a573fe 100644
--- a/poppler/GfxFont.cc
+++ b/poppler/GfxFont.cc
@@ -96,6 +96,10 @@ static StdFontMapEntry stdFontMap[] = {
   { "TimesNewRomanPSMT,Italic",     "Times-Italic" }
 };
 
+static int parseCharName(char *charName, Unicode *uBuf, int uLen,
+			 GBool names, GBool ligatures, 
+			 GBool numeric, GBool hex, GBool variants);
+
 //------------------------------------------------------------------------
 // GfxFont
 //------------------------------------------------------------------------
@@ -787,35 +791,24 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, char *tagA, Ref idA, GooString *nameA,
     }
   }
 
-  // pass 2: try to fill in the missing chars, looking for names of
-  // the form 'Axx', 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B'
-  // are any letters, 'xx' is two hex digits, and 'nn' is 2-4
-  // decimal digits
-  if (missing && globalParams->getMapNumericCharNames()) {
+  // construct the char code -> Unicode mapping object
+  ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
+
+  // pass 2: try to fill in the missing chars, looking for ligatures, numeric
+  // references and variants
+  if (missing) {
     for (code = 0; code < 256; ++code) {
       if ((charName = enc[code]) && !toUnicode[code] &&
 	  strcmp(charName, ".notdef")) {
-	n = strlen(charName);
-	code2 = -1;
-	if (hex && n == 3 && isalpha(charName[0]) &&
-	    isxdigit(charName[1]) && isxdigit(charName[2])) {
-	  sscanf(charName+1, "%x", &code2);
-	} else if (hex && n == 2 &&
-		   isxdigit(charName[0]) && isxdigit(charName[1])) {
-	  sscanf(charName, "%x", &code2);
-	} else if (!hex && n >= 2 && n <= 4 &&
-		   isdigit(charName[0]) && isdigit(charName[1])) {
-	  code2 = atoi(charName);
-	} else if (n >= 3 && n <= 5 &&
-		   isdigit(charName[1]) && isdigit(charName[2])) {
-	  code2 = atoi(charName+1);
-	} else if (n >= 4 && n <= 6 &&
-		   isdigit(charName[2]) && isdigit(charName[3])) {
-	  code2 = atoi(charName+2);
-	}
-	if (code2 >= 0 && code2 <= 0xff) {
-	  toUnicode[code] = (Unicode)code2;
-	}
+	if ((n = parseCharName(charName, uBuf, sizeof(uBuf)/sizeof(*uBuf), 
+			       gFalse, // don't check simple names (pass 1)
+			       gTrue, // do check ligatures
+			       globalParams->getMapNumericCharNames(),
+			       hex,
+			       gTrue))) // do check variants
+	  ctu->setMapping((CharCode)code, uBuf, n);
+	else
+	  error(-1, "Could not parse charref for nameToUnicode: %s", charName);
       }
     }
 
@@ -829,9 +822,6 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, char *tagA, Ref idA, GooString *nameA,
     }
   }
 
-  // construct the char code -> Unicode mapping object
-  ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
-
   // merge in a ToUnicode CMap, if there is one -- this overwrites
   // existing entries in ctu, i.e., the ToUnicode CMap takes
   // precedence, but the other encoding info is allowed to fill in any
@@ -961,6 +951,141 @@ Gfx8BitFont::~Gfx8BitFont() {
   }
 }
 
+// This function is in part a derived work of the Adobe Glyph Mapping
+// Convention: http://www.adobe.com/devnet/opentype/archives/glyph.html
+// Algorithmic comments are excerpted from that document to aid
+// maintainability.
+// Returns the number of Unicode values stored in uBuf (0 if charName could
+// not be mapped).  uLen is the capacity of uBuf; names, ligatures, numeric
+// and variants enable the respective lookups; hex selects 'xx' over 'nn'.
+static int parseCharName(char *charName, Unicode *uBuf, int uLen,
+			 GBool names, GBool ligatures,
+			 GBool numeric, GBool hex, GBool variants)
+{
+  if (uLen <= 0) {
+    error(-1, "Zero-length output buffer (recursion overflow?) in "
+	  "nameToUnicode: %s", charName);
+    return 0;
+  }
+  // Step 1: drop all the characters from the glyph name starting with the
+  // first occurrence of a period (U+002E FULL STOP), if any.
+  if (variants) {
+    char *var_part = strchr(charName, '.');
+    if (var_part == charName) {
+      return 0;	// .notdef or similar
+    } else if (var_part != NULL) {
+      // parse names of the form 7.oldstyle, P.swash, s.sc, etc.
+      // copy with copyString and truncate so the gfree below matches
+      char *main_part = copyString(charName);
+      main_part[var_part - charName] = '\0';
+      GBool namesRecurse = gTrue, variantsRecurse = gFalse;
+      int n = parseCharName(main_part, uBuf, uLen, namesRecurse, ligatures,
+			    numeric, hex, variantsRecurse);
+      gfree(main_part);
+      return n;
+    }
+  }
+  // Step 2: split the remaining string into a sequence of components, using
+  // underscore (U+005F LOW LINE) as the delimiter.
+  if (ligatures && strchr(charName, '_')) {
+    // parse names of the form A_a (e.g. f_i, T_h, l_quotesingle)
+    char *lig_part, *lig_end, *lig_copy;
+    int n = 0, m;
+    lig_part = lig_copy = copyString(charName);
+    do {
+      if ((lig_end = strchr(lig_part, '_')))
+	*lig_end = '\0';
+      if (lig_part[0] != '\0') {
+	GBool namesRecurse = gTrue, ligaturesRecurse = gFalse;
+	if ((m = parseCharName(lig_part, uBuf + n, uLen - n, namesRecurse,
+			       ligaturesRecurse, numeric, hex, variants)))
+	  n += m;
+	else
+	  error(-1, "Could not parse ligature component in charref for "
+		"nameToUnicode: %s", charName);
+      }
+      if (lig_end) lig_part = lig_end + 1;  // no pointer arithmetic on NULL
+    } while (lig_end && n < uLen);
+    gfree(lig_copy);
+    return n;
+  }
+  // Step 3: map each component to a character string according to the
+  // procedure below, and concatenate those strings; the result is the
+  // character string to which the glyph name is mapped.
+  // 3.1. if the font is Zapf Dingbats (PostScript FontName ZapfDingbats), and
+  // the component is in the ZapfDingbats list, then map it to the
+  // corresponding character in that list.
+  // 3.2. otherwise, if the component is in the Adobe Glyph List, then map it
+  // to the corresponding character in that list.
+  if (names && (uBuf[0] = globalParams->mapNameToUnicode(charName))) {
+    return 1;
+  }
+  // <ctype.h> classifiers require unsigned char values, hence the c view
+  const unsigned char *c = (const unsigned char *)charName;
+  unsigned int u, n = strlen(charName);
+  // 3.3. otherwise, if the component is of the form "uni" (U+0075 U+006E
+  // U+0069) followed by a sequence of uppercase hexadecimal digits (0 .. 9,
+  // A .. F, i.e. U+0030 .. U+0039, U+0041 .. U+0046), the length of that
+  // sequence is a multiple of four, and each group of four digits represents
+  // a number in the set {0x0000 .. 0xD7FF, 0xE000 .. 0xFFFF}, then interpret
+  // each such number as a Unicode scalar value and map the component to the
+  // string made of those scalar values. Note that the range and digit length
+  // restrictions mean that the "uni" prefix can be used only with Unicode
+  // values from the Basic Multilingual Plane (BMP).
+  // (Steps 3.3 and 3.4 are tried before, and regardless of, the numeric
+  // heuristics below, so that e.g. "u0041" yields U+0041 and not 41.)
+  if (n >= 7 && (n % 4) == 3 && !strncmp(charName, "uni", 3)) {
+    int i = 0;
+    for (unsigned int m = 3; i < uLen && m < n; m += 4) {
+      if (isxdigit(c[m]) && isxdigit(c[m + 1]) &&
+	  isxdigit(c[m + 2]) && isxdigit(c[m + 3])) {
+	sscanf(charName + m, "%4x", &u);
+	if (u <= 0xD7FF || (0xE000 <= u && u <= 0xFFFF)) {
+	  uBuf[i++] = u;
+	}
+      }
+    }
+    return i;
+  }
+  // 3.4. otherwise, if the component is of the form "u" (U+0075) followed by
+  // a sequence of four to six uppercase hexadecimal digits {0 .. 9, A .. F}
+  // (U+0030 .. U+0039, U+0041 .. U+0046), and those digits represent a
+  // number in {0x0000 .. 0xD7FF, 0xE000 .. 0x10FFFF}, then interpret this
+  // number as a Unicode scalar value and map the component to the string
+  // made of this scalar value.
+  if (n >= 5 && n <= 7 && c[0] == 'u' && isxdigit(c[1]) && isxdigit(c[2]) &&
+      isxdigit(c[3]) && isxdigit(c[4]) && (n <= 5 || isxdigit(c[5])) &&
+      (n <= 6 || isxdigit(c[6]))) {
+    sscanf(charName + 1, "%x", &u);
+    if (u <= 0xD7FF || (0xE000 <= u && u <= 0x10FFFF)) {
+      uBuf[0] = u;
+      return 1;
+    }
+  }
+  if (numeric) {
+    // Not in Adobe Glyph Mapping convention: look for names of the form 'Axx',
+    // 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B' are any letters, 'xx' is
+    // two hex digits, and 'nn' is 2-4 decimal digits
+    if (hex && (n == 2 || (n == 3 && isalpha(c[0]))) &&
+	isxdigit(c[n - 2]) && isxdigit(c[n - 1])) {
+      sscanf(charName + n - 2, "%x", &u);	// scan into u; no cast of uBuf
+      uBuf[0] = u;
+      return 1;
+    } else if (!hex && n >= 2 && n <= 4 && isdigit(c[0]) && isdigit(c[1])) {
+      uBuf[0] = (Unicode)atoi(charName);
+      return 1;
+    } else if (n >= 3 && n <= 5 && isdigit(c[1]) && isdigit(c[2])) {
+      uBuf[0] = (Unicode)atoi(charName + 1);
+      return 1;
+    } else if (n >= 4 && n <= 6 && isdigit(c[2]) && isdigit(c[3])) {
+      uBuf[0] = (Unicode)atoi(charName + 2);
+      return 1;
+    }
+  }
+  // 3.5. otherwise, map the component to the empty string
+  return 0;
+}
+
 int Gfx8BitFont::getNextChar(char *s, int len, CharCode *code,
 			     Unicode *u, int uSize, int *uLen,
 			     double *dx, double *dy, double *ox, double *oy) {