[HarfBuzz] harfbuzz: Branch 'master' - 5 commits

Behdad Esfahbod behdad at kemper.freedesktop.org
Wed Jul 16 12:37:03 PDT 2014


 src/hb-buffer-private.hh             |    1 
 src/hb-buffer.cc                     |   79 ++++++++-
 src/hb-buffer.h                      |   23 ++
 src/hb-ot-layout-gsubgpos-private.hh |   25 --
 src/hb-utf-private.hh                |  306 ++++++++++++++++++-----------------
 test/api/test-buffer.c               |   65 +++++++
 6 files changed, 328 insertions(+), 171 deletions(-)

New commits:
commit 976c8f455221eb599d1c446eafd88d51d7d2aa65
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed Jul 16 15:34:20 2014 -0400

    New API: hb_buffer_[sg]et_replacement_codepoint()
    
    With this change, we now by default replace ill-formed UTF-8/16/32 input
    with U+FFFD.  This can be changed by calling the new API on the buffer.
    Previously the replacement value was (hb_codepoint_t)-1.
    
    Note that hb_buffer_clear_contents() does NOT reset the replacement
    character.
    
    See discussion here:
    
    https://github.com/behdad/harfbuzz/commit/6f13b6d62daae4989e3cc2fe4b168e5c59650964
    
    New API:
    
      hb_buffer_set_replacement_codepoint()
      hb_buffer_get_replacement_codepoint()

diff --git a/src/hb-buffer-private.hh b/src/hb-buffer-private.hh
index 3a2b9ab..5eccd3c 100644
--- a/src/hb-buffer-private.hh
+++ b/src/hb-buffer-private.hh
@@ -52,6 +52,7 @@ struct hb_buffer_t {
   hb_unicode_funcs_t *unicode; /* Unicode functions */
   hb_segment_properties_t props; /* Script, language, direction */
   hb_buffer_flags_t flags; /* BOT / EOT / etc. */
+  hb_codepoint_t replacement; /* U+FFFD or something else. */
 
   /* Buffer contents */
 
diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc
index 242cded..2377ba4 100644
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@@ -178,6 +178,7 @@ hb_buffer_t::reset (void)
 
   hb_unicode_funcs_destroy (unicode);
   unicode = hb_unicode_funcs_get_default ();
+  replacement = HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT;
 
   clear ();
 }
@@ -703,6 +704,7 @@ hb_buffer_get_empty (void)
     const_cast<hb_unicode_funcs_t *> (&_hb_unicode_funcs_nil),
     HB_SEGMENT_PROPERTIES_DEFAULT,
     HB_BUFFER_FLAG_DEFAULT,
+    HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT,
 
     HB_BUFFER_CONTENT_TYPE_INVALID,
     true, /* in_error */
@@ -1048,6 +1050,42 @@ hb_buffer_get_flags (hb_buffer_t *buffer)
 
 
 /**
+ * hb_buffer_set_replacement_codepoint:
+ * @buffer: a buffer.
+ * @replacement: 
+ *
+ * 
+ *
+ * Since: 1.0
+ **/
+void
+hb_buffer_set_replacement_codepoint (hb_buffer_t    *buffer,
+				     hb_codepoint_t  replacement)
+{
+  if (unlikely (hb_object_is_inert (buffer)))
+    return;
+
+  buffer->replacement = replacement;
+}
+
+/**
+ * hb_buffer_get_replacement_codepoint:
+ * @buffer: a buffer.
+ *
+ * 
+ *
+ * Return value: 
+ *
+ * Since: 1.0
+ **/
+hb_codepoint_t
+hb_buffer_get_replacement_codepoint (hb_buffer_t    *buffer)
+{
+  return buffer->replacement;
+}
+
+
+/**
  * hb_buffer_reset:
  * @buffer: a buffer.
  *
@@ -1299,6 +1337,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
 		   int           item_length)
 {
   typedef hb_utf_t<T, true> utf_t;
+  const hb_codepoint_t replacement = buffer->replacement;
 
   assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
 	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
@@ -1330,7 +1369,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
     while (start < prev && buffer->context_len[0] < buffer->CONTEXT_LENGTH)
     {
       hb_codepoint_t u;
-      prev = utf_t::prev (prev, start, &u);
+      prev = utf_t::prev (prev, start, &u, replacement);
       buffer->context[0][buffer->context_len[0]++] = u;
     }
   }
@@ -1341,7 +1380,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
   {
     hb_codepoint_t u;
     const T *old_next = next;
-    next = utf_t::next (next, end, &u);
+    next = utf_t::next (next, end, &u, replacement);
     buffer->add (u, old_next - (const T *) text);
   }
 
@@ -1351,7 +1390,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
   while (next < end && buffer->context_len[1] < buffer->CONTEXT_LENGTH)
   {
     hb_codepoint_t u;
-    next = utf_t::next (next, end, &u);
+    next = utf_t::next (next, end, &u, replacement);
     buffer->context[1][buffer->context_len[1]++] = u;
   }
 
diff --git a/src/hb-buffer.h b/src/hb-buffer.h
index 777c3d9..7b0c920 100644
--- a/src/hb-buffer.h
+++ b/src/hb-buffer.h
@@ -186,12 +186,25 @@ hb_buffer_flags_t
 hb_buffer_get_flags (hb_buffer_t *buffer);
 
 
+
+#define HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT 0xFFFDu
+
+/* Sets codepoint used to replace invalid UTF-8/16/32 entries.
+ * Default is 0xFFFDu. */
+void
+hb_buffer_set_replacement_codepoint (hb_buffer_t    *buffer,
+				     hb_codepoint_t  replacement);
+
+hb_codepoint_t
+hb_buffer_get_replacement_codepoint (hb_buffer_t    *buffer);
+
+
 /* Resets the buffer.  Afterwards it's as if it was just created,
  * except that it has a larger buffer allocated perhaps... */
 void
 hb_buffer_reset (hb_buffer_t *buffer);
 
-/* Like reset, but does NOT clear unicode_funcs. */
+/* Like reset, but does NOT clear unicode_funcs and replacement_codepoint. */
 void
 hb_buffer_clear_contents (hb_buffer_t *buffer);
 
diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh
index cbacd67..68216c4 100644
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@@ -40,7 +40,8 @@ struct hb_utf_t<uint8_t, true>
   static inline const uint8_t *
   next (const uint8_t *text,
 	const uint8_t *end,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     /* Written to only accept well-formed sequences.
      * Based on ideas from ICU's U8_NEXT.
@@ -101,23 +102,24 @@ struct hb_utf_t<uint8_t, true>
     return text;
 
   error:
-    *unicode = -1;
+    *unicode = replacement;
     return text;
   }
 
   static inline const uint8_t *
   prev (const uint8_t *text,
 	const uint8_t *start,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     const uint8_t *end = text--;
     while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
       text--;
 
-    if (likely (next (text, end, unicode) == end))
+    if (likely (next (text, end, unicode, replacement) == end))
       return text;
 
-    *unicode = -1;
+    *unicode = replacement;
     return end - 1;
   }
 
@@ -137,7 +139,8 @@ struct hb_utf_t<uint16_t, true>
   static inline const uint16_t *
   next (const uint16_t *text,
 	const uint16_t *end,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     hb_codepoint_t c = *text++;
 
@@ -161,14 +164,15 @@ struct hb_utf_t<uint16_t, true>
     }
 
     /* Lonely / out-of-order surrogate. */
-    *unicode = -1;
+    *unicode = replacement;
     return text;
   }
 
   static inline const uint16_t *
   prev (const uint16_t *text,
 	const uint16_t *start,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     const uint16_t *end = text--;
     hb_codepoint_t c = *text;
@@ -182,10 +186,10 @@ struct hb_utf_t<uint16_t, true>
     if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu)))
       text--;
 
-    if (likely (next (text, end, unicode) == end))
+    if (likely (next (text, end, unicode, replacement) == end))
       return text;
 
-    *unicode = -1;
+    *unicode = replacement;
     return end - 1;
   }
 
@@ -208,7 +212,8 @@ struct hb_utf_t<uint32_t, validate>
   static inline const uint32_t *
   next (const uint32_t *text,
 	const uint32_t *end HB_UNUSED,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     hb_codepoint_t c = *text++;
     if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
@@ -217,16 +222,17 @@ struct hb_utf_t<uint32_t, validate>
     return text;
 
   error:
-    *unicode = -1;
+    *unicode = replacement;
     return text;
   }
 
   static inline const uint32_t *
   prev (const uint32_t *text,
 	const uint32_t *start HB_UNUSED,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
-    next (text - 1, text, unicode);
+    next (text - 1, text, unicode, replacement);
     return text - 1;
   }
 
diff --git a/test/api/test-buffer.c b/test/api/test-buffer.c
index 1be6931..af73c3f 100644
--- a/test/api/test-buffer.c
+++ b/test/api/test-buffer.c
@@ -374,6 +374,7 @@ test_buffer_utf8_conversion (void)
   unsigned int bytes, chars, i, j, len;
 
   b = hb_buffer_create ();
+  hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1);
 
   for (i = 0; i < G_N_ELEMENTS (utf8_conversion_tests); i++)
   {
@@ -388,7 +389,7 @@ test_buffer_utf8_conversion (void)
     for (chars = 0; test->codepoints[chars]; chars++)
       ;
 
-    hb_buffer_reset (b);
+    hb_buffer_clear_contents (b);
     hb_buffer_add_utf8 (b, test->utf8, bytes,  1, bytes - 2);
 
     glyphs = hb_buffer_get_glyph_infos (b, &len);
@@ -660,6 +661,7 @@ test_buffer_utf8_validity (void)
   unsigned int i;
 
   b = hb_buffer_create ();
+  hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1);
 
   for (i = 0; i < G_N_ELEMENTS (utf8_validity_tests); i++)
   {
@@ -678,7 +680,7 @@ test_buffer_utf8_validity (void)
     else
       segment_bytes = test->max_len;
 
-    hb_buffer_reset (b);
+    hb_buffer_clear_contents (b);
     hb_buffer_add_utf8 (b, test->utf8, text_bytes,  0, segment_bytes);
 
     glyphs = hb_buffer_get_glyph_infos (b, &len);
@@ -718,6 +720,7 @@ test_buffer_utf16_conversion (void)
   unsigned int i;
 
   b = hb_buffer_create ();
+  hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1);
 
   for (i = 0; i < G_N_ELEMENTS (utf16_conversion_tests); i++)
   {
@@ -732,7 +735,7 @@ test_buffer_utf16_conversion (void)
     for (chars = 0; test->codepoints[chars]; chars++)
       ;
 
-    hb_buffer_reset (b);
+    hb_buffer_clear_contents (b);
     hb_buffer_add_utf16 (b, test->utf16, u_len,  1, u_len - 2);
 
     glyphs = hb_buffer_get_glyph_infos (b, &len);
@@ -752,15 +755,15 @@ typedef struct {
 
 /* note: we skip the first and last item from utf32 when adding to buffer */
 static const utf32_conversion_test_t utf32_conversion_tests[] = {
-  {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -1, -1}},
+  {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -3, -3}},
   {{0x41, 0x004D, 0x0430, 0x4E8C, 0x10302, 0x61} , {0x004D, 0x0430, 0x4E8C, 0x10302}},
-  {{0x41, 0xD800, 0xDF02, 0x61}, {-1, -1}},
-  {{0x41, 0xD800, 0xDF02}, {-1}},
-  {{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -1}},
-  {{0x41, 0xD800, 0x61, 0xDF02}, {-1, 0x61}},
-  {{0x41, 0xDF00, 0x61}, {-1}},
+  {{0x41, 0xD800, 0xDF02, 0x61}, {-3, -3}},
+  {{0x41, 0xD800, 0xDF02}, {-3}},
+  {{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -3}},
+  {{0x41, 0xD800, 0x61, 0xDF02}, {-3, 0x61}},
+  {{0x41, 0xDF00, 0x61}, {-3}},
   {{0x41, 0x10FFFF, 0x61}, {0x10FFFF}},
-  {{0x41, 0x110000, 0x61}, {-1}},
+  {{0x41, 0x110000, 0x61}, {-3}},
   {{0x41, 0x61}, {0}}
 };
 
@@ -771,6 +774,7 @@ test_buffer_utf32_conversion (void)
   unsigned int i;
 
   b = hb_buffer_create ();
+  hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -3);
 
   for (i = 0; i < G_N_ELEMENTS (utf32_conversion_tests); i++)
   {
@@ -785,7 +789,7 @@ test_buffer_utf32_conversion (void)
     for (chars = 0; test->codepoints[chars]; chars++)
       ;
 
-    hb_buffer_reset (b);
+    hb_buffer_clear_contents (b);
     hb_buffer_add_utf32 (b, test->utf32, u_len,  1, u_len - 2);
 
     glyphs = hb_buffer_get_glyph_infos (b, &len);
commit bcba8b45024e1eca8be77ca2657de1dc44dbf8fb
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed Jul 16 14:59:04 2014 -0400

    New API hb_buffer_add_codepoints()
    
    Like hb_buffer_add_utf32, but doesn't do any Unicode validation.
    This is how hb_buffer_add_utf32 used to behave until a couple of
    commits ago.

diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc
index d920552..242cded 100644
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@@ -1290,7 +1290,7 @@ hb_buffer_guess_segment_properties (hb_buffer_t *buffer)
   buffer->guess_segment_properties ();
 }
 
-template <typename T>
+template <bool validate, typename T>
 static inline void
 hb_buffer_add_utf (hb_buffer_t  *buffer,
 		   const T      *text,
@@ -1298,7 +1298,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
 		   unsigned int  item_offset,
 		   int           item_length)
 {
-  typedef hb_utf_t<T> utf_t;
+  typedef hb_utf_t<T, true> utf_t;
 
   assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
 	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
@@ -1377,7 +1377,7 @@ hb_buffer_add_utf8 (hb_buffer_t  *buffer,
 		    unsigned int  item_offset,
 		    int           item_length)
 {
-  hb_buffer_add_utf (buffer, (const uint8_t *) text, text_length, item_offset, item_length);
+  hb_buffer_add_utf<true> (buffer, (const uint8_t *) text, text_length, item_offset, item_length);
 }
 
 /**
@@ -1399,7 +1399,7 @@ hb_buffer_add_utf16 (hb_buffer_t    *buffer,
 		     unsigned int    item_offset,
 		     int             item_length)
 {
-  hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length);
+  hb_buffer_add_utf<true> (buffer, text, text_length, item_offset, item_length);
 }
 
 /**
@@ -1421,7 +1421,29 @@ hb_buffer_add_utf32 (hb_buffer_t    *buffer,
 		     unsigned int    item_offset,
 		     int             item_length)
 {
-  hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length);
+  hb_buffer_add_utf<true> (buffer, text, text_length, item_offset, item_length);
+}
+
+/**
+ * hb_buffer_add_codepoints:
+ * @buffer: a buffer.
+ * @text: (array length=text_length):
+ * @text_length: 
+ * @item_offset: 
+ * @item_length: 
+ *
+ * 
+ *
+ * Since: 1.0
+ **/
+void
+hb_buffer_add_codepoints (hb_buffer_t          *buffer,
+			  const hb_codepoint_t *text,
+			  int                   text_length,
+			  unsigned int          item_offset,
+			  int                   item_length)
+{
+  hb_buffer_add_utf<false> (buffer, text, text_length, item_offset, item_length);
 }
 
 
diff --git a/src/hb-buffer.h b/src/hb-buffer.h
index 3086851..777c3d9 100644
--- a/src/hb-buffer.h
+++ b/src/hb-buffer.h
@@ -240,6 +240,14 @@ hb_buffer_add_utf32 (hb_buffer_t    *buffer,
 		     unsigned int    item_offset,
 		     int             item_length);
 
+/* Like add_utf32 but does NOT check for invalid Unicode codepoints. */
+void
+hb_buffer_add_codepoints (hb_buffer_t          *buffer,
+			  const hb_codepoint_t *text,
+			  int                   text_length,
+			  unsigned int          item_offset,
+			  int                   item_length);
+
 
 /* Clears any new items added at the end */
 hb_bool_t
commit 625dbf141a05f1ae81a7b8cbc529996370101284
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed Jul 16 14:49:55 2014 -0400

    [buffer] Templatize UTF-* functions

diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc
index 76bb10c..d920552 100644
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@@ -1298,6 +1298,8 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
 		   unsigned int  item_offset,
 		   int           item_length)
 {
+  typedef hb_utf_t<T> utf_t;
+
   assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
 	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
 
@@ -1305,7 +1307,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
     return;
 
   if (text_length == -1)
-    text_length = hb_utf_strlen (text);
+    text_length = utf_t::strlen (text);
 
   if (item_length == -1)
     item_length = text_length - item_offset;
@@ -1328,7 +1330,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
     while (start < prev && buffer->context_len[0] < buffer->CONTEXT_LENGTH)
     {
       hb_codepoint_t u;
-      prev = hb_utf_prev (prev, start, &u);
+      prev = utf_t::prev (prev, start, &u);
       buffer->context[0][buffer->context_len[0]++] = u;
     }
   }
@@ -1339,7 +1341,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
   {
     hb_codepoint_t u;
     const T *old_next = next;
-    next = hb_utf_next (next, end, &u);
+    next = utf_t::next (next, end, &u);
     buffer->add (u, old_next - (const T *) text);
   }
 
@@ -1349,7 +1351,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
   while (next < end && buffer->context_len[1] < buffer->CONTEXT_LENGTH)
   {
     hb_codepoint_t u;
-    next = hb_utf_next (next, end, &u);
+    next = utf_t::next (next, end, &u);
     buffer->context[1][buffer->context_len[1]++] = u;
   }
 
diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh
index 398f73c..cbacd67 100644
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@@ -29,202 +29,215 @@
 
 #include "hb-private.hh"
 
+template <typename T, bool validate=true> struct hb_utf_t;
+
 
 /* UTF-8 */
 
-static inline const uint8_t *
-hb_utf_next (const uint8_t *text,
-	     const uint8_t *end,
-	     hb_codepoint_t *unicode)
+template <>
+struct hb_utf_t<uint8_t, true>
 {
-  /* Written to only accept well-formed sequences.
-   * Based on ideas from ICU's U8_NEXT.
-   * Generates a -1 for each ill-formed byte. */
+  static inline const uint8_t *
+  next (const uint8_t *text,
+	const uint8_t *end,
+	hb_codepoint_t *unicode)
+  {
+    /* Written to only accept well-formed sequences.
+     * Based on ideas from ICU's U8_NEXT.
+     * Generates a -1 for each ill-formed byte. */
 
-  hb_codepoint_t c = *text++;
+    hb_codepoint_t c = *text++;
 
-  if (c > 0x7Fu)
-  {
-    if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
+    if (c > 0x7Fu)
     {
-      unsigned int t1;
-      if (likely (text < end &&
-		  (t1 = text[0] - 0x80u) <= 0x3Fu))
+      if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
       {
-	c = ((c&0x1Fu)<<6) | t1;
-	text++;
+	unsigned int t1;
+	if (likely (text < end &&
+		    (t1 = text[0] - 0x80u) <= 0x3Fu))
+	{
+	  c = ((c&0x1Fu)<<6) | t1;
+	  text++;
+	}
+	else
+	  goto error;
       }
-      else
-	goto error;
-    }
-    else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
-    {
-      unsigned int t1, t2;
-      if (likely (1 < end - text &&
-		  (t1 = text[0] - 0x80u) <= 0x3Fu &&
-		  (t2 = text[1] - 0x80u) <= 0x3Fu))
+      else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
       {
-	c = ((c&0xFu)<<12) | (t1<<6) | t2;
-	if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu)))
+	unsigned int t1, t2;
+	if (likely (1 < end - text &&
+		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
+		    (t2 = text[1] - 0x80u) <= 0x3Fu))
+	{
+	  c = ((c&0xFu)<<12) | (t1<<6) | t2;
+	  if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu)))
+	    goto error;
+	  text += 2;
+	}
+	else
 	  goto error;
-	text += 2;
       }
-      else
-	goto error;
-    }
-    else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
-    {
-      unsigned int t1, t2, t3;
-      if (likely (2 < end - text &&
-		  (t1 = text[0] - 0x80u) <= 0x3Fu &&
-		  (t2 = text[1] - 0x80u) <= 0x3Fu &&
-		  (t3 = text[2] - 0x80u) <= 0x3Fu))
+      else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
       {
-	c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
-	if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu)))
+	unsigned int t1, t2, t3;
+	if (likely (2 < end - text &&
+		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
+		    (t2 = text[1] - 0x80u) <= 0x3Fu &&
+		    (t3 = text[2] - 0x80u) <= 0x3Fu))
+	{
+	  c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
+	  if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu)))
+	    goto error;
+	  text += 3;
+	}
+	else
 	  goto error;
-	text += 3;
       }
       else
 	goto error;
     }
-    else
-      goto error;
-  }
-
-  *unicode = c;
-  return text;
-
-error:
-  *unicode = -1;
-  return text;
-}
 
-static inline const uint8_t *
-hb_utf_prev (const uint8_t *text,
-	     const uint8_t *start,
-	     hb_codepoint_t *unicode)
-{
-  const uint8_t *end = text--;
-  while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
-    text--;
+    *unicode = c;
+    return text;
 
-  if (likely (hb_utf_next (text, end, unicode) == end))
+  error:
+    *unicode = -1;
     return text;
+  }
 
-  *unicode = -1;
-  return end - 1;
-}
+  static inline const uint8_t *
+  prev (const uint8_t *text,
+	const uint8_t *start,
+	hb_codepoint_t *unicode)
+  {
+    const uint8_t *end = text--;
+    while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
+      text--;
 
+    if (likely (next (text, end, unicode) == end))
+      return text;
 
-static inline unsigned int
-hb_utf_strlen (const uint8_t *text)
-{
-  return strlen ((const char *) text);
-}
+    *unicode = -1;
+    return end - 1;
+  }
+
+  static inline unsigned int
+  strlen (const uint8_t *text)
+  {
+    return ::strlen ((const char *) text);
+  }
+};
 
 
 /* UTF-16 */
 
-static inline const uint16_t *
-hb_utf_next (const uint16_t *text,
-	     const uint16_t *end,
-	     hb_codepoint_t *unicode)
+template <>
+struct hb_utf_t<uint16_t, true>
 {
-  hb_codepoint_t c = *text++;
-
-  if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
+  static inline const uint16_t *
+  next (const uint16_t *text,
+	const uint16_t *end,
+	hb_codepoint_t *unicode)
   {
-    *unicode = c;
+    hb_codepoint_t c = *text++;
+
+    if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
+    {
+      *unicode = c;
+      return text;
+    }
+
+    if (likely (hb_in_range (c, 0xD800u, 0xDBFFu)))
+    {
+      /* High-surrogate in c */
+      hb_codepoint_t l;
+      if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu))))
+      {
+	/* Low-surrogate in l */
+	*unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
+	 text++;
+	 return text;
+      }
+    }
+
+    /* Lonely / out-of-order surrogate. */
+    *unicode = -1;
     return text;
   }
 
-  if (likely (hb_in_range (c, 0xD800u, 0xDBFFu)))
+  static inline const uint16_t *
+  prev (const uint16_t *text,
+	const uint16_t *start,
+	hb_codepoint_t *unicode)
   {
-    /* High-surrogate in c */
-    hb_codepoint_t l;
-    if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu))))
+    const uint16_t *end = text--;
+    hb_codepoint_t c = *text;
+
+    if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
     {
-      /* Low-surrogate in l */
-      *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
-       text++;
-       return text;
+      *unicode = c;
+      return text;
     }
-  }
 
-  /* Lonely / out-of-order surrogate. */
-  *unicode = -1;
-  return text;
-}
+    if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu)))
+      text--;
 
-static inline const uint16_t *
-hb_utf_prev (const uint16_t *text,
-	     const uint16_t *start,
-	     hb_codepoint_t *unicode)
-{
-  const uint16_t *end = text--;
-  hb_codepoint_t c = *text;
+    if (likely (next (text, end, unicode) == end))
+      return text;
 
-  if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
-  {
-    *unicode = c;
-    return text;
+    *unicode = -1;
+    return end - 1;
   }
 
-  if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu)))
-    text--;
 
-  if (likely (hb_utf_next (text, end, unicode) == end))
-    return text;
+  static inline unsigned int
+  strlen (const uint16_t *text)
+  {
+    unsigned int l = 0;
+    while (*text++) l++;
+    return l;
+  }
+};
 
-  *unicode = -1;
-  return end - 1;
-}
 
+/* UTF-32 */
 
-static inline unsigned int
-hb_utf_strlen (const uint16_t *text)
+template <bool validate>
+struct hb_utf_t<uint32_t, validate>
 {
-  unsigned int l = 0;
-  while (*text++) l++;
-  return l;
-}
-
+  static inline const uint32_t *
+  next (const uint32_t *text,
+	const uint32_t *end HB_UNUSED,
+	hb_codepoint_t *unicode)
+  {
+    hb_codepoint_t c = *text++;
+    if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
+      goto error;
+    *unicode = c;
+    return text;
 
-/* UTF-32 */
+  error:
+    *unicode = -1;
+    return text;
+  }
 
-static inline const uint32_t *
-hb_utf_next (const uint32_t *text,
-	     const uint32_t *end HB_UNUSED,
-	     hb_codepoint_t *unicode)
-{
-  hb_codepoint_t c = *text++;
-  if (unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
-    goto error;
-  *unicode = c;
-  return text;
-
-error:
-  *unicode = -1;
-  return text;
-}
-
-static inline const uint32_t *
-hb_utf_prev (const uint32_t *text,
-	     const uint32_t *start HB_UNUSED,
-	     hb_codepoint_t *unicode)
-{
-  hb_utf_next (text - 1, text, unicode);
-  return text - 1;
-}
+  static inline const uint32_t *
+  prev (const uint32_t *text,
+	const uint32_t *start HB_UNUSED,
+	hb_codepoint_t *unicode)
+  {
+    next (text - 1, text, unicode);
+    return text - 1;
+  }
 
-static inline unsigned int
-hb_utf_strlen (const uint32_t *text)
-{
-  unsigned int l = 0;
-  while (*text++) l++;
-  return l;
-}
+  static inline unsigned int
+  strlen (const uint32_t *text)
+  {
+    unsigned int l = 0;
+    while (*text++) l++;
+    return l;
+  }
+};
 
 
 #endif /* HB_UTF_PRIVATE_HH */
commit e634fed4285ce440d277345727ed01757df6d779
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed Jul 16 14:17:26 2014 -0400

    [buffer] Validate UTF-32 input
    
    Same as what we do for UTF-8 and UTF-16.

diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh
index a4c6236..398f73c 100644
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@@ -198,7 +198,14 @@ hb_utf_next (const uint32_t *text,
 	     const uint32_t *end HB_UNUSED,
 	     hb_codepoint_t *unicode)
 {
-  *unicode = *text++;
+  hb_codepoint_t c = *text++;
+  if (unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
+    goto error;
+  *unicode = c;
+  return text;
+
+error:
+  *unicode = -1;
   return text;
 }
 
@@ -207,8 +214,8 @@ hb_utf_prev (const uint32_t *text,
 	     const uint32_t *start HB_UNUSED,
 	     hb_codepoint_t *unicode)
 {
-  *unicode = *--text;
-  return text;
+  hb_utf_next (text - 1, text, unicode);
+  return text - 1;
 }
 
 static inline unsigned int
diff --git a/test/api/test-buffer.c b/test/api/test-buffer.c
index 1956c92..1be6931 100644
--- a/test/api/test-buffer.c
+++ b/test/api/test-buffer.c
@@ -744,6 +744,60 @@ test_buffer_utf16_conversion (void)
   hb_buffer_destroy (b);
 }
 
+
+typedef struct {
+  const uint32_t utf32[8];
+  const uint32_t codepoints[8];
+} utf32_conversion_test_t;
+
+/* note: we skip the first and last item from utf32 when adding to buffer */
+static const utf32_conversion_test_t utf32_conversion_tests[] = {
+  {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -1, -1}},
+  {{0x41, 0x004D, 0x0430, 0x4E8C, 0x10302, 0x61} , {0x004D, 0x0430, 0x4E8C, 0x10302}},
+  {{0x41, 0xD800, 0xDF02, 0x61}, {-1, -1}},
+  {{0x41, 0xD800, 0xDF02}, {-1}},
+  {{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -1}},
+  {{0x41, 0xD800, 0x61, 0xDF02}, {-1, 0x61}},
+  {{0x41, 0xDF00, 0x61}, {-1}},
+  {{0x41, 0x10FFFF, 0x61}, {0x10FFFF}},
+  {{0x41, 0x110000, 0x61}, {-1}},
+  {{0x41, 0x61}, {0}}
+};
+
+static void
+test_buffer_utf32_conversion (void)
+{
+  hb_buffer_t *b;
+  unsigned int i;
+
+  b = hb_buffer_create ();
+
+  for (i = 0; i < G_N_ELEMENTS (utf32_conversion_tests); i++)
+  {
+    const utf32_conversion_test_t *test = &utf32_conversion_tests[i];
+    unsigned int u_len, chars, j, len;
+    hb_glyph_info_t *glyphs;
+
+    g_test_message ("UTF-32 test #%d", i);
+
+    for (u_len = 0; test->utf32[u_len]; u_len++)
+      ;
+    for (chars = 0; test->codepoints[chars]; chars++)
+      ;
+
+    hb_buffer_reset (b);
+    hb_buffer_add_utf32 (b, test->utf32, u_len,  1, u_len - 2);
+
+    glyphs = hb_buffer_get_glyph_infos (b, &len);
+    g_assert_cmpint (len, ==, chars);
+    for (j = 0; j < chars; j++)
+      g_assert_cmphex (glyphs[j].codepoint, ==, test->codepoints[j]);
+  }
+
+  hb_buffer_destroy (b);
+}
+
+
 static void
 test_empty (hb_buffer_t *b)
 {
@@ -810,6 +864,7 @@ main (int argc, char **argv)
   hb_test_add (test_buffer_utf8_conversion);
   hb_test_add (test_buffer_utf8_validity);
   hb_test_add (test_buffer_utf16_conversion);
+  hb_test_add (test_buffer_utf32_conversion);
   hb_test_add (test_buffer_empty);
 
   return hb_test_run();
commit b98c5db32d15fcfb27ce2f6737203ce1ad124319
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed Jul 16 13:44:01 2014 -0400

    Minor refactoring

diff --git a/src/hb-ot-layout-gsubgpos-private.hh b/src/hb-ot-layout-gsubgpos-private.hh
index 470c353..546ff4b 100644
--- a/src/hb-ot-layout-gsubgpos-private.hh
+++ b/src/hb-ot-layout-gsubgpos-private.hh
@@ -349,11 +349,7 @@ struct hb_apply_context_t
     may_skip (const hb_apply_context_t *c,
 	      const hb_glyph_info_t    &info) const
     {
-      unsigned int property;
-
-      property = _hb_glyph_info_get_glyph_props (&info);
-
-      if (!c->match_properties (info.codepoint, property, lookup_props))
+      if (!c->check_glyph_property (&info, lookup_props))
 	return SKIP_YES;
 
       if (unlikely (_hb_glyph_info_is_default_ignorable (&info) &&
@@ -537,10 +533,12 @@ struct hb_apply_context_t
   }
 
   inline bool
-  match_properties (hb_codepoint_t  glyph,
-		    unsigned int    glyph_props,
-		    unsigned int    lookup_props) const
+  check_glyph_property (const hb_glyph_info_t *info,
+			unsigned int  lookup_props) const
   {
+    hb_codepoint_t glyph = info->codepoint;
+    unsigned int glyph_props = _hb_glyph_info_get_glyph_props (info);
+
     /* Not covered, if, for example, glyph class is ligature and
      * lookup_props includes LookupFlags::IgnoreLigatures
      */
@@ -553,17 +551,6 @@ struct hb_apply_context_t
     return true;
   }
 
-  inline bool
-  check_glyph_property (hb_glyph_info_t *info,
-			unsigned int  lookup_props) const
-  {
-    unsigned int property;
-
-    property = _hb_glyph_info_get_glyph_props (info);
-
-    return match_properties (info->codepoint, property, lookup_props);
-  }
-
   inline void _set_glyph_props (hb_codepoint_t glyph_index,
 			  unsigned int class_guess = 0,
 			  bool ligature = false,


More information about the HarfBuzz mailing list