[HarfBuzz] harfbuzz: Branch 'master' - 11 commits

Tue Sep 25 18:36:56 PDT 2012

src/Makefile.am                   |    7 -
 src/hb-buffer-private.hh          |   13 ++
 src/hb-buffer.cc                  |  185 +++++++++++++---------------------
 src/hb-buffer.h                   |    2 
 src/hb-ot-shape-complex-arabic.cc |   31 +++++
 src/hb-ot-shape.cc                |    6 -
 src/hb-utf-private.hh             |  204 ++++++++++++++++++++++++++++++++++++++
 src/indic.cc                      |   51 ---------
 8 files changed, 323 insertions(+), 176 deletions(-)

New commits:
commit f2eb3fa9dca8d21fae20c9b9dddad3245df74a05
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 21:35:35 2012 -0400

    [OT] Only insert dottedcircle if at the beginning of paragraph
    
    If the first char in the run is a combining mark, but there is text
    before the run, don't insert dottedcircle.
    
    Part of addressing:
    https://bugzilla.redhat.com/show_bug.cgi?id=858736

diff --git a/src/hb-ot-shape.cc b/src/hb-ot-shape.cc
index 9e44c5d..3f72fd0 100644
--- a/src/hb-ot-shape.cc
+++ b/src/hb-ot-shape.cc
@@ -237,10 +237,8 @@ hb_set_unicode_props (hb_buffer_t *buffer)
 static void
 hb_insert_dotted_circle (hb_buffer_t *buffer, hb_font_t *font)
 {
-  /* TODO One day, when we keep _before_ text for the buffer, take
-   * that into consideration.  For now, insert dotted-circle if the
-   * very first character is a non-spacing mark. */
-  if (_hb_glyph_info_get_general_category (&buffer->info[0]) !=
+  if (buffer->context_len[0] ||
+      _hb_glyph_info_get_general_category (&buffer->info[0]) !=
       HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK)
     return;
 
commit bdc2fc8294da7f374701aafe9f5a82d60633946f
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 21:32:35 2012 -0400

    [Arabic] Respect Arabic joining from neighboring context
    
    Now we respect Arabic joining across runs.

diff --git a/src/hb-ot-shape-complex-arabic.cc b/src/hb-ot-shape-complex-arabic.cc
index 56949dd..5720891 100644
--- a/src/hb-ot-shape-complex-arabic.cc
+++ b/src/hb-ot-shape-complex-arabic.cc
@@ -229,10 +229,23 @@ static void
 arabic_joining (hb_buffer_t *buffer)
 {
   unsigned int count = buffer->len;
-  unsigned int prev = 0, state = 0;
+  unsigned int prev = (unsigned int) -1, state = 0;
 
   HB_BUFFER_ALLOCATE_VAR (buffer, arabic_shaping_action);
 
+  /* Check pre-context: seed the state machine from the first non-T character before the run. */
+  for (unsigned int i = 0; i < buffer->context_len[0]; i++)
+  {
+    unsigned int this_type = get_joining_type (buffer->context[0][i], buffer->unicode->general_category (buffer->context[0][i]));
+
+    if (unlikely (this_type == JOINING_TYPE_T))
+      continue; /* skip JOINING_TYPE_T entries and keep looking further out */
+
+    const arabic_state_table_entry *entry = &arabic_state_table[state][this_type];
+    state = entry->next_state; /* only the state is kept; no action is stored for context characters */
+    break; /* the first non-T context character decides; the rest are ignored */
+  }
+
   for (unsigned int i = 0; i < count; i++)
   {
     unsigned int this_type = get_joining_type (buffer->info[i].codepoint, _hb_glyph_info_get_general_category (&buffer->info[i]));
@@ -244,7 +257,7 @@ arabic_joining (hb_buffer_t *buffer)
 
     const arabic_state_table_entry *entry = &arabic_state_table[state][this_type];
 
-    if (entry->prev_action != NONE)
+    if (entry->prev_action != NONE && prev != (unsigned int) -1)
       buffer->info[prev].arabic_shaping_action() = entry->prev_action;
 
     buffer->info[i].arabic_shaping_action() = entry->curr_action;
@@ -253,6 +266,20 @@ arabic_joining (hb_buffer_t *buffer)
     state = entry->next_state;
   }
 
+  /* Check post-context: the first non-T character after the run may still set info[prev]'s action. */
+  for (unsigned int i = 0; i < buffer->context_len[1]; i++)
+  {
+    unsigned int this_type = get_joining_type (buffer->context[1][i], buffer->unicode->general_category (buffer->context[1][i]));
+
+    if (unlikely (this_type == JOINING_TYPE_T))
+      continue; /* skip JOINING_TYPE_T entries and keep looking further out */
+
+    const arabic_state_table_entry *entry = &arabic_state_table[state][this_type];
+    if (entry->prev_action != NONE && prev != (unsigned int) -1)
+      buffer->info[prev].arabic_shaping_action() = entry->prev_action;
+    break; /* the first non-T context character decides; the rest are ignored */
+  }
+
   HB_BUFFER_DEALLOCATE_VAR (buffer, arabic_shaping_action);
 }
 
commit 05207a79e0ae1769c5feaebe3fd99bdf9cfcf834
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 17:44:53 2012 -0400

    [buffer] Save pre/post textual context
    
    To be used for a variety of purposes.  We save up to five characters
    in each direction.  No public API changes, everything is taken care
    of already.  All clients need to do is to call hb_buffer_add_utf* with
    the full text + segment info (or at least some context) instead of
    just passing in the segment.
    
    Various operations (hb_buffer_reset, hb_buffer_set_length,
    hb_buffer_add*) automatically reset the relevant contexts.

diff --git a/src/hb-buffer-private.hh b/src/hb-buffer-private.hh
index f5d64f3..67a2752 100644
--- a/src/hb-buffer-private.hh
+++ b/src/hb-buffer-private.hh
@@ -1,7 +1,7 @@
 /*
  * Copyright © 1998-2004  David Turner and Werner Lemberg
  * Copyright © 2004,2007,2009,2010  Red Hat, Inc.
- * Copyright © 2011  Google, Inc.
+ * Copyright © 2011,2012  Google, Inc.
  *
  *  This is part of HarfBuzz, a text shaping library.
  *
@@ -117,9 +117,18 @@ struct hb_buffer_t {
   inline hb_glyph_info_t prev (void) const { return info[out_len - 1]; }
 
   unsigned int serial;
+
+  /* These reflect current allocations of the bytes in glyph_info_t's var1 and var2. */
   uint8_t allocated_var_bytes[8];
   const char *allocated_var_owner[8];
 
+  /* Text before / after the main buffer contents.
+   * Always in Unicode, and ordered outward.
+   * Index 0 is for "pre-context", 1 for "post-context". */
+  static const unsigned int CONTEXT_LENGTH = 5;
+  hb_codepoint_t context[2][CONTEXT_LENGTH];
+  unsigned int context_len[2];
+
 
   /* Methods */
 
@@ -206,6 +215,8 @@ struct hb_buffer_t {
   HB_INTERNAL bool make_room_for (unsigned int num_in, unsigned int num_out);
 
   HB_INTERNAL void *get_scratch_buffer (unsigned int *size);
+
+  inline void clear_context (unsigned int side) { context_len[side] = 0; } /* side: 0 = pre-context, 1 = post-context */
 };
 
 
diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc
index 6da196f..2f8f511 100644
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@@ -1,7 +1,7 @@
 /*
  * Copyright © 1998-2004  David Turner and Werner Lemberg
  * Copyright © 2004,2007,2009,2010  Red Hat, Inc.
- * Copyright © 2011  Google, Inc.
+ * Copyright © 2011,2012  Google, Inc.
  *
  *  This is part of HarfBuzz, a text shaping library.
  *
@@ -158,6 +158,9 @@ hb_buffer_t::reset (void)
   serial = 0;
   memset (allocated_var_bytes, 0, sizeof allocated_var_bytes);
   memset (allocated_var_owner, 0, sizeof allocated_var_owner);
+
+  memset (context, 0, sizeof context);
+  memset (context_len, 0, sizeof context_len);
 }
 
 void
@@ -570,6 +573,8 @@ hb_buffer_get_empty (void)
     true, /* in_error */
     true, /* have_output */
     true  /* have_positions */
+
+    /* Zero is good enough for everything else. */
   };
 
   return const_cast<hb_buffer_t *> (&_hb_buffer_nil);
@@ -723,6 +728,7 @@ hb_buffer_add (hb_buffer_t    *buffer,
 	       unsigned int    cluster)
 {
   buffer->add (codepoint, mask, cluster);
+  buffer->clear_context (1);
 }
 
 hb_bool_t
@@ -743,6 +749,11 @@ hb_buffer_set_length (hb_buffer_t  *buffer,
   }
 
   buffer->len = length;
+
+  if (!length)
+    buffer->clear_context (0);
+  buffer->clear_context (1);
+
   return true;
 }
 
@@ -817,13 +828,38 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
 
   buffer->ensure (buffer->len + item_length * sizeof (T) / 4);
 
-  const T *next = (const T *) text + item_offset;
+  if (!buffer->len)
+  {
+    /* Add pre-context */
+    buffer->clear_context (0);
+    const T *prev = text + item_offset;
+    const T *start = text;
+    while (start < prev && buffer->context_len[0] < buffer->CONTEXT_LENGTH)
+    {
+      hb_codepoint_t u;
+      prev = hb_utf_prev (prev, start, &u);
+      buffer->context[0][buffer->context_len[0]++] = u;
+    }
+  }
+
+  const T *next = text + item_offset;
   const T *end = next + item_length;
-  while (next < end) {
+  while (next < end)
+  {
     hb_codepoint_t u;
     const T *old_next = next;
     next = hb_utf_next (next, end, &u);
-    hb_buffer_add (buffer, u, 1,  old_next - (const T *) text);
+    buffer->add (u, 1,  old_next - (const T *) text);
+  }
+
+  /* Add post-context */
+  buffer->clear_context (1);
+  end = text + text_length;
+  while (next < end && buffer->context_len[1] < buffer->CONTEXT_LENGTH)
+  {
+    hb_codepoint_t u;
+    next = hb_utf_next (next, end, &u);
+    buffer->context[1][buffer->context_len[1]++] = u;
   }
 
   buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
diff --git a/src/hb-buffer.h b/src/hb-buffer.h
index d89dce3..dc63d1b 100644
--- a/src/hb-buffer.h
+++ b/src/hb-buffer.h
@@ -1,7 +1,7 @@
 /*
  * Copyright © 1998-2004  David Turner and Werner Lemberg
  * Copyright © 2004,2007,2009  Red Hat, Inc.
- * Copyright © 2011  Google, Inc.
+ * Copyright © 2011,2012  Google, Inc.
  *
  *  This is part of HarfBuzz, a text shaping library.
  *
commit 89ac39dbbe028e6379f64392f2e590e3f1fdd847
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 13:59:24 2012 -0400

    Add hb_utf_prev()

diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh
index a77139f..8cde827 100644
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@@ -72,6 +72,39 @@ hb_utf_next (const uint8_t *text,
   }
 }
 
+static inline const uint8_t *
+hb_utf_prev (const uint8_t *text,
+	     const uint8_t *start,
+	     hb_codepoint_t *unicode)
+{ /* Decode the character ending just before text; requires start < text. Returns its first byte. */
+  const uint8_t *end = text--; /* text now sits on the last byte of that character */
+  while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
+    text--; /* exactly one step per continuation byte, so a lead byte is never skipped */
+
+  hb_codepoint_t c = *text, mask;
+  unsigned int len;
+
+  /* TODO check for overlong sequences? */
+
+  HB_UTF8_COMPUTE (c, mask, len);
+  if (unlikely (!len || (unsigned int) (end - text) != len)) {
+    *unicode = -1; /* no lead byte, or its length disagrees with the bytes walked over */
+    return end - 1; /* consume a single byte only, mirroring hb_utf_next's error step */
+  } else {
+    hb_codepoint_t result;
+    unsigned int i;
+    result = c & mask;
+    for (i = 1; i < len; i++)
+      {
+	result <<= 6;
+	result |= (text[i] & 0x3f);
+      }
+    *unicode = result;
+    return text;
+  }
+}
+
+
 static inline unsigned int
 hb_utf_strlen (const uint8_t *text)
 {
@@ -105,6 +138,31 @@ hb_utf_next (const uint16_t *text,
   return text;
 }
 
+static inline const uint16_t *
+hb_utf_prev (const uint16_t *text,
+	     const uint16_t *start,
+	     hb_codepoint_t *unicode)
+{
+  hb_codepoint_t c = *--text; /* unconditional step back: text must be past start on entry */
+
+  if (unlikely (hb_in_range<hb_codepoint_t> (c, 0xdc00, 0xdfff)))
+  {
+    /* low surrogate: look one unit further back for its high surrogate */
+    hb_codepoint_t h;
+    if (start < text && ((h = *(text - 1)), likely (hb_in_range<hb_codepoint_t> (h, 0xd800, 0xdbff))))
+    {
+      /* high surrogate found: combine the pair and consume both units */
+      *unicode = (h << 10) + c - ((0xd800 << 10) - 0x10000 + 0xdc00);
+       text--;
+    } else
+      *unicode = -1; /* unpaired low surrogate; only that one unit is consumed */
+  } else
+    *unicode = c; /* also taken for a lone high surrogate, which is returned unchanged */
+
+  return text;
+}
+
+
 static inline unsigned int
 hb_utf_strlen (const uint16_t *text)
 {
@@ -121,8 +179,17 @@ hb_utf_next (const uint32_t *text,
 	     const uint32_t *end,
 	     hb_codepoint_t *unicode)
 {
-  *unicode = *text;
-  return text + 1;
+  *unicode = *text++;
+  return text;
+}
+
+static inline const uint32_t *
+hb_utf_prev (const uint32_t *text,
+	     const uint32_t *start,
+	     hb_codepoint_t *unicode)
+{
+  *unicode = *--text; /* one unit is one code point; start is not consulted, so text must be past it */
+  return text; /* points at the unit that was just read */
+}
 
 static inline unsigned int
commit 70ea4ac6887c7057113d714a98e55738b6196562
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 12:30:16 2012 -0400

    Slightly optimize UTF-8 parsing

diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh
index f89aa23..a77139f 100644
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@@ -44,8 +44,8 @@ hb_utf_next (const uint8_t *text,
 	     const uint8_t *end,
 	     hb_codepoint_t *unicode)
 {
-  uint8_t c = *text;
-  unsigned int mask, len;
+  hb_codepoint_t c = *text, mask;
+  unsigned int len;
 
   /* TODO check for overlong sequences? */
 
commit 4445e5e2ecc257a5d0fa2f2715bb7181a47346da
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 12:26:12 2012 -0400

    [buffer] Cleanup / optimize UTF-16 parsing a bit

diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh
index 2224a03..f89aa23 100644
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@@ -86,14 +86,16 @@ hb_utf_next (const uint16_t *text,
 	     const uint16_t *end,
 	     hb_codepoint_t *unicode)
 {
-  uint16_t c = *text++;
+  hb_codepoint_t c = *text++;
 
-  if (unlikely (c >= 0xd800 && c < 0xdc00)) {
+  if (unlikely (hb_in_range<hb_codepoint_t> (c, 0xd800, 0xdbff)))
+  {
     /* high surrogate */
-    uint16_t l;
-    if (text < end && ((l = *text), likely (l >= 0xdc00 && l < 0xe000))) {
+    hb_codepoint_t l;
+    if (text < end && ((l = *text), likely (hb_in_range<hb_codepoint_t> (l, 0xdc00, 0xdfff))))
+    {
       /* low surrogate */
-      *unicode = ((hb_codepoint_t) ((c) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000);
+      *unicode = (c << 10) + l - ((0xd800 << 10) - 0x10000 + 0xdc00);
        text++;
     } else
       *unicode = -1;
commit 1f66c3c1a0eb869c0d85a015235313177e0cec62
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 11:42:16 2012 -0400

    Add hb_utf_strlen()
    
    Speeds up UTF-8 parsing by calling strlen().

diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc
index f84511d..6da196f 100644
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@@ -30,9 +30,6 @@
 #include "hb-buffer-private.hh"
 #include "hb-utf-private.hh"
 
-#include <string.h>
-
-
 
 #ifndef HB_DEBUG_BUFFER
 #define HB_DEBUG_BUFFER (HB_DEBUG+0)
@@ -812,14 +809,8 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
   if (unlikely (hb_object_is_inert (buffer)))
     return;
 
-  if (text_length == -1) {
-    text_length = 0;
-    const T *p = (const T *) text;
-    while (*p) {
-      text_length++;
-      p++;
-    }
-  }
+  if (text_length == -1)
+    text_length = hb_utf_strlen (text);
 
   if (item_length == -1)
     item_length = text_length - item_offset;
diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh
index 829ca50..2224a03 100644
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@@ -72,6 +72,12 @@ hb_utf_next (const uint8_t *text,
   }
 }
 
+static inline unsigned int
+hb_utf_strlen (const uint8_t *text)
+{
+  return strlen ((const char *) text);
+}
+
 
 /* UTF-16 */
 
@@ -97,6 +103,14 @@ hb_utf_next (const uint16_t *text,
   return text;
 }
 
+static inline unsigned int
+hb_utf_strlen (const uint16_t *text)
+{
+  unsigned int l = 0;
+  while (*text++) l++;
+  return l;
+}
+
 
 /* UTF-32 */
 
@@ -109,5 +123,13 @@ hb_utf_next (const uint32_t *text,
   return text + 1;
 }
 
+static inline unsigned int
+hb_utf_strlen (const uint32_t *text)
+{
+  unsigned int l = 0;
+  while (*text++) l++;
+  return l;
+}
+
 
 #endif /* HB_UTF_PRIVATE_HH */
commit 7f19ae7b9f806a2e35206b2ad41651c5f80b2537
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 11:22:28 2012 -0400

    [buffer] Templatize UTF handling
    
    Also move UTF routines into a separate file, to be reused from shapers
    that need it.

diff --git a/src/Makefile.am b/src/Makefile.am
index 4aae7ec..d1a94cd 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -51,6 +51,7 @@ HBSOURCES =  \
 	hb-tt-font.cc \
 	hb-unicode-private.hh \
 	hb-unicode.cc \
+	hb-utf-private.hh \
 	hb-warning.cc \
 	$(NULL)
 HBHEADERS = \
diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc
index 5471634..f84511d 100644
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@@ -28,6 +28,7 @@
  */
 
 #include "hb-buffer-private.hh"
+#include "hb-utf-private.hh"
 
 #include <string.h>
 
@@ -797,68 +798,44 @@ hb_buffer_guess_properties (hb_buffer_t *buffer)
   buffer->guess_properties ();
 }
 
-#define ADD_UTF(T) \
-	HB_STMT_START { \
-	  if (text_length == -1) { \
-	    text_length = 0; \
-	    const T *p = (const T *) text; \
-	    while (*p) { \
-	      text_length++; \
-	      p++; \
-	    } \
-	  } \
-	  if (item_length == -1) \
-	    item_length = text_length - item_offset; \
-	  buffer->ensure (buffer->len + item_length * sizeof (T) / 4); \
-	  const T *next = (const T *) text + item_offset; \
-	  const T *end = next + item_length; \
-	  while (next < end) { \
-	    hb_codepoint_t u; \
-	    const T *old_next = next; \
-	    next = UTF_NEXT (next, end, u); \
-	    hb_buffer_add (buffer, u, 1,  old_next - (const T *) text); \
-	  } \
-	} HB_STMT_END
-
-
-#define UTF8_COMPUTE(Char, Mask, Len) \
-  if (Char < 128) { Len = 1; Mask = 0x7f; } \
-  else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \
-  else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \
-  else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \
-  else Len = 0;
-
-static inline const uint8_t *
-hb_utf8_next (const uint8_t *text,
-	      const uint8_t *end,
-	      hb_codepoint_t *unicode)
-{
-  uint8_t c = *text;
-  unsigned int mask, len;
-
-  /* TODO check for overlong sequences? */
-
-  UTF8_COMPUTE (c, mask, len);
-  if (unlikely (!len || (unsigned int) (end - text) < len)) {
-    *unicode = -1;
-    return text + 1;
-  } else {
-    hb_codepoint_t result;
-    unsigned int i;
-    result = c & mask;
-    for (i = 1; i < len; i++)
-      {
-	if (unlikely ((text[i] & 0xc0) != 0x80))
-	  {
-	    *unicode = -1;
-	    return text + 1;
-	  }
-	result <<= 6;
-	result |= (text[i] & 0x3f);
-      }
-    *unicode = result;
-    return text + len;
+template <typename T>
+static inline void
+hb_buffer_add_utf (hb_buffer_t  *buffer,
+		   const T      *text,
+		   int           text_length,
+		   unsigned int  item_offset,
+		   int           item_length)
+{
+  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
+	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
+
+  if (unlikely (hb_object_is_inert (buffer)))
+    return;
+
+  if (text_length == -1) {
+    text_length = 0;
+    const T *p = (const T *) text;
+    while (*p) {
+      text_length++;
+      p++;
+    }
+  }
+
+  if (item_length == -1)
+    item_length = text_length - item_offset;
+
+  buffer->ensure (buffer->len + item_length * sizeof (T) / 4);
+
+  const T *next = (const T *) text + item_offset;
+  const T *end = next + item_length;
+  while (next < end) {
+    hb_codepoint_t u;
+    const T *old_next = next;
+    next = hb_utf_next (next, end, &u);
+    hb_buffer_add (buffer, u, 1,  old_next - (const T *) text);
   }
+
+  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
 }
 
 void
@@ -868,36 +845,7 @@ hb_buffer_add_utf8 (hb_buffer_t  *buffer,
 		    unsigned int  item_offset,
 		    int           item_length)
 {
-  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
-	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
-  if (unlikely (hb_object_is_inert (buffer)))
-    return;
-  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
-#define UTF_NEXT(S, E, U)	hb_utf8_next (S, E, &(U))
-  ADD_UTF (uint8_t);
-#undef UTF_NEXT
-}
-
-static inline const uint16_t *
-hb_utf16_next (const uint16_t *text,
-	       const uint16_t *end,
-	       hb_codepoint_t *unicode)
-{
-  uint16_t c = *text++;
-
-  if (unlikely (c >= 0xd800 && c < 0xdc00)) {
-    /* high surrogate */
-    uint16_t l;
-    if (text < end && ((l = *text), likely (l >= 0xdc00 && l < 0xe000))) {
-      /* low surrogate */
-      *unicode = ((hb_codepoint_t) ((c) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000);
-       text++;
-    } else
-      *unicode = -1;
-  } else
-    *unicode = c;
-
-  return text;
+  hb_buffer_add_utf (buffer, (const uint8_t *) text, text_length, item_offset, item_length);
 }
 
 void
@@ -907,23 +855,7 @@ hb_buffer_add_utf16 (hb_buffer_t    *buffer,
 		     unsigned int    item_offset,
 		     int            item_length)
 {
-  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
-	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
-  if (unlikely (hb_object_is_inert (buffer)))
-    return;
-  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
-#define UTF_NEXT(S, E, U)	hb_utf16_next (S, E, &(U))
-  ADD_UTF (uint16_t);
-#undef UTF_NEXT
-}
-
-static inline const uint32_t *
-hb_utf32_next (const uint32_t *text,
-	       const uint32_t *end,
-	       hb_codepoint_t *unicode)
-{
-  *unicode = *text;
-  return text + 1;
+  hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length);
 }
 
 void
@@ -933,14 +865,7 @@ hb_buffer_add_utf32 (hb_buffer_t    *buffer,
 		     unsigned int    item_offset,
 		     int             item_length)
 {
-  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
-	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
-  if (unlikely (hb_object_is_inert (buffer)))
-    return;
-  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
-#define UTF_NEXT(S, E, U)	hb_utf32_next (S, E, &(U))
-  ADD_UTF (uint32_t);
-#undef UTF_NEXT
+  hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length);
 }
 
 
diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh
new file mode 100644
index 0000000..829ca50
--- /dev/null
+++ b/src/hb-utf-private.hh
@@ -0,0 +1,113 @@
+/*
+ * Copyright © 2011,2012  Google, Inc.
+ *
+ *  This is part of HarfBuzz, a text shaping library.
+ *
+ * Permission is hereby granted, without written agreement and without
+ * license or royalty fees, to use, copy, modify, and distribute this
+ * software and its documentation for any purpose, provided that the
+ * above copyright notice and the following two paragraphs appear in
+ * all copies of this software.
+ *
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
+ * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
+ * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
+ * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
+ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+ *
+ * Google Author(s): Behdad Esfahbod
+ */
+
+#ifndef HB_UTF_PRIVATE_HH
+#define HB_UTF_PRIVATE_HH
+
+#include "hb-private.hh"
+
+
+/* UTF-8 */
+
+#define HB_UTF8_COMPUTE(Char, Mask, Len) \
+  if (Char < 128) { Len = 1; Mask = 0x7f; } \
+  else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \
+  else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \
+  else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \
+  else Len = 0;
+
+static inline const uint8_t *
+hb_utf_next (const uint8_t *text,
+	     const uint8_t *end,
+	     hb_codepoint_t *unicode)
+{
+  uint8_t c = *text;
+  unsigned int mask, len;
+
+  /* TODO check for overlong sequences? */
+
+  HB_UTF8_COMPUTE (c, mask, len);
+  if (unlikely (!len || (unsigned int) (end - text) < len)) {
+    *unicode = -1;
+    return text + 1;
+  } else {
+    hb_codepoint_t result;
+    unsigned int i;
+    result = c & mask;
+    for (i = 1; i < len; i++)
+      {
+	if (unlikely ((text[i] & 0xc0) != 0x80))
+	  {
+	    *unicode = -1;
+	    return text + 1;
+	  }
+	result <<= 6;
+	result |= (text[i] & 0x3f);
+      }
+    *unicode = result;
+    return text + len;
+  }
+}
+
+
+/* UTF-16 */
+
+static inline const uint16_t *
+hb_utf_next (const uint16_t *text,
+	     const uint16_t *end,
+	     hb_codepoint_t *unicode)
+{
+  uint16_t c = *text++;
+
+  if (unlikely (c >= 0xd800 && c < 0xdc00)) {
+    /* high surrogate */
+    uint16_t l;
+    if (text < end && ((l = *text), likely (l >= 0xdc00 && l < 0xe000))) {
+      /* low surrogate */
+      *unicode = ((hb_codepoint_t) ((c) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000);
+       text++;
+    } else
+      *unicode = -1;
+  } else
+    *unicode = c;
+
+  return text;
+}
+
+
+/* UTF-32 */
+
+static inline const uint32_t *
+hb_utf_next (const uint32_t *text,
+	     const uint32_t *end,
+	     hb_codepoint_t *unicode)
+{
+  *unicode = *text;
+  return text + 1;
+}
+
+
+#endif /* HB_UTF_PRIVATE_HH */
commit 0e0a4da9b7677a09e00d27313236e1f333864dd6
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 11:09:04 2012 -0400

    [buffer] Towards template'izing different UTF adders

diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc
index 6c0f349..5471634 100644
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@@ -917,6 +917,15 @@ hb_buffer_add_utf16 (hb_buffer_t    *buffer,
 #undef UTF_NEXT
 }
 
+static inline const uint32_t *
+hb_utf32_next (const uint32_t *text,
+	       const uint32_t *end,
+	       hb_codepoint_t *unicode)
+{
+  *unicode = *text;
+  return text + 1;
+}
+
 void
 hb_buffer_add_utf32 (hb_buffer_t    *buffer,
 		     const uint32_t *text,
@@ -929,7 +938,7 @@ hb_buffer_add_utf32 (hb_buffer_t    *buffer,
   if (unlikely (hb_object_is_inert (buffer)))
     return;
   buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
-#define UTF_NEXT(S, E, U)	((U) = *(S), (S)+1)
+#define UTF_NEXT(S, E, U)	hb_utf32_next (S, E, &(U))
   ADD_UTF (uint32_t);
 #undef UTF_NEXT
 }
commit 7d37280600c157f3c4eae8746e99511702a58e8f
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 11:04:41 2012 -0400

    Minor

diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc
index fec9225..6c0f349 100644
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@@ -994,7 +994,7 @@ void
 hb_buffer_normalize_glyphs (hb_buffer_t *buffer)
 {
   assert (buffer->have_positions);
-  /* XXX assert (buffer->have_glyphs); */
+  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_GLYPHS);
 
   bool backward = HB_DIRECTION_IS_BACKWARD (buffer->props.direction);
 
commit 54d5da4ee9d902ff36473cec558137eef8f23825
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Tue Sep 25 10:50:41 2012 -0400

    Remove unused indic.cc

diff --git a/src/Makefile.am b/src/Makefile.am
index 3bf3efc..4aae7ec 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -233,17 +233,13 @@ hb-ot-shape-complex-indic-machine.hh: hb-ot-shape-complex-indic-machine.rl
 	$(AM_V_GEN)$(top_srcdir)/missing --run ragel -e -F1 -o "$@.tmp" "$<" && \
 	mv "$@.tmp" "$@" || ( $(RM) "$@.tmp" && false )
 
-noinst_PROGRAMS = main indic test-would-substitute
+noinst_PROGRAMS = main test-would-substitute
 bin_PROGRAMS =
 
 main_SOURCES = main.cc
 main_CPPFLAGS = $(HBCFLAGS)
 main_LDADD = libharfbuzz.la $(HBLIBS)
 
-indic_SOURCES = indic.cc
-indic_CPPFLAGS = $(HBCFLAGS)
-indic_LDADD = libharfbuzz.la $(HBLIBS)
-
 test_would_substitute_SOURCES = test-would-substitute.cc
 test_would_substitute_CPPFLAGS = $(HBCFLAGS) $(FREETYPE_CFLAGS)
 test_would_substitute_LDADD = libharfbuzz.la $(HBLIBS) $(FREETYPE_LIBS)
diff --git a/src/indic.cc b/src/indic.cc
deleted file mode 100644
index 991a772..0000000
--- a/src/indic.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright © 2012  Google, Inc.
- *
- *  This is part of HarfBuzz, a text shaping library.
- *
- * Permission is hereby granted, without written agreement and without
- * license or royalty fees, to use, copy, modify, and distribute this
- * software and its documentation for any purpose, provided that the
- * above copyright notice and the following two paragraphs appear in
- * all copies of this software.
- *
- * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
- * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
- * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
- * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
- * DAMAGE.
- *
- * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
- * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
- * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
- * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
- *
- * Google Author(s): Behdad Esfahbod
- */
-
-#include "hb-ot-shape-complex-indic-private.hh"
-
-int
-main (void)
-{
-  hb_unicode_funcs_t *funcs = hb_unicode_funcs_get_default ();
-
-  printf ("There are split matras without a Unicode decomposition:\n");
-  for (hb_codepoint_t u = 0; u < 0x110000; u++)
-  {
-    unsigned int type = get_indic_categories (u);
-
-    unsigned int category = type & 0x0F;
-    unsigned int position = type >> 4;
-
-    hb_unicode_general_category_t cat = hb_unicode_general_category (funcs, u);
-    unsigned int ccc = hb_unicode_combining_class (funcs, u);
-    if (category == OT_M && ccc)
-      printf ("U+%04X %d\n", u, ccc);
-
-//    hb_codepoint_t a, b;
-//    if (!hb_unicode_decompose (funcs, u, &a, &b))
-//      printf ("U+%04X %x %x\n", u, category, position);
-  }
-}