[HarfBuzz] harfbuzz-ng: Branch 'master' - 5 commits

Behdad Esfahbod behdad at kemper.freedesktop.org
Thu Jul 21 17:59:12 PDT 2011


 src/hb-glib.cc                     |   10 ++--
 src/hb-icu.cc                      |   20 +++++---
 src/hb-ot-shape-complex-arabic.cc  |    6 ++
 src/hb-ot-shape-complex-indic.cc   |    7 ++
 src/hb-ot-shape-complex-misc.cc    |    6 ++
 src/hb-ot-shape-complex-private.hh |   31 +++++++++++-
 src/hb-ot-shape-normalize.cc       |   92 ++++++++++++++++++++++++++++++-------
 src/hb-ot-shape-private.hh         |    4 -
 src/hb-ot-shape.cc                 |    6 +-
 test/test-unicode.c                |   11 +++-
 10 files changed, 154 insertions(+), 39 deletions(-)

New commits:
commit 63c0ef4a0763e579c9c80887bbfbd2651de05067
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Thu Jul 21 20:58:42 2011 -0400

    Fix decompose() implementations to work with non-starter non-composables
    
    Add tests.

diff --git a/src/hb-glib.cc b/src/hb-glib.cc
index fbf8cf5..76e1dfd 100644
--- a/src/hb-glib.cc
+++ b/src/hb-glib.cc
@@ -296,16 +296,16 @@ hb_glib_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
     *b = 0;
     ret = *a != ab;
   } else if (len == 2) {
+    *a = g_utf8_get_char (normalized);
+    *b = g_utf8_get_char (g_utf8_next_char (normalized));
     /* Here's the ugly part: if ab decomposes to a single character and
      * that character decomposes again, we have to detect that and undo
      * the second part :-(. */
     gchar *recomposed = g_utf8_normalize (normalized, -1, G_NORMALIZE_NFC);
-    if (g_utf8_get_char (recomposed) != ab) {
-      *a = g_utf8_get_char (recomposed);
+    hb_codepoint_t c = g_utf8_get_char (recomposed);
+    if (c != ab && c != *a) {
+      *a = c;
       *b = 0;
-    } else {
-      *a = g_utf8_get_char (normalized);
-      *b = g_utf8_get_char (g_utf8_next_char (normalized));
     }
     g_free (recomposed);
     ret = TRUE;
diff --git a/src/hb-icu.cc b/src/hb-icu.cc
index 7b85cd5..7fe78d2 100644
--- a/src/hb-icu.cc
+++ b/src/hb-icu.cc
@@ -214,6 +214,10 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
   hb_bool_t ret, err;
   UErrorCode icu_err;
 
+  /* This function is a monster! Maybe it wasn't a good idea adding a
+   * pairwise decompose API... */
+  /* Watch out for the dragons.  Err, watch out for macros changing len. */
+
   len = 0;
   err = FALSE;
   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
@@ -232,21 +236,23 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
     *b = 0;
     ret = *a != ab;
   } else if (len == 2) {
+    len =0;
+    U16_NEXT_UNSAFE (normalized, len, *a);
+    U16_NEXT_UNSAFE (normalized, len, *b);
+
     /* Here's the ugly part: if ab decomposes to a single character and
      * that character decomposes again, we have to detect that and undo
      * the second part :-(. */
     UChar recomposed[20];
     icu_err = U_ZERO_ERROR;
-    len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
+    unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
     if (icu_err)
       return FALSE;
-    U16_GET_UNSAFE (recomposed, 0, *a);
-    if (*a != ab) {
+    hb_codepoint_t c;
+    U16_GET_UNSAFE (recomposed, 0, c);
+    if (c != *a && c != ab) {
+      *a = c;
       *b = 0;
-    } else {
-      len =0;
-      U16_NEXT_UNSAFE (normalized, len, *a);
-      U16_GET_UNSAFE (normalized, len, *b);
     }
     ret = TRUE;
   } else {
diff --git a/test/test-unicode.c b/test/test-unicode.c
index c614c7d..9f526d7 100644
--- a/test/test-unicode.c
+++ b/test/test-unicode.c
@@ -800,6 +800,10 @@ test_unicode_normalization (gconstpointer user_data)
   g_assert (!hb_unicode_compose (uf, 0x2126, 0, &ab) && ab == 0);
   g_assert (!hb_unicode_compose (uf, 0x03A9, 0, &ab) && ab == 0);
 
+  /* Non-starter pairs should not compose */
+  g_assert (!hb_unicode_compose (uf, 0x0308, 0x0301, &ab) && ab == 0); /* !0x0344 */
+  g_assert (!hb_unicode_compose (uf, 0x0F71, 0x0F72, &ab) && ab == 0); /* !0x0F73 */
+
   /* Pairs */
   g_assert (hb_unicode_compose (uf, 0x0041, 0x030A, &ab) && ab == 0x00C5);
   g_assert (hb_unicode_compose (uf, 0x006F, 0x0302, &ab) && ab == 0x00F4);
@@ -822,12 +826,13 @@ test_unicode_normalization (gconstpointer user_data)
   g_assert (!hb_unicode_decompose (uf, 0xFB01, &a, &b) && a == 0xFB01 && b == 0);
 
   /* Singletons */
-  g_assert (hb_unicode_decompose (uf, 0x212B, &a, &b));
-  g_assert_cmphex (a, ==, 0x00C5);
-  g_assert_cmphex (b, ==, 0);
   g_assert (hb_unicode_decompose (uf, 0x212B, &a, &b) && a == 0x00C5 && b == 0);
   g_assert (hb_unicode_decompose (uf, 0x2126, &a, &b) && a == 0x03A9 && b == 0);
 
+  /* Non-starter pairs decompose, but not compose */
+  g_assert (hb_unicode_decompose (uf, 0x0344, &a, &b) && a == 0x0308 && b == 0x0301);
+  g_assert (hb_unicode_decompose (uf, 0x0F73, &a, &b) && a == 0x0F71 && b == 0x0F72);
+
   /* Pairs */
   g_assert (hb_unicode_decompose (uf, 0x00C5, &a, &b) && a == 0x0041 && b == 0x030A);
   g_assert (hb_unicode_decompose (uf, 0x00F4, &a, &b) && a == 0x006F && b == 0x0302);
commit 5d90a342e319068716429bf7af76c3896b61a0e5
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Thu Jul 21 15:25:01 2011 -0400

    Document normalization design

diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc
index a791e7c..6832779 100644
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@@ -25,20 +25,55 @@
  */
 
 #include "hb-ot-shape-private.hh"
+#include "hb-ot-shape-complex-private.hh"
 
 HB_BEGIN_DECLS
 
+/*
+ * HIGHLEVEL DESIGN:
+ *
+ * This file exports one main function: _hb_ot_shape_normalize().
+ *
+ * This function closely reflects the Unicode Normalization Algorithm,
+ * yet it's different.  The shaper can either prefer decomposed (NFD) or
+ * composed (NFC).
+ *
+ * In general what happens is that: each grapheme is decomposed in a chain
+ * of 1:2 decompositions, marks reordered, and then recomposed if desired,
+ * so far it's like Unicode Normalization.  However, the decomposition and
+ * recomposition only happens if the font supports the resulting characters.
+ *
+ * The goals are:
+ *
+ *   - Try to render all canonically equivalent strings similarly.  To really
+ *     achieve this we have to always do the full decomposition and then
+ *     selectively recompose from there.  It's kinda too expensive though, so
+ *     we skip some cases.  For example, if composed is desired, we simply
+ *     don't touch 1-character clusters that are supported by the font, even
+ *     though their NFC may be different.
+ *
+ *   - When a font has a precomposed character for a sequence but the 'ccmp'
+ *     feature in the font is not adequate, use the precomposed character
+ *     which typically has better mark positioning.
+ *
+ *   - When a font does not support a character but supports its decomposition,
+ *     well, use the decomposition.
+ *
+ *   - The Indic shaper requests decomposed output.  This will handle splitting
+ *     matra for the Indic shaper.
+ */
+
 static bool
 get_glyph (hb_ot_shape_context_t *c, unsigned int i)
 {
-  hb_buffer_t *b = c->buffer;
   hb_codepoint_t glyph;
 
-  return hb_font_get_glyph (c->font, b->info[i].codepoint, 0, &glyph);
+  return hb_font_get_glyph (c->font, c->buffer->info[i].codepoint, 0, &glyph);
 }
 
 static bool
 decompose_single_char_cluster (hb_ot_shape_context_t *c,
+			       bool recompose,
 			       unsigned int i)
 {
   return FALSE;
@@ -46,22 +81,24 @@ decompose_single_char_cluster (hb_ot_shape_context_t *c,
 
 static bool
 handle_single_char_cluster (hb_ot_shape_context_t *c,
+			    bool recompose,
 			    unsigned int i)
 {
-  /* If the single char is supported by the font, we're good. */
-  if (get_glyph (c, i))
+  /* If recomposing and the single char is supported by the font, we're good. */
+  if (recompose && get_glyph (c, i))
     return FALSE;
 
   /* Decompose */
-  return decompose_single_char_cluster (c, i);
+  return decompose_single_char_cluster (c, recompose, i);
 }
 
 static bool
 handle_multi_char_cluster (hb_ot_shape_context_t *c,
+			   bool recompose,
 			   unsigned int start,
 			   unsigned int end)
 {
-  /* If there's a variation-selector, give-up, it's just too hard. */
+  /* TODO Currently if there's a variation-selector we give-up, it's just too hard. */
   for (unsigned int i = start; i < end; i++)
     if (unlikely (is_variation_selector (c->buffer->info[i].codepoint)))
       return FALSE;
@@ -70,24 +107,33 @@ handle_multi_char_cluster (hb_ot_shape_context_t *c,
 }
 
 bool
-_hb_normalize (hb_ot_shape_context_t *c)
+_hb_ot_shape_normalize (hb_ot_shape_context_t *c)
 {
-  hb_buffer_t *b = c->buffer;
+  hb_buffer_t *buffer = c->buffer;
   bool changed = FALSE;
+  bool recompose = !hb_ot_shape_complex_prefer_decomposed (c->plan->shaper);
+
+  buffer->clear_output ();
+
+  unsigned int count = buffer->len;
+  for (buffer->i = 0; buffer->i < count;)
+  {
 
-  unsigned int count = b->len;
-  for (unsigned int i = 0; i < count;) {
     unsigned int end;
-    for (end = i + 1; end < count; end++)
-      if (b->info[i].cluster != b->info[end].cluster)
+    for (end = buffer->i + 1; end < count; end++)
+      if (buffer->info[buffer->i].cluster != buffer->info[end].cluster)
         break;
-    if (i + 1 == end)
-      changed |= handle_single_char_cluster (c, i);
+
+    if (buffer->i + 1 == end)
+      changed |= handle_single_char_cluster (c, recompose, buffer->i);
     else
-      changed |= handle_multi_char_cluster (c, i, end);
-    i = end;
+      changed |= handle_multi_char_cluster (c, recompose, buffer->i, end);
+    while (buffer->i < end)
+      c->buffer->next_glyph ();
   }
 
+  buffer->swap ();
+
   return changed;
 }
 
diff --git a/src/hb-ot-shape-private.hh b/src/hb-ot-shape-private.hh
index 96c436d..17b3c99 100644
--- a/src/hb-ot-shape-private.hh
+++ b/src/hb-ot-shape-private.hh
@@ -100,7 +100,7 @@ is_variation_selector (hb_codepoint_t unicode)
 }
 
 
-HB_INTERNAL bool _hb_normalize (hb_ot_shape_context_t *c);
+HB_INTERNAL bool _hb_ot_shape_normalize (hb_ot_shape_context_t *c);
 
 HB_END_DECLS
 
diff --git a/src/hb-ot-shape.cc b/src/hb-ot-shape.cc
index bffd075..d1c495f 100644
--- a/src/hb-ot-shape.cc
+++ b/src/hb-ot-shape.cc
@@ -254,11 +254,13 @@ static void
 hb_map_glyphs (hb_font_t    *font,
 	       hb_buffer_t  *buffer)
 {
+  hb_codepoint_t glyph;
+
   if (unlikely (!buffer->len))
     return;
 
-  hb_codepoint_t glyph;
   buffer->clear_output ();
+
   unsigned int count = buffer->len - 1;
   for (buffer->i = 0; buffer->i < count;) {
     if (unlikely (is_variation_selector (buffer->info[buffer->i + 1].codepoint))) {
@@ -363,7 +365,7 @@ hb_ot_shape_execute_internal (hb_ot_shape_context_t *c)
 
   hb_ensure_native_direction (c->buffer);
 
-  if (_hb_normalize (c))
+  if (_hb_ot_shape_normalize (c))
     /* Buffer contents changed, reset unicode_props */
     hb_set_unicode_props (c->buffer); /* BUFFER: Set general_category and combining_class in var1 */
 
commit 02cdf743c2ec345a44d4fcf865594b6ac13fccd0
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Thu Jul 21 12:23:12 2011 -0400

    Add prefer_decomposed() complex-shaper callback
    
    This allows the Indic shaper to request decomposed characters.  This will
    handle split matra for free.  Other shapers prefer precomposed
    characters.

diff --git a/src/hb-ot-shape-complex-arabic.cc b/src/hb-ot-shape-complex-arabic.cc
index 53e7a9b..dc63db2 100644
--- a/src/hb-ot-shape-complex-arabic.cc
+++ b/src/hb-ot-shape-complex-arabic.cc
@@ -183,6 +183,12 @@ _hb_ot_shape_complex_collect_features_arabic (hb_ot_map_builder_t *map, const hb
   map->add_bool_feature (HB_TAG('c','s','w','h'));
 }
 
+bool
+_hb_ot_shape_complex_prefer_decomposed_arabic (void)
+{
+  return FALSE;
+}
+
 void
 _hb_ot_shape_complex_setup_masks_arabic (hb_ot_map_t *map, hb_buffer_t *buffer)
 {
diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc
index 03ea10f..cf5a049 100644
--- a/src/hb-ot-shape-complex-indic.cc
+++ b/src/hb-ot-shape-complex-indic.cc
@@ -309,6 +309,13 @@ _hb_ot_shape_complex_collect_features_indic (hb_ot_map_builder_t *map, const hb_
 }
 
 
+bool
+_hb_ot_shape_complex_prefer_decomposed_indic (void)
+{
+  /* We want split matras decomposed by the common shaping logic. */
+  return TRUE;
+}
+
 static void
 found_syllable (hb_ot_map_t *map, hb_buffer_t *buffer,
 		unsigned int start, unsigned int end)
diff --git a/src/hb-ot-shape-complex-misc.cc b/src/hb-ot-shape-complex-misc.cc
index 92dee49..b2de3ff 100644
--- a/src/hb-ot-shape-complex-misc.cc
+++ b/src/hb-ot-shape-complex-misc.cc
@@ -42,6 +42,12 @@ _hb_ot_shape_complex_collect_features_default (hb_ot_map_builder_t *map, const h
 {
 }
 
+bool
+_hb_ot_shape_complex_prefer_decomposed_default (void)
+{
+  return FALSE;
+}
+
 void
 _hb_ot_shape_complex_setup_masks_default (hb_ot_map_t *map, hb_buffer_t *buffer)
 {
diff --git a/src/hb-ot-shape-complex-private.hh b/src/hb-ot-shape-complex-private.hh
index c10fdf9..4bfd855 100644
--- a/src/hb-ot-shape-complex-private.hh
+++ b/src/hb-ot-shape-complex-private.hh
@@ -140,7 +140,34 @@ hb_ot_shape_complex_collect_features (hb_ot_complex_shaper_t shaper,
   switch (shaper) {
     default:
 #define HB_COMPLEX_SHAPER_IMPLEMENT(name) \
-    case hb_ot_complex_shaper_##name:	_hb_ot_shape_complex_collect_features_##name	(map, props);	return;
+    case hb_ot_complex_shaper_##name:	_hb_ot_shape_complex_collect_features_##name (map, props); return;
+    HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS
+#undef HB_COMPLEX_SHAPER_IMPLEMENT
+  }
+}
+
+
+/*
+ * prefer_decomposed()
+ *
+ * Called during shape_execute().
+ *
+ * A shaper should return TRUE if it prefers decomposed (NFD) input rather than precomposed (NFC).
+ */
+
+typedef bool hb_ot_shape_complex_prefer_decomposed_func_t (void);
+#define HB_COMPLEX_SHAPER_IMPLEMENT(name) \
+  HB_INTERNAL hb_ot_shape_complex_prefer_decomposed_func_t _hb_ot_shape_complex_prefer_decomposed_##name;
+  HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS
+#undef HB_COMPLEX_SHAPER_IMPLEMENT
+
+static inline bool
+hb_ot_shape_complex_prefer_decomposed (hb_ot_complex_shaper_t shaper)
+{
+  switch (shaper) {
+    default:
+#define HB_COMPLEX_SHAPER_IMPLEMENT(name) \
+    case hb_ot_complex_shaper_##name:	return _hb_ot_shape_complex_prefer_decomposed_##name ();
     HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS
 #undef HB_COMPLEX_SHAPER_IMPLEMENT
   }
@@ -168,7 +195,7 @@ hb_ot_shape_complex_setup_masks (hb_ot_complex_shaper_t shaper,
   switch (shaper) {
     default:
 #define HB_COMPLEX_SHAPER_IMPLEMENT(name) \
-    case hb_ot_complex_shaper_##name:	_hb_ot_shape_complex_setup_masks_##name	(map, buffer);	return;
+    case hb_ot_complex_shaper_##name:	_hb_ot_shape_complex_setup_masks_##name (map, buffer); return;
     HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS
 #undef HB_COMPLEX_SHAPER_IMPLEMENT
   }
commit d6b9c6d20041b4f4fa11befc179aee757c41904d
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Thu Jul 21 12:16:45 2011 -0400

    More kicking

diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc
index 0a245b0..a791e7c 100644
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@@ -38,23 +38,34 @@ get_glyph (hb_ot_shape_context_t *c, unsigned int i)
 }
 
 static bool
+decompose_single_char_cluster (hb_ot_shape_context_t *c,
+			       unsigned int i)
+{
+  return FALSE;
+}
+
+static bool
 handle_single_char_cluster (hb_ot_shape_context_t *c,
 			    unsigned int i)
 {
+  /* If the single char is supported by the font, we're good. */
   if (get_glyph (c, i))
     return FALSE;
 
   /* Decompose */
-
-  return FALSE;
+  return decompose_single_char_cluster (c, i);
 }
 
 static bool
 handle_multi_char_cluster (hb_ot_shape_context_t *c,
-			   unsigned int i,
+			   unsigned int start,
 			   unsigned int end)
 {
   /* If there's a variation-selector, give-up, it's just too hard. */
+  for (unsigned int i = start; i < end; i++)
+    if (unlikely (is_variation_selector (c->buffer->info[i].codepoint)))
+      return FALSE;
+
   return FALSE;
 }
 
commit 192445aef2e50087049243ce54ce7059ec441ffa
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Thu Jul 21 12:13:04 2011 -0400

    Remove intermittent_glyph()
    
    Let's not worry about performance for now...

diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc
index f6e962c..0a245b0 100644
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@@ -32,8 +32,9 @@ static bool
 get_glyph (hb_ot_shape_context_t *c, unsigned int i)
 {
   hb_buffer_t *b = c->buffer;
+  hb_codepoint_t glyph;
 
-  return hb_font_get_glyph (c->font, b->info[i].codepoint, 0, &b->info[i].intermittent_glyph());
+  return hb_font_get_glyph (c->font, b->info[i].codepoint, 0, &glyph);
 }
 
 static bool
diff --git a/src/hb-ot-shape-private.hh b/src/hb-ot-shape-private.hh
index 03dd4ed..96c436d 100644
--- a/src/hb-ot-shape-private.hh
+++ b/src/hb-ot-shape-private.hh
@@ -102,8 +102,6 @@ is_variation_selector (hb_codepoint_t unicode)
 
 HB_INTERNAL bool _hb_normalize (hb_ot_shape_context_t *c);
 
-#define intermittent_glyph() var2.u32
-
 HB_END_DECLS
 
 #endif /* HB_OT_SHAPE_PRIVATE_HH */



More information about the HarfBuzz mailing list