[HarfBuzz] harfbuzz-ng: Branch 'master' - 5 commits
Behdad Esfahbod
behdad at kemper.freedesktop.org
Thu Jul 21 17:59:12 PDT 2011
src/hb-glib.cc | 10 ++--
src/hb-icu.cc | 20 +++++---
src/hb-ot-shape-complex-arabic.cc | 6 ++
src/hb-ot-shape-complex-indic.cc | 7 ++
src/hb-ot-shape-complex-misc.cc | 6 ++
src/hb-ot-shape-complex-private.hh | 31 +++++++++++-
src/hb-ot-shape-normalize.cc | 92 ++++++++++++++++++++++++++++++-------
src/hb-ot-shape-private.hh | 4 -
src/hb-ot-shape.cc | 6 +-
test/test-unicode.c | 11 +++-
10 files changed, 154 insertions(+), 39 deletions(-)
New commits:
commit 63c0ef4a0763e579c9c80887bbfbd2651de05067
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Thu Jul 21 20:58:42 2011 -0400
Fix decompose() implementations to work with non-starter non-composables
Add tests.
diff --git a/src/hb-glib.cc b/src/hb-glib.cc
index fbf8cf5..76e1dfd 100644
--- a/src/hb-glib.cc
+++ b/src/hb-glib.cc
@@ -296,16 +296,16 @@ hb_glib_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
*b = 0;
ret = *a != ab;
} else if (len == 2) {
+ *a = g_utf8_get_char (normalized);
+ *b = g_utf8_get_char (g_utf8_next_char (normalized));
/* Here's the ugly part: if ab decomposes to a single character and
* that character decomposes again, we have to detect that and undo
* the second part :-(. */
gchar *recomposed = g_utf8_normalize (normalized, -1, G_NORMALIZE_NFC);
- if (g_utf8_get_char (recomposed) != ab) {
- *a = g_utf8_get_char (recomposed);
+ hb_codepoint_t c = g_utf8_get_char (recomposed);
+ if (c != ab && c != *a) {
+ *a = c;
*b = 0;
- } else {
- *a = g_utf8_get_char (normalized);
- *b = g_utf8_get_char (g_utf8_next_char (normalized));
}
g_free (recomposed);
ret = TRUE;
diff --git a/src/hb-icu.cc b/src/hb-icu.cc
index 7b85cd5..7fe78d2 100644
--- a/src/hb-icu.cc
+++ b/src/hb-icu.cc
@@ -214,6 +214,10 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
hb_bool_t ret, err;
UErrorCode icu_err;
+ /* This function is a monster! Maybe it wasn't a good idea adding a
+ * pairwise decompose API... */
+ /* Watch out for the dragons. Err, watch out for macros changing len. */
+
len = 0;
err = FALSE;
U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
@@ -232,21 +236,23 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
*b = 0;
ret = *a != ab;
} else if (len == 2) {
+ len =0;
+ U16_NEXT_UNSAFE (normalized, len, *a);
+ U16_NEXT_UNSAFE (normalized, len, *b);
+
/* Here's the ugly part: if ab decomposes to a single character and
* that character decomposes again, we have to detect that and undo
* the second part :-(. */
UChar recomposed[20];
icu_err = U_ZERO_ERROR;
- len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
+ unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
if (icu_err)
return FALSE;
- U16_GET_UNSAFE (recomposed, 0, *a);
- if (*a != ab) {
+ hb_codepoint_t c;
+ U16_GET_UNSAFE (recomposed, 0, c);
+ if (c != *a && c != ab) {
+ *a = c;
*b = 0;
- } else {
- len =0;
- U16_NEXT_UNSAFE (normalized, len, *a);
- U16_GET_UNSAFE (normalized, len, *b);
}
ret = TRUE;
} else {
diff --git a/test/test-unicode.c b/test/test-unicode.c
index c614c7d..9f526d7 100644
--- a/test/test-unicode.c
+++ b/test/test-unicode.c
@@ -800,6 +800,10 @@ test_unicode_normalization (gconstpointer user_data)
g_assert (!hb_unicode_compose (uf, 0x2126, 0, &ab) && ab == 0);
g_assert (!hb_unicode_compose (uf, 0x03A9, 0, &ab) && ab == 0);
+ /* Non-starter pairs should not compose */
+ g_assert (!hb_unicode_compose (uf, 0x0308, 0x0301, &ab) && ab == 0); /* !0x0344 */
+ g_assert (!hb_unicode_compose (uf, 0x0F71, 0x0F72, &ab) && ab == 0); /* !0x0F73 */
+
/* Pairs */
g_assert (hb_unicode_compose (uf, 0x0041, 0x030A, &ab) && ab == 0x00C5);
g_assert (hb_unicode_compose (uf, 0x006F, 0x0302, &ab) && ab == 0x00F4);
@@ -822,12 +826,13 @@ test_unicode_normalization (gconstpointer user_data)
g_assert (!hb_unicode_decompose (uf, 0xFB01, &a, &b) && a == 0xFB01 && b == 0);
/* Singletons */
- g_assert (hb_unicode_decompose (uf, 0x212B, &a, &b));
- g_assert_cmphex (a, ==, 0x00C5);
- g_assert_cmphex (b, ==, 0);
g_assert (hb_unicode_decompose (uf, 0x212B, &a, &b) && a == 0x00C5 && b == 0);
g_assert (hb_unicode_decompose (uf, 0x2126, &a, &b) && a == 0x03A9 && b == 0);
+ /* Non-starter pairs decompose, but not compose */
+ g_assert (hb_unicode_decompose (uf, 0x0344, &a, &b) && a == 0x0308 && b == 0x0301);
+ g_assert (hb_unicode_decompose (uf, 0x0F73, &a, &b) && a == 0x0F71 && b == 0x0F72);
+
/* Pairs */
g_assert (hb_unicode_decompose (uf, 0x00C5, &a, &b) && a == 0x0041 && b == 0x030A);
g_assert (hb_unicode_decompose (uf, 0x00F4, &a, &b) && a == 0x006F && b == 0x0302);
commit 5d90a342e319068716429bf7af76c3896b61a0e5
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Thu Jul 21 15:25:01 2011 -0400
Document normalization design
diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc
index a791e7c..6832779 100644
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@@ -25,20 +25,55 @@
*/
#include "hb-ot-shape-private.hh"
+#include "hb-ot-shape-complex-private.hh"
HB_BEGIN_DECLS
+/*
+ * HIGHLEVEL DESIGN:
+ *
+ * This file exports one main function: _hb_ot_shape_normalize().
+ *
+ * This function closely reflects the Unicode Normalization Algorithm,
+ * yet it's different. The shaper can either prefer decomposed (NFD) or
+ * composed (NFC).
+ *
+ * In general what happens is that: each grapheme is decomposed in a chain
+ * of 1:2 decompositions, marks reordered, and then recomposed if desired.
+ * So far it's like Unicode Normalization. However, the decomposition and
+ * recomposition only happens if the font supports the resulting characters.
+ *
+ * The goals are:
+ *
+ * - Try to render all canonically equivalent strings similarly. To really
+ * achieve this we have to always do the full decomposition and then
+ * selectively recompose from there. It's kinda too expensive though, so
+ * we skip some cases. For example, if composed is desired, we simply
+ * don't touch 1-character clusters that are supported by the font, even
+ * though their NFC may be different.
+ *
+ * - When a font has a precomposed character for a sequence but the 'ccmp'
+ * feature in the font is not adequate, use the precomposed character
+ * which typically has better mark positioning.
+ *
+ * - When a font does not support a character but supports its decomposition,
+ * well, use the decomposition.
+ *
+ * - The Indic shaper requests decomposed output. This will handle splitting
+ * matra for the Indic shaper.
+ */
+
static bool
get_glyph (hb_ot_shape_context_t *c, unsigned int i)
{
- hb_buffer_t *b = c->buffer;
hb_codepoint_t glyph;
- return hb_font_get_glyph (c->font, b->info[i].codepoint, 0, &glyph);
+ return hb_font_get_glyph (c->font, c->buffer->info[i].codepoint, 0, &glyph);
}
static bool
decompose_single_char_cluster (hb_ot_shape_context_t *c,
+ bool recompose,
unsigned int i)
{
return FALSE;
@@ -46,22 +81,24 @@ decompose_single_char_cluster (hb_ot_shape_context_t *c,
static bool
handle_single_char_cluster (hb_ot_shape_context_t *c,
+ bool recompose,
unsigned int i)
{
- /* If the single char is supported by the font, we're good. */
- if (get_glyph (c, i))
+ /* If recomposing and the single char is supported by the font, we're good. */
+ if (recompose && get_glyph (c, i))
return FALSE;
/* Decompose */
- return decompose_single_char_cluster (c, i);
+ return decompose_single_char_cluster (c, recompose, i);
}
static bool
handle_multi_char_cluster (hb_ot_shape_context_t *c,
+ bool recompose,
unsigned int start,
unsigned int end)
{
- /* If there's a variation-selector, give-up, it's just too hard. */
+ /* TODO Currently if there's a variation-selector we give-up, it's just too hard. */
for (unsigned int i = start; i < end; i++)
if (unlikely (is_variation_selector (c->buffer->info[i].codepoint)))
return FALSE;
@@ -70,24 +107,33 @@ handle_multi_char_cluster (hb_ot_shape_context_t *c,
}
bool
-_hb_normalize (hb_ot_shape_context_t *c)
+_hb_ot_shape_normalize (hb_ot_shape_context_t *c)
{
- hb_buffer_t *b = c->buffer;
+ hb_buffer_t *buffer = c->buffer;
bool changed = FALSE;
+ bool recompose = !hb_ot_shape_complex_prefer_decomposed (c->plan->shaper);
+
+ buffer->clear_output ();
+
+ unsigned int count = buffer->len;
+ for (buffer->i = 0; buffer->i < count;)
+ {
- unsigned int count = b->len;
- for (unsigned int i = 0; i < count;) {
unsigned int end;
- for (end = i + 1; end < count; end++)
- if (b->info[i].cluster != b->info[end].cluster)
+ for (end = buffer->i + 1; end < count; end++)
+ if (buffer->info[buffer->i].cluster != buffer->info[end].cluster)
break;
- if (i + 1 == end)
- changed |= handle_single_char_cluster (c, i);
+
+ if (buffer->i + 1 == end)
+ changed |= handle_single_char_cluster (c, recompose, buffer->i);
else
- changed |= handle_multi_char_cluster (c, i, end);
- i = end;
+ changed |= handle_multi_char_cluster (c, recompose, buffer->i, end);
+ while (buffer->i < end)
+ c->buffer->next_glyph ();
}
+ buffer->swap ();
+
return changed;
}
diff --git a/src/hb-ot-shape-private.hh b/src/hb-ot-shape-private.hh
index 96c436d..17b3c99 100644
--- a/src/hb-ot-shape-private.hh
+++ b/src/hb-ot-shape-private.hh
@@ -100,7 +100,7 @@ is_variation_selector (hb_codepoint_t unicode)
}
-HB_INTERNAL bool _hb_normalize (hb_ot_shape_context_t *c);
+HB_INTERNAL bool _hb_ot_shape_normalize (hb_ot_shape_context_t *c);
HB_END_DECLS
diff --git a/src/hb-ot-shape.cc b/src/hb-ot-shape.cc
index bffd075..d1c495f 100644
--- a/src/hb-ot-shape.cc
+++ b/src/hb-ot-shape.cc
@@ -254,11 +254,13 @@ static void
hb_map_glyphs (hb_font_t *font,
hb_buffer_t *buffer)
{
+ hb_codepoint_t glyph;
+
if (unlikely (!buffer->len))
return;
- hb_codepoint_t glyph;
buffer->clear_output ();
+
unsigned int count = buffer->len - 1;
for (buffer->i = 0; buffer->i < count;) {
if (unlikely (is_variation_selector (buffer->info[buffer->i + 1].codepoint))) {
@@ -363,7 +365,7 @@ hb_ot_shape_execute_internal (hb_ot_shape_context_t *c)
hb_ensure_native_direction (c->buffer);
- if (_hb_normalize (c))
+ if (_hb_ot_shape_normalize (c))
/* Buffer contents changed, reset unicode_props */
hb_set_unicode_props (c->buffer); /* BUFFER: Set general_category and combining_class in var1 */
commit 02cdf743c2ec345a44d4fcf865594b6ac13fccd0
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Thu Jul 21 12:23:12 2011 -0400
Add prefer_decomposed() complex-shaper callback
This allows the Indic shaper to request decomposed characters. This will
handle split matra for free. Other shapers prefer precomposed
characters.
diff --git a/src/hb-ot-shape-complex-arabic.cc b/src/hb-ot-shape-complex-arabic.cc
index 53e7a9b..dc63db2 100644
--- a/src/hb-ot-shape-complex-arabic.cc
+++ b/src/hb-ot-shape-complex-arabic.cc
@@ -183,6 +183,12 @@ _hb_ot_shape_complex_collect_features_arabic (hb_ot_map_builder_t *map, const hb
map->add_bool_feature (HB_TAG('c','s','w','h'));
}
+bool
+_hb_ot_shape_complex_prefer_decomposed_arabic (void)
+{
+ return FALSE;
+}
+
void
_hb_ot_shape_complex_setup_masks_arabic (hb_ot_map_t *map, hb_buffer_t *buffer)
{
diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc
index 03ea10f..cf5a049 100644
--- a/src/hb-ot-shape-complex-indic.cc
+++ b/src/hb-ot-shape-complex-indic.cc
@@ -309,6 +309,13 @@ _hb_ot_shape_complex_collect_features_indic (hb_ot_map_builder_t *map, const hb_
}
+bool
+_hb_ot_shape_complex_prefer_decomposed_indic (void)
+{
+ /* We want split matras decomposed by the common shaping logic. */
+ return TRUE;
+}
+
static void
found_syllable (hb_ot_map_t *map, hb_buffer_t *buffer,
unsigned int start, unsigned int end)
diff --git a/src/hb-ot-shape-complex-misc.cc b/src/hb-ot-shape-complex-misc.cc
index 92dee49..b2de3ff 100644
--- a/src/hb-ot-shape-complex-misc.cc
+++ b/src/hb-ot-shape-complex-misc.cc
@@ -42,6 +42,12 @@ _hb_ot_shape_complex_collect_features_default (hb_ot_map_builder_t *map, const h
{
}
+bool
+_hb_ot_shape_complex_prefer_decomposed_default (void)
+{
+ return FALSE;
+}
+
void
_hb_ot_shape_complex_setup_masks_default (hb_ot_map_t *map, hb_buffer_t *buffer)
{
diff --git a/src/hb-ot-shape-complex-private.hh b/src/hb-ot-shape-complex-private.hh
index c10fdf9..4bfd855 100644
--- a/src/hb-ot-shape-complex-private.hh
+++ b/src/hb-ot-shape-complex-private.hh
@@ -140,7 +140,34 @@ hb_ot_shape_complex_collect_features (hb_ot_complex_shaper_t shaper,
switch (shaper) {
default:
#define HB_COMPLEX_SHAPER_IMPLEMENT(name) \
- case hb_ot_complex_shaper_##name: _hb_ot_shape_complex_collect_features_##name (map, props); return;
+ case hb_ot_complex_shaper_##name: _hb_ot_shape_complex_collect_features_##name (map, props); return;
+ HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS
+#undef HB_COMPLEX_SHAPER_IMPLEMENT
+ }
+}
+
+
+/*
+ * prefer_decomposed()
+ *
+ * Called during shape_execute().
+ *
+ * A shaper should return TRUE if it prefers decomposed (NFD) input rather than precomposed (NFC).
+ */
+
+typedef bool hb_ot_shape_complex_prefer_decomposed_func_t (void);
+#define HB_COMPLEX_SHAPER_IMPLEMENT(name) \
+ HB_INTERNAL hb_ot_shape_complex_prefer_decomposed_func_t _hb_ot_shape_complex_prefer_decomposed_##name;
+ HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS
+#undef HB_COMPLEX_SHAPER_IMPLEMENT
+
+static inline bool
+hb_ot_shape_complex_prefer_decomposed (hb_ot_complex_shaper_t shaper)
+{
+ switch (shaper) {
+ default:
+#define HB_COMPLEX_SHAPER_IMPLEMENT(name) \
+ case hb_ot_complex_shaper_##name: return _hb_ot_shape_complex_prefer_decomposed_##name ();
HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS
#undef HB_COMPLEX_SHAPER_IMPLEMENT
}
@@ -168,7 +195,7 @@ hb_ot_shape_complex_setup_masks (hb_ot_complex_shaper_t shaper,
switch (shaper) {
default:
#define HB_COMPLEX_SHAPER_IMPLEMENT(name) \
- case hb_ot_complex_shaper_##name: _hb_ot_shape_complex_setup_masks_##name (map, buffer); return;
+ case hb_ot_complex_shaper_##name: _hb_ot_shape_complex_setup_masks_##name (map, buffer); return;
HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS
#undef HB_COMPLEX_SHAPER_IMPLEMENT
}
commit d6b9c6d20041b4f4fa11befc179aee757c41904d
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Thu Jul 21 12:16:45 2011 -0400
More kicking
diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc
index 0a245b0..a791e7c 100644
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@@ -38,23 +38,34 @@ get_glyph (hb_ot_shape_context_t *c, unsigned int i)
}
static bool
+decompose_single_char_cluster (hb_ot_shape_context_t *c,
+ unsigned int i)
+{
+ return FALSE;
+}
+
+static bool
handle_single_char_cluster (hb_ot_shape_context_t *c,
unsigned int i)
{
+ /* If the single char is supported by the font, we're good. */
if (get_glyph (c, i))
return FALSE;
/* Decompose */
-
- return FALSE;
+ return decompose_single_char_cluster (c, i);
}
static bool
handle_multi_char_cluster (hb_ot_shape_context_t *c,
- unsigned int i,
+ unsigned int start,
unsigned int end)
{
/* If there's a variation-selector, give-up, it's just too hard. */
+ for (unsigned int i = start; i < end; i++)
+ if (unlikely (is_variation_selector (c->buffer->info[i].codepoint)))
+ return FALSE;
+
return FALSE;
}
commit 192445aef2e50087049243ce54ce7059ec441ffa
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Thu Jul 21 12:13:04 2011 -0400
Remove intermittent_glyph()
Let's not worry about performance for now...
diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc
index f6e962c..0a245b0 100644
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@@ -32,8 +32,9 @@ static bool
get_glyph (hb_ot_shape_context_t *c, unsigned int i)
{
hb_buffer_t *b = c->buffer;
+ hb_codepoint_t glyph;
- return hb_font_get_glyph (c->font, b->info[i].codepoint, 0, &b->info[i].intermittent_glyph());
+ return hb_font_get_glyph (c->font, b->info[i].codepoint, 0, &glyph);
}
static bool
diff --git a/src/hb-ot-shape-private.hh b/src/hb-ot-shape-private.hh
index 03dd4ed..96c436d 100644
--- a/src/hb-ot-shape-private.hh
+++ b/src/hb-ot-shape-private.hh
@@ -102,8 +102,6 @@ is_variation_selector (hb_codepoint_t unicode)
HB_INTERNAL bool _hb_normalize (hb_ot_shape_context_t *c);
-#define intermittent_glyph() var2.u32
-
HB_END_DECLS
#endif /* HB_OT_SHAPE_PRIVATE_HH */
More information about the HarfBuzz
mailing list