[HarfBuzz] hangul shaper patches
Jonathan Kew
jfkthame at googlemail.com
Sun Jan 19 17:30:26 PST 2014
Hi Behdad,
I'm attaching a series of patches for improvements to the Hangul shaper.
These provide support for Old Hangul sequences that do not have a
precomposed Unicode form, and handle the tone-mark reordering.
With these patches, we exactly match Uniscribe on the Wikipedia test
corpus using malgun.ttf, except for (a) cases where there's a character
that's not supported in the font, so Uniscribe gives .notdef but
HarfBuzz finds a compatibility fallback, and (b) a handful of words
where there's an <LV, T> sequence that Uniscribe doesn't support (it has
no corresponding LVT syllable), but which we handle by decomposing to
<L, V, T> and applying jamo features.
JK
-------------- next part --------------
commit 311490e6109c997887bc8012e41473b870b67ceb
Author: Jonathan Kew <jfkthame at gmail.com>
Date: Sun Jan 19 23:31:16 2014 +0000
[ot-hangul] Apply the appropriate *jmo features to decomposed syllables, including Old Hangul sequences that don't have Unicode compositions. Merge clusters in decomposed syllables.
diff --git a/src/hb-ot-shape-complex-hangul.cc b/src/hb-ot-shape-complex-hangul.cc
index 7c137c6..f37ed85 100644
--- a/src/hb-ot-shape-complex-hangul.cc
+++ b/src/hb-ot-shape-complex-hangul.cc
@@ -30,21 +30,62 @@
/* Hangul shaper */
-static const hb_tag_t hangul_features[] =
+/* Same order as the feature array below */
+enum {
+ NONE,
+
+ LJMO,
+ VJMO,
+ TJMO,
+
+ FIRST_HANGUL_FEATURE = LJMO,
+ HANGUL_FEATURE_COUNT = TJMO + 1
+};
+
+static const hb_tag_t hangul_features[HANGUL_FEATURE_COUNT] =
{
+ HB_TAG_NONE,
HB_TAG('l','j','m','o'),
HB_TAG('v','j','m','o'),
- HB_TAG('t','j','m','o'),
- HB_TAG_NONE
+ HB_TAG('t','j','m','o')
};
static void
collect_features_hangul (hb_ot_shape_planner_t *plan)
{
- for (const hb_tag_t *script_features = hangul_features; script_features && *script_features; script_features++)
- plan->map.add_global_bool_feature (*script_features);
+ hb_ot_map_builder_t *map = &plan->map;
+
+ for (unsigned int i = FIRST_HANGUL_FEATURE; i < HANGUL_FEATURE_COUNT; i++)
+ map->add_feature (hangul_features[i], 1, F_NONE);
+}
+
+struct hangul_shape_plan_t
+{
+ ASSERT_POD ();
+
+ hb_mask_t mask_array[HANGUL_FEATURE_COUNT];
+};
+
+static void *
+data_create_hangul (const hb_ot_shape_plan_t *plan)
+{
+ hangul_shape_plan_t *hangul_plan = (hangul_shape_plan_t *) calloc (1, sizeof (hangul_shape_plan_t));
+ if (unlikely (!hangul_plan))
+ return NULL;
+
+ for (unsigned int i = 0; i < HANGUL_FEATURE_COUNT; i++)
+ hangul_plan->mask_array[i] = plan->map.get_1_mask (hangul_features[i]);
+
+ return hangul_plan;
+}
+
+static void
+data_destroy_hangul (void *data)
+{
+ free (data);
}
+/* Constants for algorithmic hangul syllable [de]composition. */
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
@@ -60,13 +101,28 @@ collect_features_hangul (hb_ot_shape_planner_t *plan)
#define isCombiningT(u) (hb_in_range<hb_codepoint_t> ((u), TBase+1, TBase+TCount-1))
#define isCombinedS(u) (hb_in_range<hb_codepoint_t> ((u), SBase, SBase+SCount-1))
-#define isT(u) (hb_in_ranges<hb_codepoint_t> ((u), 0x11A8, 0x11FF, 0xD7CB, 0xD7FB))
+#define isL(u) (hb_in_ranges<hb_codepoint_t> ((u), 0x1100, 0x115F, 0xA960, 0xA97C))
+#define isV(u) (hb_in_ranges<hb_codepoint_t> ((u), 0x1160, 0x11A7, 0xD7B0, 0xD7C6))
+#define isT(u) (hb_in_ranges<hb_codepoint_t> ((u), 0x11A8, 0x11FF, 0xD7CB, 0xD7FB))
+
+/* buffer var allocations */
+#define hangul_shaping_feature() complex_var_u8_0() /* hangul jamo shaping feature */
+
+static bool
+is_zero_width_char (hb_font_t *font,
+ hb_codepoint_t unicode)
+{
+ hb_codepoint_t glyph;
+ return hb_font_get_glyph (font, unicode, 0, &glyph) && hb_font_get_glyph_h_advance (font, glyph) == 0;
+}
static void
preprocess_text_hangul (const hb_ot_shape_plan_t *plan,
hb_buffer_t *buffer,
hb_font_t *font)
{
+ HB_BUFFER_ALLOCATE_VAR (buffer, hangul_shaping_feature);
+
/* Hangul syllables come in two shapes: LV, and LVT. Of those:
*
* - LV can be precomposed, or decomposed. Lets call those
@@ -90,7 +146,7 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan,
* Here is what we want to accomplish in this shaper:
*
* - If the whole syllable can be precomposed, do that,
- * - Otherwise, fully decompose.
+ * - Otherwise, fully decompose and apply ljmo/vjmo/tjmo features.
*
* That is, of the different possible syllables:
*
@@ -113,52 +169,77 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan,
*/
buffer->clear_output ();
+ unsigned int start = 0, end = 0; /* Extent of most recently seen syllable;
+ * valid only if start < end
+ */
unsigned int count = buffer->len;
+
for (buffer->idx = 0; buffer->idx < count;)
{
hb_codepoint_t u = buffer->cur().codepoint;
- if (isCombiningL(u) && buffer->idx + 1 < count)
+ start = buffer->out_len; /* Remember current position as a potential syllable start;
+ * will only be used if we set end to a later position.
+ */
+
+ if (isL (u) && buffer->idx + 1 < count)
{
hb_codepoint_t l = u;
hb_codepoint_t v = buffer->cur(+1).codepoint;
- if (isCombiningV(v))
+ if (isV (v))
{
- /* Have <L,V> or <L,V,T>. */
- unsigned int len = 2;
+ /* Have <L,V> or <L,V,T>. */
+ hb_codepoint_t t = 0;
unsigned int tindex = 0;
if (buffer->idx + 2 < count)
{
- hb_codepoint_t t = buffer->cur(+2).codepoint;
- if (isCombiningT(t))
- {
- len = 3;
- tindex = t - TBase;
- }
- else if (isT (t))
- {
- /* Old T jamo. Doesn't combine. Don't combine *anything*. */
- len = 0;
- }
+ t = buffer->cur(+2).codepoint;
+ if (isT (t))
+ tindex = t - TBase; /* Only used if isCombiningT (t); otherwise invalid. */
+ else
+ t = 0; /* The next character was not a trailing jamo. */
}
- if (len)
+ /* We've got a syllable <L,V,T?>; see if it can potentially be composed. */
+ if (isCombiningL (l) && isCombiningV (v) && (t == 0 || isCombiningT (t)))
{
+ /* Try to compose; if this succeeds, end is set to start+1. */
hb_codepoint_t s = SBase + (l - LBase) * NCount + (v - VBase) * TCount + tindex;
if (font->has_glyph (s))
{
- buffer->replace_glyphs (len, 1, &s);
+ buffer->replace_glyphs (t ? 3 : 2, 1, &s);
if (unlikely (buffer->in_error))
return;
+ end = start + 1;
continue;
}
}
+
+ /* We didn't compose, either because it's an Old Hangul syllable without a
+ * precomposed character in Unicode, or because the font didn't support the
+ * necessary precomposed glyph.
+ * Set jamo features on the individual glyphs, and advance past them.
+ */
+ buffer->cur().hangul_shaping_feature() = LJMO;
+ buffer->next_glyph ();
+ buffer->cur().hangul_shaping_feature() = VJMO;
+ buffer->next_glyph ();
+ if (t)
+ {
+ buffer->cur().hangul_shaping_feature() = TJMO;
+ buffer->next_glyph ();
+ end = start + 3;
+ }
+ else
+ end = start + 2;
+ buffer->merge_out_clusters (start, end);
+ continue;
}
}
- else if (isCombinedS(u))
+ else if (isCombinedS (u))
{
- /* Have <LV>, <LVT>, or <LV,T> */
+ /* Have <LV>, <LVT>, or <LV,T> */
hb_codepoint_t s = u;
bool has_glyph = font->has_glyph (s);
unsigned int lindex = (s - SBase) / NCount;
@@ -173,11 +254,12 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan,
/* <LV,T>, try to combine. */
unsigned int new_tindex = buffer->cur(+1).codepoint - TBase;
hb_codepoint_t new_s = s + new_tindex;
- if (font->has_glyph (new_s))
+ if (font->has_glyph (new_s))
{
buffer->replace_glyphs (2, 1, &new_s);
if (unlikely (buffer->in_error))
return;
+ end = start + 1;
continue;
}
}
@@ -193,35 +275,86 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan,
hb_codepoint_t decomposed[3] = {LBase + lindex,
VBase + vindex,
TBase + tindex};
- if (font->has_glyph (decomposed[0]) &&
+ if (font->has_glyph (decomposed[0]) &&
font->has_glyph (decomposed[1]) &&
(!tindex || font->has_glyph (decomposed[2])))
{
- buffer->replace_glyphs (1, tindex ? 3 : 2, decomposed);
+ unsigned int s_len = tindex ? 3 : 2;
+ buffer->replace_glyphs (1, s_len, decomposed);
if (unlikely (buffer->in_error))
return;
+
+ /* We decomposed S: apply jamo features to the individual glyphs
+ * that are now in buffer->out_info.
+ */
+ hb_glyph_info_t *info = buffer->out_info;
+
+ /* If we decomposed an LV because of a non-combining T following,
+ * we want to include this T in the syllable.
+ */
+ if (has_glyph && !tindex)
+ {
+ buffer->next_glyph ();
+ s_len++;
+ }
+ end = start + s_len;
+
+ unsigned int i = start;
+ info[i++].hangul_shaping_feature() = LJMO;
+ info[i++].hangul_shaping_feature() = VJMO;
+ if (i < end)
+ info[i++].hangul_shaping_feature() = TJMO;
+ buffer->merge_out_clusters (start, end);
continue;
}
}
+
+ if (has_glyph)
+ {
+ /* We didn't decompose the S, so just advance past it. */
+ end = start + 1;
+ buffer->next_glyph ();
+ continue;
+ }
}
+ /* Didn't find a recognizable syllable. */
buffer->next_glyph ();
}
buffer->swap_buffers ();
}
+static void
+setup_masks_hangul (const hb_ot_shape_plan_t *plan,
+ hb_buffer_t *buffer,
+ hb_font_t *font HB_UNUSED)
+{
+ const hangul_shape_plan_t *hangul_plan = (const hangul_shape_plan_t *) plan->data;
+
+ if (likely (hangul_plan))
+ {
+ unsigned int count = buffer->len;
+ hb_glyph_info_t *info = buffer->info;
+ for (unsigned int i = 0; i < count; i++, info++)
+ info->mask |= hangul_plan->mask_array[info->hangul_shaping_feature()];
+ }
+
+ HB_BUFFER_DEALLOCATE_VAR (buffer, hangul_shaping_feature);
+}
+
+
const hb_ot_complex_shaper_t _hb_ot_complex_shaper_hangul =
{
"hangul",
collect_features_hangul,
NULL, /* override_features */
- NULL, /* data_create */
- NULL, /* data_destroy */
+ data_create_hangul, /* data_create */
+ data_destroy_hangul, /* data_destroy */
preprocess_text_hangul,
HB_OT_SHAPE_NORMALIZATION_MODE_NONE,
NULL, /* decompose */
NULL, /* compose */
- NULL, /* setup_masks */
+ setup_masks_hangul, /* setup_masks */
HB_OT_SHAPE_ZERO_WIDTH_MARKS_DEFAULT,
false, /* fallback_position */
};
-------------- next part --------------
commit 7b187fe00d0f93008c80f96e7e213892a8b0f46c
Author: Jonathan Kew <jfkthame at gmail.com>
Date: Sun Jan 19 23:34:05 2014 +0000
[ot-hangul] Reorder Hangul tone mark to beginning of syllable, unless font implements it using a zero-width glyph.
diff --git a/src/hb-ot-shape-complex-hangul.cc b/src/hb-ot-shape-complex-hangul.cc
index f37ed85..710df32 100644
--- a/src/hb-ot-shape-complex-hangul.cc
+++ b/src/hb-ot-shape-complex-hangul.cc
@@ -105,6 +105,8 @@ data_destroy_hangul (void *data)
#define isV(u) (hb_in_ranges<hb_codepoint_t> ((u), 0x1160, 0x11A7, 0xD7B0, 0xD7C6))
#define isT(u) (hb_in_ranges<hb_codepoint_t> ((u), 0x11A8, 0x11FF, 0xD7CB, 0xD7FB))
+#define isHangulTone(u) (hb_in_range<hb_codepoint_t> ((u), 0x302e, 0x302f))
+
/* buffer var allocations */
#define hangul_shaping_feature() complex_var_u8_0() /* hangul jamo shaping feature */
@@ -147,6 +149,9 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan,
*
* - If the whole syllable can be precomposed, do that,
* - Otherwise, fully decompose and apply ljmo/vjmo/tjmo features.
+ * - If a valid syllable is followed by a Hangul tone mark, reorder the tone
+ * mark to precede the whole syllable - unless it is a zero-width glyph, in
+ * which case we leave it untouched, assuming it's designed to overstrike.
*
* That is, of the different possible syllables:
*
@@ -178,6 +183,56 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan,
{
hb_codepoint_t u = buffer->cur().codepoint;
+ if (isHangulTone (u))
+ {
+ /*
+ * We could cache the width of the tone marks and the existence of dotted-circle,
+ * but the use of the Hangul tone mark characters seems to be rare enough that
+ * I didn't bother for now.
+ */
+ if (start < end && end == buffer->out_len)
+ {
+ /* Tone mark follows a valid syllable; move it in front, unless it's zero width. */
+ buffer->next_glyph ();
+ if (!is_zero_width_char (font, u))
+ {
+ hb_glyph_info_t *info = buffer->out_info;
+ hb_glyph_info_t tone = info[end];
+ memmove (&info[start + 1], &info[start], (end - start) * sizeof (hb_glyph_info_t));
+ info[start] = tone;
+ }
+ /* Merge clusters across the (possibly reordered) syllable+tone.
+ * We want to merge even in the zero-width tone mark case here,
+ * so that clustering behavior isn't dependent on how the tone mark
+ * is handled by the font.
+ */
+ buffer->merge_out_clusters (start, end + 1);
+ }
+ else
+ {
+ /* No valid syllable as base for tone mark; try to insert dotted circle. */
+ if (font->has_glyph (0x25cc))
+ {
+ hb_codepoint_t chars[2];
+ if (is_zero_width_char (font, u)) {
+ chars[0] = u;
+ chars[1] = 0x25cc;
+ } else {
+ chars[0] = 0x25cc;
+ chars[1] = u;
+ }
+ buffer->replace_glyphs (1, 2, chars);
+ }
+ else
+ {
+ /* No dotted circle available in the font; just leave tone mark untouched. */
+ buffer->next_glyph ();
+ }
+ }
+ start = end = buffer->out_len;
+ continue;
+ }
+
start = buffer->out_len; /* Remember current position as a potential syllable start;
* will only be used if we set end to a later position.
*/
@@ -318,7 +373,9 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan,
}
}
- /* Didn't find a recognizable syllable. */
+ /* Didn't find a recognizable syllable, so we leave end <= start;
+ * this will prevent tone-mark reordering happening.
+ */
buffer->next_glyph ();
}
buffer->swap_buffers ();
-------------- next part --------------
commit 831579b4b811ad3afe3e764cf064286beecb1b4f
Author: Jonathan Kew <jfkthame at gmail.com>
Date: Sun Jan 19 23:36:17 2014 +0000
[hangul] Don't hide any of the Jamo Filler characters, as some fonts want these to be visible/spacing glyphs.
diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh
index cd54cf7..ba193e8 100644
--- a/src/hb-unicode-private.hh
+++ b/src/hb-unicode-private.hh
@@ -134,10 +134,10 @@ HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS_SIMPLE
* 6.3 is also added manually. The new Unicode 6.3 bidi formatting
* characters are encoded in a block that was Default_Ignorable already.
*
- * Note: While U+115F and U+1160 are Default_Ignorable, we do NOT want to
- * hide them, as the way Uniscribe has implemented them is with regular
- * spacing glyphs, and that's the way fonts are made to work. As such,
- * we make exceptions for those two.
+ * Note: While U+115F, U+1160, U+3164 and U+FFA0 are Default_Ignorable,
+ * we do NOT want to hide them, as the way Uniscribe has implemented them
+ * is with regular spacing glyphs, and that's the way fonts are made to work.
+ * As such, we make exceptions for those four.
*
* Gathered from:
* http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:DI:]&abb=on&ucd=on&esc=on
@@ -159,10 +159,10 @@ HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS_SIMPLE
* 200B..200F ;RIGHT-TO-LEFT MARK
* 202A..202E ;RIGHT-TO-LEFT OVERRIDE
* 2060..206F ;NOMINAL DIGIT SHAPES
- * 3164 ;HANGUL FILLER
+ * #3164 ;HANGUL FILLER
* FE00..FE0F ;VARIATION SELECTOR-16
* FEFF ;ZERO WIDTH NO-BREAK SPACE
- * FFA0 ;HALFWIDTH HANGUL FILLER
+ * #FFA0 ;HALFWIDTH HANGUL FILLER
* FFF0..FFF8 ;<unassigned-FFF8>
* 1D173..1D17A ;MUSICAL SYMBOL END PHRASE
* E0000..E0FFF ;<unassigned-E0FFF>
@@ -184,9 +184,8 @@ HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS_SIMPLE
case 0x20: return hb_in_ranges<hb_codepoint_t> (ch, 0x200B, 0x200F,
0x202A, 0x202E,
0x2060, 0x206F);
- case 0x31: return unlikely (ch == 0x3164);
case 0xFE: return hb_in_range<hb_codepoint_t> (ch, 0xFE00, 0xFE0F) || ch == 0xFEFF;
- case 0xFF: return hb_in_range<hb_codepoint_t> (ch, 0xFFF0, 0xFFF8) || ch == 0xFFA0;
+ case 0xFF: return hb_in_range<hb_codepoint_t> (ch, 0xFFF0, 0xFFF8);
default: return false;
}
}
-------------- next part --------------
commit 7988d1b0f6339866cd8be560dbb962be129c3bfb
Author: Jonathan Kew <jfkthame at gmail.com>
Date: Sun Jan 19 23:37:31 2014 +0000
[hangul] Don't zero width of marks - this is not wanted for the Jamo Filler glyphs.
diff --git a/src/hb-ot-shape-complex-hangul.cc b/src/hb-ot-shape-complex-hangul.cc
index 710df32..6df6c3f 100644
--- a/src/hb-ot-shape-complex-hangul.cc
+++ b/src/hb-ot-shape-complex-hangul.cc
@@ -412,6 +412,6 @@ const hb_ot_complex_shaper_t _hb_ot_complex_shaper_hangul =
NULL, /* decompose */
NULL, /* compose */
setup_masks_hangul, /* setup_masks */
- HB_OT_SHAPE_ZERO_WIDTH_MARKS_DEFAULT,
+ HB_OT_SHAPE_ZERO_WIDTH_MARKS_NONE,
false, /* fallback_position */
};
More information about the HarfBuzz
mailing list