[HarfBuzz] harfbuzz-ng: Branch 'master' - 2 commits

Behdad Esfahbod behdad at kemper.freedesktop.org
Mon Jul 23 10:53:28 PDT 2012


 src/hb-ot-shape-complex-misc.cc                                     |   49 ++++++----
 src/hb-private.hh                                                   |    6 +
 src/hb-unicode.cc                                                   |    6 +
 test/shaping/texts/in-tree/shaper-thai/MANIFEST                     |    1 
 test/shaping/texts/in-tree/shaper-thai/script-lao/MANIFEST          |    1 
 test/shaping/texts/in-tree/shaper-thai/script-lao/misc/MANIFEST     |    1 
 test/shaping/texts/in-tree/shaper-thai/script-lao/misc/sara-am.txt  |   20 ++++
 test/shaping/texts/in-tree/shaper-thai/script-thai/misc/MANIFEST    |    1 
 test/shaping/texts/in-tree/shaper-thai/script-thai/misc/phinthu.txt |   16 +++
 test/shaping/texts/in-tree/shaper-thai/script-thai/misc/sara-am.txt |   18 +++
 10 files changed, 102 insertions(+), 17 deletions(-)

New commits:
commit 42848453bf260b456b46a07f066e31b8c3aac2f1
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Mon Jul 23 13:52:07 2012 -0400

    [Thai] Reorder U+0E3A THAI VOWEL SIGN PHINTHU
    
    Uniscribe reorders U+0E3A to be after U+0E38 and U+0E39.  We do that by
    modifying the ccc for U+0E3A.
    
    Fixes the two remaining Thai failures (see previous commit).

diff --git a/src/hb-ot-shape-complex-misc.cc b/src/hb-ot-shape-complex-misc.cc
index 17e2625..6852d47 100644
--- a/src/hb-ot-shape-complex-misc.cc
+++ b/src/hb-ot-shape-complex-misc.cc
@@ -132,6 +132,13 @@ _hb_ot_shape_complex_setup_masks_thai (hb_ot_map_t *map HB_UNUSED,
    * chattawa.
    *
    * Same for Lao.
+   *
+   * Note:
+   *
+   * Uniscribe also does some below-marks reordering.  Namely, it positions U+0E3A
+   * after U+0E38 and U+0E39.  We do that by modifying the ccc for U+0E3A.
+   * See _hb_unicode_modified_combining_class ().  Lao does NOT have a U+0E3A
+   * equivalent.
    */
 
 
diff --git a/src/hb-unicode.cc b/src/hb-unicode.cc
index 140f382..3569b20 100644
--- a/src/hb-unicode.cc
+++ b/src/hb-unicode.cc
@@ -363,6 +363,12 @@ _hb_unicode_modified_combining_class (hb_unicode_funcs_t *ufuncs,
     };
     c = permuted_hebrew_classes[c - 10];
   }
+  else if (unlikely (unicode == 0x0E3A)) /* THAI VOWEL SIGN PHINTHU */
+  {
+    /* Assign 104, so it reorders after the THAI ccc=103 marks.
+     * Uniscribe does this. */
+    c = 104;
+  }
 
   return c;
 }
diff --git a/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/MANIFEST b/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/MANIFEST
index ffd16f1..6aa865b 100644
--- a/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/MANIFEST
+++ b/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/MANIFEST
@@ -1 +1,2 @@
+phinthu.txt
 sara-am.txt
diff --git a/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/phinthu.txt b/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/phinthu.txt
new file mode 100644
index 0000000..e304777
--- /dev/null
+++ b/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/phinthu.txt
@@ -0,0 +1,16 @@
+ป
+ปฺ
+ปุ
+ปู
+ปุู
+ปูุ
+ปฺุ
+ปฺุ
+ปฺู
+ปฺู
+ปฺุู
+ปฺุู
+ปฺุู
+ปฺูุ
+ปฺูุ
+ปฺูุ
commit 4a7f4f3e56f8f7640ae7337aa1b3324f31e0d4ab
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Mon Jul 23 13:15:33 2012 -0400

    [Thai] Adjust SARA AM reordering to match Uniscribe
    
    Adjust the list of marks before SARA AM that get the reordering
    treatment.  Also adjust cluster formation to match Uniscribe.
    
    With Wikipedia test data, now I see:
    
      - For Thai, with the Angsana New font from Win7, I see 54 failures out
        of over 4M tests  (0.00129107%).  Of the 54, two are legitimate
        reordering issues (fix coming soon), and the other 52 are simply
        Uniscribe using a zero-width space char instead of an unknown
        character for missing glyphs.  No idea why.  The missing-glyph
        sequences include one that is a Thai character followed by an Arabic
        Sokun.  Someone confused it with Nikhahit I assume!
    
      - For Lao, with the Dokchampa font from Win7, 33 tests fail out of
        54k (0.0615167%).  All seem to be insignificant mark positioning
        with two marks on a base.  Have to investigate.

diff --git a/src/hb-ot-shape-complex-misc.cc b/src/hb-ot-shape-complex-misc.cc
index 7a11876..17e2625 100644
--- a/src/hb-ot-shape-complex-misc.cc
+++ b/src/hb-ot-shape-complex-misc.cc
@@ -121,19 +121,20 @@ _hb_ot_shape_complex_setup_masks_thai (hb_ot_map_t *map HB_UNUSED,
   /* The following is NOT specified in the MS OT Thai spec, however, it seems
    * to be what Uniscribe and other engines implement.  According to Eric Muller:
    *
-   * When you have a sara am, decompose it in nikhahit + sara a, *and* mode the
-   * nihka hit backwards over any *tone* mark (0E48-0E4B).
+   * When you have a SARA AM, decompose it in NIKHAHIT + SARA AA, *and* move the
+   * NIKHAHIT backwards over any tone mark (0E48-0E4B).
    *
    * <0E14, 0E4B, 0E33> -> <0E14, 0E4D, 0E4B, 0E32>
    *
-   * This reordering is legit only when the nikhahit comes from a sara am, not
+   * This reordering is legit only when the NIKHAHIT comes from a SARA AM, not
    * when it's there to start with. The string <0E14, 0E4B, 0E4D> is probably
-   * not what a u↪ser wanted, but the rendering is nevertheless nikhahit above
+   * not what a user wanted, but the rendering is nevertheless nikhahit above
    * chattawa.
    *
    * Same for Lao.
    */
 
+
   /*
    * Here are the characters of significance:
    *
@@ -142,9 +143,9 @@ _hb_ot_shape_complex_setup_masks_thai (hb_ot_map_t *map HB_UNUSED,
    * SARA AA:		U+0E32	U+0EB2
    * Nikhahit:		U+0E4D	U+0ECD
    *
-   * Tone marks:
-   * Thai:	<0E48..0E4B> CCC=107
-   * Lao:	<0EC8..0ECB> CCC=122
+   * Testing shows that Uniscribe reorders the following marks:
+   * Thai:	<0E31..0E37,0E47..0E4E>
+   * Lao:	<0EB1..0EB7,0EC7..0ECE>
    *
    * Note how the Lao versions are the same as Thai + 0x80.
    */
@@ -154,7 +155,7 @@ _hb_ot_shape_complex_setup_masks_thai (hb_ot_map_t *map HB_UNUSED,
 #define IS_SARA_AM(x) (((x) & ~0x0080) == 0x0E33)
 #define NIKHAHIT_FROM_SARA_AM(x) ((x) - 0xE33 + 0xE4D)
 #define SARA_AA_FROM_SARA_AM(x) ((x) - 1)
-#define IS_TONE_MARK(x) (((x) & ~0x0083) == 0x0E48)
+#define IS_TONE_MARK(x) (hb_in_ranges<hb_codepoint_t> ((x) & ~0x0080, 0x0E31, 0x0E37, 0x0E47, 0x0E4E))
 
   buffer->clear_output ();
   unsigned int count = buffer->len;
@@ -179,14 +180,23 @@ _hb_ot_shape_complex_setup_masks_thai (hb_ot_map_t *map HB_UNUSED,
     while (start > 0 && IS_TONE_MARK (buffer->out_info[start - 1].codepoint))
       start--;
 
-    /* Move Nikhahit (end-2) to the beginning */
-    hb_glyph_info_t t = buffer->out_info[end - 2];
-    memmove (buffer->out_info + start + 1,
-	     buffer->out_info + start,
-	     sizeof (buffer->out_info[0]) * (end - start - 2));
-    buffer->out_info[start] = t;
-
-    buffer->merge_out_clusters (start, end);
+    if (start + 2 < end)
+    {
+      /* Move Nikhahit (end-2) to the beginning */
+      buffer->merge_out_clusters (start, end);
+      hb_glyph_info_t t = buffer->out_info[end - 2];
+      memmove (buffer->out_info + start + 1,
+	       buffer->out_info + start,
+	       sizeof (buffer->out_info[0]) * (end - start - 2));
+      buffer->out_info[start] = t;
+    }
+    else
+    {
+      /* Since we decomposed, and NIKHAHIT is combining, merge clusters with the
+       * previous cluster. */
+      if (start)
+	buffer->merge_out_clusters (start - 1, end);
+    }
   }
   buffer->swap_buffers ();
 }
diff --git a/src/hb-private.hh b/src/hb-private.hh
index 3f710ed..0b9c4ef 100644
--- a/src/hb-private.hh
+++ b/src/hb-private.hh
@@ -729,6 +729,12 @@ hb_in_range (T u, T lo, T hi)
     return lo <= u && u <= hi;
 }
 
+template <typename T> static inline bool
+hb_in_ranges (T u, T lo1, T hi1, T lo2, T hi2)
+{
+  return hb_in_range (u, lo1, hi1) || hb_in_range (u, lo2, hi2);
+}
+
 
 /* Useful for set-operations on small enums.
  * For example, for testing "x ∈ {x1, x2, x3}" use:
diff --git a/test/shaping/texts/in-tree/shaper-thai/MANIFEST b/test/shaping/texts/in-tree/shaper-thai/MANIFEST
index 22bc0ed..32b5476 100644
--- a/test/shaping/texts/in-tree/shaper-thai/MANIFEST
+++ b/test/shaping/texts/in-tree/shaper-thai/MANIFEST
@@ -1 +1,2 @@
+script-lao
 script-thai
diff --git a/test/shaping/texts/in-tree/shaper-thai/script-lao/MANIFEST b/test/shaping/texts/in-tree/shaper-thai/script-lao/MANIFEST
new file mode 100644
index 0000000..b8752e7
--- /dev/null
+++ b/test/shaping/texts/in-tree/shaper-thai/script-lao/MANIFEST
@@ -0,0 +1 @@
+misc
diff --git a/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/MANIFEST b/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/MANIFEST
new file mode 100644
index 0000000..ffd16f1
--- /dev/null
+++ b/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/MANIFEST
@@ -0,0 +1 @@
+sara-am.txt
diff --git a/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/sara-am.txt b/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/sara-am.txt
new file mode 100644
index 0000000..234d8c0
--- /dev/null
+++ b/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/sara-am.txt
@@ -0,0 +1,20 @@
+ດຳ
+ດ໋ຳ
+ດໍ໋າ
+ດ໋ໍາ
+ມັຳ
+ມິຳ
+ມີຳ
+ມຶຳ
+ມືຳ
+ມຸຳ
+ມູຳ
+ມ຺ຳ
+ມ໇ຳ
+ມ່ຳ
+ມ້ຳ
+ມ໊ຳ
+ມ໋ຳ
+ມ໌ຳ
+ມໍຳ
+ມ໎ຳ
diff --git a/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/sara-am.txt b/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/sara-am.txt
index 6d385ef..9f044ce 100644
--- a/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/sara-am.txt
+++ b/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/sara-am.txt
@@ -1,4 +1,20 @@
-ำ
 ดำ
 ด๋ำ
 ดํ๋า
+ด๋ํา
+มัำ
+มิำ
+มีำ
+มึำ
+มืำ
+มุำ
+มูำ
+มฺำ
+ม็ำ
+ม่ำ
+ม้ำ
+ม๊ำ
+ม๋ำ
+ม์ำ
+มํำ
+ม๎ำ



More information about the HarfBuzz mailing list