[HarfBuzz] harfbuzz: Branch 'master'

Fri Jan 5 12:48:46 UTC 2018

src/hb-buffer-private.hh                   |    1 
 src/hb-ot-layout-private.hh                |   11 +++++++-
 src/hb-ot-shape-complex-arabic.cc          |   37 +++++++++++++++++++++--------
 src/hb-ot-shape-normalize.cc               |   30 ++++++++++++-----------
 test/shaping/tests/arabic-mark-order.tests |    4 +++
 5 files changed, 58 insertions(+), 25 deletions(-)

New commits:
commit 8c0d1916a41f0fb32340ce5257de780acf598353
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Fri Jan 5 12:46:12 2018 +0000

    Improve CGJ skipping logic
    
    Previously we made CGJ unskippable.  Now, if CGJ did NOT prevent
    any reordering, allow skipping over it.  To make this work we
    had to make changes to the Arabic mark reordering algorithm
    implementation to renumber moved MCM marks.  See comments.
    
    Fixes https://github.com/harfbuzz/harfbuzz/issues/554

diff --git a/src/hb-buffer-private.hh b/src/hb-buffer-private.hh
index 97bdc1be..fa5ca166 100644
--- a/src/hb-buffer-private.hh
+++ b/src/hb-buffer-private.hh
@@ -69,6 +69,7 @@ enum hb_buffer_scratch_flags_t {
   HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK		= 0x00000004u,
   HB_BUFFER_SCRATCH_FLAG_HAS_GPOS_ATTACHMENT		= 0x00000008u,
   HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK		= 0x00000010u,
+  HB_BUFFER_SCRATCH_FLAG_HAS_CGJ			= 0x00000020u,
 
   /* Reserved for complex shapers' internal use. */
   HB_BUFFER_SCRATCH_FLAG_COMPLEX0			= 0x01000000u,
diff --git a/src/hb-ot-layout-private.hh b/src/hb-ot-layout-private.hh
index 0f0926f8..be5695d9 100644
--- a/src/hb-ot-layout-private.hh
+++ b/src/hb-ot-layout-private.hh
@@ -280,7 +280,11 @@ _hb_glyph_info_set_unicode_props (hb_glyph_info_t *info, hb_buffer_t *buffer)
       else if (unlikely (hb_in_range (u, 0xE0020u, 0xE007Fu))) props |= UPROPS_MASK_HIDDEN;
       /* COMBINING GRAPHEME JOINER should not be skipped; at least some times.
        * https://github.com/harfbuzz/harfbuzz/issues/554 */
-      else if (unlikely (u == 0x034Fu)) props |= UPROPS_MASK_HIDDEN;
+      else if (unlikely (u == 0x034Fu))
+      {
+	buffer->scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_CGJ;
+	props |= UPROPS_MASK_HIDDEN;
+      }
     }
     else if (unlikely (HB_UNICODE_GENERAL_CATEGORY_IS_NON_ENCLOSING_MARK_OR_MODIFIER_SYMBOL (gen_cat)))
     {
@@ -388,6 +392,11 @@ _hb_glyph_info_is_default_ignorable_and_not_hidden (const hb_glyph_info_t *info)
 	  == UPROPS_MASK_IGNORABLE) &&
 	 !_hb_glyph_info_ligated (info);
 }
+static inline void
+_hb_glyph_info_unhide (hb_glyph_info_t *info)
+{
+  info->unicode_props() &= ~ UPROPS_MASK_HIDDEN;
+}
 
 static inline bool
 _hb_glyph_info_is_unicode_format (const hb_glyph_info_t *info)
diff --git a/src/hb-ot-shape-complex-arabic.cc b/src/hb-ot-shape-complex-arabic.cc
index eb9d36ff..47961bfd 100644
--- a/src/hb-ot-shape-complex-arabic.cc
+++ b/src/hb-ot-shape-complex-arabic.cc
@@ -644,13 +644,15 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan,
 {
   hb_glyph_info_t *info = buffer->info;
 
+  DEBUG_MSG (ARABIC, buffer, "Reordering marks from %d to %d", start, end);
+
   unsigned int i = start;
   for (unsigned int cc = 220; cc <= 230; cc += 10)
   {
-    DEBUG_MSG (ARABIC, buffer, "Looking for %d's starting at %d\n", cc, i);
+    DEBUG_MSG (ARABIC, buffer, "Looking for %d's starting at %d", cc, i);
     while (i < end && info_cc(info[i]) < cc)
       i++;
-    DEBUG_MSG (ARABIC, buffer, "Looking for %d's stopped at %d\n", cc, i);
+    DEBUG_MSG (ARABIC, buffer, "Looking for %d's stopped at %d", cc, i);
 
     if (i == end)
       break;
@@ -658,20 +660,17 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan,
     if (info_cc(info[i]) > cc)
       continue;
 
-    /* Technically we should also check "info_cc(info[j]) == cc"
-     * in the following loop.  But not doing it is safe; we might
-     * end up moving all the 220 MCMs and 230 MCMs together in one
-     * move and be done. */
     unsigned int j = i;
-    while (j < end && info_is_mcm (info[j]))
+    while (j < end && info_cc(info[j]) == cc && info_is_mcm (info[j]))
       j++;
-    DEBUG_MSG (ARABIC, buffer, "Found %d's from %d to %d\n", cc, i, j);
 
     if (i == j)
       continue;
 
+    DEBUG_MSG (ARABIC, buffer, "Found %d's from %d to %d", cc, i, j);
+
     /* Shift it! */
-    DEBUG_MSG (ARABIC, buffer, "Shifting %d's: %d %d\n", cc, i, j);
+    DEBUG_MSG (ARABIC, buffer, "Shifting %d's: %d %d", cc, i, j);
     hb_glyph_info_t temp[HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS];
     assert (j - i <= ARRAY_LENGTH (temp));
     buffer->merge_clusters (start, j);
@@ -679,7 +678,25 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan,
     memmove (&info[start + j - i], &info[start], (i - start) * sizeof (hb_glyph_info_t));
     memmove (&info[start], temp, (j - i) * sizeof (hb_glyph_info_t));
 
-    start += j - i;
+    /* Renumber CC such that the reordered sequence is still sorted.
+     * 22 and 26 are chosen because they are smaller than all Arabic categories,
+     * and are folded back to 220/230 respectively during fallback mark positioning.
+     *
+     * We do this because the CGJ-handling logic in the normalizer relies on
+     * mark sequences having an increasing order even after this reordering.
+     * https://github.com/harfbuzz/harfbuzz/issues/554
+     * This, however, does break some obscure sequences, where the normalizer
+     * might compose a sequence that it should not.  For example, in the seequence
+     * ALEF, HAMZAH, MADDAH, we should NOT try to compose ALEF+MADDAH, but with this
+     * renumbering, we will.
+     */
+    unsigned int new_start = start + j - i;
+    unsigned int new_cc = cc == 220 ? HB_MODIFIED_COMBINING_CLASS_CCC22 : HB_MODIFIED_COMBINING_CLASS_CCC26;
+    while (start < new_start)
+    {
+      _hb_glyph_info_set_modified_combining_class (&info[start], new_cc);
+      start++;
+    }
 
     i = j;
   }
diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc
index fd9e7c2a..62cbb9de 100644
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@@ -345,9 +345,8 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
       if (_hb_glyph_info_get_modified_combining_class (&buffer->info[end]) == 0)
         break;
 
-    /* We are going to do a O(n^2).  Only do this if the sequence is short,
-     * but not too short ;). */
-    if (end - i < 2 || end - i > HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS) {
+    /* We are going to do a O(n^2).  Only do this if the sequence is short. */
+    if (end - i > HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS) {
       i = end;
       continue;
     }
@@ -373,13 +372,11 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
   buffer->clear_output ();
   count = buffer->len;
   unsigned int starter = 0;
-  bool combine = true;
   buffer->next_glyph ();
   while (buffer->idx < count && !buffer->in_error)
   {
     hb_codepoint_t composed, glyph;
-    if (combine &&
-	/* We don't try to compose a non-mark character with it's preceding starter.
+    if (/* We don't try to compose a non-mark character with it's preceding starter.
 	 * This is both an optimization to avoid trying to compose every two neighboring
 	 * glyphs in most scripts AND a desired feature for Hangul.  Apparently Hangul
 	 * fonts are not designed to mix-and-match pre-composed syllables and Jamo. */
@@ -410,22 +407,27 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
 
 	continue;
       }
-      else if (/* We sometimes custom-tailor the sorted order of marks. In that case, stop
-		* trying to combine as soon as combining-class drops. */
-	       starter < buffer->out_len - 1 &&
-	       info_cc (buffer->prev()) > info_cc (buffer->cur()))
-        combine = false;
     }
 
     /* Blocked, or doesn't compose. */
     buffer->next_glyph ();
 
     if (info_cc (buffer->prev()) == 0)
-    {
       starter = buffer->out_len - 1;
-      combine = true;
-    }
   }
   buffer->swap_buffers ();
 
+  if (buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_CGJ)
+  {
+    /* For all CGJ, check if it prevented any reordering at all.
+     * If it did NOT, then make it skippable.
+     * https://github.com/harfbuzz/harfbuzz/issues/554
+     */
+    for (unsigned int i = 1; i + 1 < buffer->len; i++)
+      if (buffer->info[i].codepoint == 0x034Fu/*CGJ*/ &&
+	  info_cc(buffer->info[i-1]) <= info_cc(buffer->info[i+1]))
+      {
+	_hb_glyph_info_unhide (&buffer->info[i]);
+      }
+  }
 }
diff --git a/test/shaping/tests/arabic-mark-order.tests b/test/shaping/tests/arabic-mark-order.tests
index b48c82fd..cc5226bb 100644
--- a/test/shaping/tests/arabic-mark-order.tests
+++ b/test/shaping/tests/arabic-mark-order.tests
@@ -1,2 +1,6 @@
 fonts/sha1sum/94a5d6fb15a27521fba9ea4aee9cb39b2d03322a.ttf::U+064A,U+064E,U+0670,U+0653,U+0640,U+0654,U+064E,U+0627:[afii57415.zz04=7+481|afii57454=4 at 25,975+0|uni0654=4 at -50,50+0|afii57440=4+650|uni0670_uni0653=0 at 75,400+0|afii57454=0 at 750,1125+0|afii57450.calt=0+1331]
 fonts/sha1sum/24b8d24d00ae86f49791b746da4c9d3f717a51a8.ttf::U+0628,U+0618,U+0619,U+064E,U+064F,U+0654,U+0658,U+0653,U+0654,U+0651,U+0656,U+0651,U+065C,U+0655,U+0650:[uni0653.small=0 at 266,2508+0|uni0654=0 at 308,2151+0|uni0655=0 at 518,-1544+0|uni065C=0 at 501,-1453+0|uni0656=0 at 573,-659+0|uni0650=0 at 500,133+0|uni0619=0 at 300,1807+0|uni0618=0 at 357,1674+0|uni0651064E=0 at 387,1178+0|uni0651=0 at 402,764+0|uni0658=0 at 424,404+0|uni0654064F=0 at 540,-435+0|uni0628=0+1352]
+fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0655,U+034F,U+0650:[uni0650.small2=0 at 727,-774+0|space=0+0|uni0655=0 at 727,-209+0|uni0649=0+1566]
+fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0655,U+0650:[uni0650.small2=0 at 727,-774+0|uni0655=0 at 727,-209+0|uni0649=0+1566]
+fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0650,U+0655:[uni0650.small2=0 at 727,-774+0|uni0655=0 at 727,-209+0|uni0649=0+1566]
+fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0650,U+034F,U+0655:[uni0655=0+0|space=0+0|uni0650=0 at 166,0+0|uni0649=0+1566]