[HarfBuzz] harfbuzz: Branch 'master' - 17 commits

Behdad Esfahbod behdad at kemper.freedesktop.org
Sat Nov 24 14:48:15 UTC 2018


 docs/usermanual-clusters.xml    |  772 ++++++++++++++++++++++++++--------------
 src/hb-aat-layout-kerx-table.hh |    5 
 src/hb-aat-layout-morx-table.hh |    5 
 src/hb-aat-map.hh               |    4 
 src/hb-dsalgs.hh                |  110 +++++
 src/hb-face.cc                  |    6 
 src/hb-open-type.hh             |  106 ++---
 src/hb-ot-map.hh                |    4 
 src/hb-set.hh                   |    2 
 src/hb-vector.hh                |   88 +---
 10 files changed, 698 insertions(+), 404 deletions(-)

New commits:
commit 8dcc1913a1670ede7b124f7b5b775d7ab8791386
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 09:47:45 2018 -0500

    [kerx/morx] Make sure object length is sanitized before accessing it

diff --git a/src/hb-aat-layout-kerx-table.hh b/src/hb-aat-layout-kerx-table.hh
index 521c4c72..2d548932 100644
--- a/src/hb-aat-layout-kerx-table.hh
+++ b/src/hb-aat-layout-kerx-table.hh
@@ -962,6 +962,11 @@ struct KerxTable
     unsigned int count = thiz()->tableCount;
     for (unsigned int i = 0; i < count; i++)
     {
+      if (unlikely (!st->u.header.sanitize (c)))
+      {
+	c->reset_object ();
+	return_trace (false);
+      }
       /* OpenType kern table has 2-byte subtable lengths.  That's limiting.
        * MS implementation also only supports one subtable, of format 0,
        * anyway.  Certain versions of some fonts, like Calibry, contain
diff --git a/src/hb-aat-layout-morx-table.hh b/src/hb-aat-layout-morx-table.hh
index 7a39eea8..5b44a4cf 100644
--- a/src/hb-aat-layout-morx-table.hh
+++ b/src/hb-aat-layout-morx-table.hh
@@ -1061,6 +1061,11 @@ struct Chain
     unsigned int count = subtableCount;
     for (unsigned int i = 0; i < count; i++)
     {
+      if (unlikely (!c->check_struct (subtable)))
+      {
+	c->reset_object ();
+	return_trace (false);
+      }
       c->set_object (*subtable);
       if (!subtable->sanitize (c))
 	return_trace (false);
commit 70d80c90fe2f4eca66bec3e1d313bbf7e4d0ab65
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:59:50 2018 -0500

    [arrays] Port ArrayOf.qsort() and hb_vector_t.qsort() to hb_array_t

diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index 87606072..cc1a1d42 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -560,6 +560,9 @@ struct hb_bytes_t
 };
 
 template <typename Type>
+struct hb_sorted_array_t;
+
+template <typename Type>
 struct hb_array_t
 {
   static_assert ((bool) (unsigned) hb_static_size (Type), "");
@@ -600,13 +603,20 @@ struct hb_array_t
     return not_found;
   }
 
-  inline void qsort (int (*cmp)(const void*, const void*))
+  inline hb_sorted_array_t<Type> qsort (int (*cmp)(const void*, const void*))
   {
     ::qsort (arrayZ, len, sizeof (Type), cmp);
+    return hb_sorted_array_t<Type> (*this);
+  }
+  inline hb_sorted_array_t<Type> qsort (void)
+  {
+    ::qsort (arrayZ, len, sizeof (Type), Type::cmp);
+    return hb_sorted_array_t<Type> (*this);
   }
-  inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
+  inline void qsort (unsigned int start, unsigned int end)
   {
     end = MIN (end, len);
+    assert (start <= end);
     ::qsort (arrayZ + start, end - start, sizeof (Type), Type::cmp);
   }
 
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index 4704861d..c7154c42 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -375,6 +375,16 @@ struct UnsizedArrayOf
   inline hb_array_t<const Type> as_array (unsigned int len) const
   { return hb_array (arrayZ, len); }
 
+  template <typename T>
+  inline Type &lsearch (unsigned int len, const T &x)
+  { return *as_array (len).lsearch (x, &Crap (T)); }
+  template <typename T>
+  inline const Type &lsearch (unsigned int len, const T &x) const
+  { return *as_array (len).lsearch (x, &Null (T)); }
+
+  inline void qsort (unsigned int len, unsigned int start = 0, unsigned int end = (unsigned int) -1)
+  { as_array (len).qsort (start, end); }
+
   inline bool sanitize (hb_sanitize_context_t *c, unsigned int count) const
   {
     TRACE_SANITIZE (this);
@@ -577,8 +587,8 @@ struct ArrayOf
   inline const Type &lsearch (const T &x) const
   { return *as_array ().lsearch (x, &Null (T)); }
 
-  inline void qsort (void)
-  { as_array ().qsort (); }
+  inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
+  { as_array ().qsort (start, end); }
 
   inline bool sanitize_shallow (hb_sanitize_context_t *c) const
   {
commit 073d837aa2394d29dda72679802d583c559c3c5b
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:48:48 2018 -0500

    [arrays] Port ArrayOf.qsort() to hb_array_t's

diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index eaefc3bc..4704861d 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -578,9 +578,7 @@ struct ArrayOf
   { return *as_array ().lsearch (x, &Null (T)); }
 
   inline void qsort (void)
-  {
-    ::qsort (arrayZ, len, sizeof (Type), Type::cmp);
-  }
+  { as_array ().qsort (); }
 
   inline bool sanitize_shallow (hb_sanitize_context_t *c) const
   {
commit ad5c871d801b481f95dd32c8b65ecc70def597be
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:47:49 2018 -0500

    [arrays] Add copy-constructor to hb_array_t and hb_sorted_array_t

diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index ef030732..87606072 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -564,6 +564,7 @@ struct hb_array_t
 {
   static_assert ((bool) (unsigned) hb_static_size (Type), "");
 
+  inline hb_array_t (const hb_array_t &o) : arrayZ (o.arrayZ), len (o.len) {}
   inline hb_array_t (Type *array_, unsigned int len_) : arrayZ (array_), len (len_) {}
 
   inline Type& operator [] (unsigned int i) const
@@ -642,6 +643,7 @@ inline hb_array_t<T> hb_array (T *array, unsigned int len)
 template <typename Type>
 struct hb_sorted_array_t : hb_array_t<Type>
 {
+  inline hb_sorted_array_t (const hb_array_t<Type> &o) : hb_array_t<Type> (o) {}
   inline hb_sorted_array_t (Type *array_, unsigned int len_) : hb_array_t<Type> (array_, len_) {}
 
   template <typename T>
commit 61de55bf496c1edb120e4d096140eb1125552bbe
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:45:58 2018 -0500

    [arrays] Port hb_vector_t.qsort() to hb_array_t's

diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index e66f2b4c..ef030732 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -572,6 +572,10 @@ struct hb_array_t
     return arrayZ[i];
   }
 
+  template <typename T> inline operator  T * (void) const { return arrayZ; }
+
+  inline Type * operator & (void) const { return arrayZ; }
+
   inline unsigned int get_size (void) const { return len * sizeof (Type); }
 
   template <typename T>
@@ -595,9 +599,15 @@ struct hb_array_t
     return not_found;
   }
 
-  template <typename T> inline operator  T * (void) const { return arrayZ; }
-
-  inline Type * operator & (void) const { return arrayZ; }
+  inline void qsort (int (*cmp)(const void*, const void*))
+  {
+    ::qsort (arrayZ, len, sizeof (Type), cmp);
+  }
+  inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
+  {
+    end = MIN (end, len);
+    ::qsort (arrayZ + start, end - start, sizeof (Type), Type::cmp);
+  }
 
   inline hb_array_t<Type> sub_array (unsigned int start_offset, unsigned int seg_count) const
   {
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index a5fa4142..a8c98d22 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -221,15 +221,9 @@ struct hb_vector_t
   }
 
   inline void qsort (int (*cmp)(const void*, const void*))
-  {
-    ::qsort (arrayZ(), len, sizeof (Type), cmp);
-  }
-
+  { as_array ().qsort (cmp); }
   inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
-  {
-    end = MIN (end, len);
-    ::qsort (arrayZ() + start, end - start, sizeof (Type), Type::cmp);
-  }
+  { as_array ().qsort (start, end); }
 
   template <typename T>
   inline Type *lsearch (const T &x, Type *not_found = nullptr)
commit e3face8e791d677f94154e8a7f3d787d0d69a02f
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:42:17 2018 -0500

    [arrays] Remove one flavor of hb_vector_t.qsort()

diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index 085523f8..a5fa4142 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -225,13 +225,9 @@ struct hb_vector_t
     ::qsort (arrayZ(), len, sizeof (Type), cmp);
   }
 
-  inline void qsort (void)
-  {
-    ::qsort (arrayZ(), len, sizeof (Type), Type::cmp);
-  }
-
-  inline void qsort (unsigned int start, unsigned int end)
+  inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
   {
+    end = MIN (end, len);
     ::qsort (arrayZ() + start, end - start, sizeof (Type), Type::cmp);
   }
 
commit 7c1600dcd9813ca560ecccc5c54877a5750caf4e
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:37:11 2018 -0500

    [arrays] Add (unused) SortedUnsizedArrayOf<>

diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index 0ea88431..eaefc3bc 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -449,6 +449,27 @@ struct UnsizedOffsetListOf : UnsizedOffsetArrayOf<Type, OffsetType, has_null>
   }
 };
 
+/* An array with sorted elements.  Supports binary searching. */
+template <typename Type>
+struct SortedUnsizedArrayOf : UnsizedArrayOf<Type>
+{
+  inline hb_sorted_array_t<Type> as_array (unsigned int len)
+  { return hb_sorted_array (this->arrayZ, len); }
+  inline hb_sorted_array_t<const Type> as_array (unsigned int len) const
+  { return hb_sorted_array (this->arrayZ, len); }
+
+  template <typename T>
+  inline Type &bsearch (unsigned int len, const T &x)
+  { return *as_array (len).bsearch (x, &Crap (Type)); }
+  template <typename T>
+  inline const Type &bsearch (unsigned int len, const T &x) const
+  { return *as_array (len).bsearch (x, &Null (Type)); }
+  template <typename T>
+  inline bool bfind (unsigned int len, const T &x, unsigned int *i = nullptr) const
+  { return as_array (len).bfind (x, i); }
+};
+
+
 /* An array with a number of elements. */
 template <typename Type, typename LenType=HBUINT16>
 struct ArrayOf
commit e700392f5cbf366f1e03dc7e7b1a2eb6c3027b92
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:31:00 2018 -0500

    [arrays] Port SortedArrayOf.bsearch/bfind to hb_sorted_array_t's

diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index a7592828..e66f2b4c 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -635,22 +635,19 @@ struct hb_sorted_array_t : hb_array_t<Type>
   inline hb_sorted_array_t (Type *array_, unsigned int len_) : hb_array_t<Type> (array_, len_) {}
 
   template <typename T>
-  inline Type *bsearch (const T &x,
-			Type *not_found = nullptr)
+  inline Type *bsearch (const T &x, Type *not_found = nullptr)
   {
     unsigned int i;
     return bfind (x, &i) ? &this->arrayZ[i] : not_found;
   }
   template <typename T>
-  inline const Type *bsearch (const T &x,
-			      const Type *not_found = nullptr) const
+  inline const Type *bsearch (const T &x, const Type *not_found = nullptr) const
   {
     unsigned int i;
     return bfind (x, &i) ? &this->arrayZ[i] : not_found;
   }
   template <typename T>
-  inline bool bfind (const T &x,
-		     unsigned int *i = nullptr) const
+  inline bool bfind (const T &x, unsigned int *i = nullptr) const
   {
     int min = 0, max = (int) this->len - 1;
     const Type *array = this->arrayZ;
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index e384a7ad..0ea88431 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -551,14 +551,10 @@ struct ArrayOf
 
   template <typename T>
   inline Type &lsearch (const T &x)
-  {
-    return *as_array ().lsearch (x, &Crap (T));
-  }
+  { return *as_array ().lsearch (x, &Crap (T)); }
   template <typename T>
   inline const Type &lsearch (const T &x) const
-  {
-    return *as_array ().lsearch (x, &Null (T));
-  }
+  { return *as_array ().lsearch (x, &Null (T)); }
 
   inline void qsort (void)
   {
@@ -745,46 +741,20 @@ struct ArrayOfM1
 template <typename Type, typename LenType=HBUINT16>
 struct SortedArrayOf : ArrayOf<Type, LenType>
 {
+  inline hb_sorted_array_t<Type> as_array (void)
+  { return hb_sorted_array (this->arrayZ, this->len); }
+  inline hb_sorted_array_t<const Type> as_array (void) const
+  { return hb_sorted_array (this->arrayZ, this->len); }
+
   template <typename T>
   inline Type &bsearch (const T &x)
-  {
-    unsigned int i;
-    return bfind (x, &i) ? this->arrayZ[i] : Crap(Type);
-  }
+  { return *as_array ().bsearch (x, &Crap (Type)); }
   template <typename T>
   inline const Type &bsearch (const T &x) const
-  {
-    unsigned int i;
-    return bfind (x, &i) ? this->arrayZ[i] : Null(Type);
-  }
+  { return *as_array ().bsearch (x, &Null (Type)); }
   template <typename T>
   inline bool bfind (const T &x, unsigned int *i = nullptr) const
-  {
-    int min = 0, max = (int) this->len - 1;
-    const Type *array = this->arrayZ;
-    while (min <= max)
-    {
-      int mid = ((unsigned int) min + (unsigned int) max) / 2;
-      int c = array[mid].cmp (x);
-      if (c < 0)
-        max = mid - 1;
-      else if (c > 0)
-        min = mid + 1;
-      else
-      {
-	if (i)
-	  *i = mid;
-	return true;
-      }
-    }
-    if (i)
-    {
-      if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0))
-	max++;
-      *i = max;
-    }
-    return false;
-  }
+  { return as_array ().bfind (x, i); }
 };
 
 /*
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index 14967a41..085523f8 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -236,36 +236,21 @@ struct hb_vector_t
   }
 
   template <typename T>
-  inline Type *lsearch (const T &x,
-			Type *not_found = nullptr)
-  {
-    return as_array ().lsearch (x, not_found);
-  }
+  inline Type *lsearch (const T &x, Type *not_found = nullptr)
+  { return as_array ().lsearch (x, not_found); }
   template <typename T>
-  inline const Type *lsearch (const T &x,
-			      const Type *not_found = nullptr) const
-  {
-    return as_array ().lsearch (x, not_found);
-  }
+  inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
+  { return as_array ().lsearch (x, not_found); }
 
   template <typename T>
-  inline Type *bsearch (const T &x,
-			Type *not_found = nullptr)
-  {
-    return as_sorted_array ().bsearch (x, not_found);
-  }
+  inline Type *bsearch (const T &x, Type *not_found = nullptr)
+  { return as_sorted_array ().bsearch (x, not_found); }
   template <typename T>
-  inline const Type *bsearch (const T &x,
-			      const Type *not_found = nullptr) const
-  {
-    return as_sorted_array ().bsearch (x, not_found);
-  }
+  inline const Type *bsearch (const T &x, const Type *not_found = nullptr) const
+  { return as_sorted_array ().bsearch (x, not_found); }
   template <typename T>
-  inline bool bfind (const T &x,
-		     unsigned int *i = nullptr) const
-  {
-    return as_sorted_array ().bfind (x, i);
-  }
+  inline bool bfind (const T &x, unsigned int *i = nullptr) const
+  { return as_sorted_array ().bfind (x, i); }
 };
 
 
commit e604306f2829804e9016966c1378166253b19d29
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:24:48 2018 -0500

    [arrays] Port hb_vector_t.bsearch/bfind to (new) hb_sorted_array_t's

diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index 1d5bb352..a7592828 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -564,7 +564,6 @@ struct hb_array_t
 {
   static_assert ((bool) (unsigned) hb_static_size (Type), "");
 
-  inline hb_array_t (void) : arrayZ (nullptr), len (0) {}
   inline hb_array_t (Type *array_, unsigned int len_) : arrayZ (array_), len (len_) {}
 
   inline Type& operator [] (unsigned int i) const
@@ -576,7 +575,8 @@ struct hb_array_t
   inline unsigned int get_size (void) const { return len * sizeof (Type); }
 
   template <typename T>
-  inline Type *lsearch (const T &x, Type *not_found = nullptr)
+  inline Type *lsearch (const T &x,
+			Type *not_found = nullptr)
   {
     unsigned int count = len;
     for (unsigned int i = 0; i < count; i++)
@@ -585,7 +585,8 @@ struct hb_array_t
     return not_found;
   }
   template <typename T>
-  inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
+  inline const Type *lsearch (const T &x,
+			      const Type *not_found = nullptr) const
   {
     unsigned int count = len;
     for (unsigned int i = 0; i < count; i++)
@@ -625,7 +626,61 @@ struct hb_array_t
   unsigned int len;
 };
 template <typename T>
-inline hb_array_t<T> hb_array (T *array, unsigned int len) { return hb_array_t<T> (array, len); }
+inline hb_array_t<T> hb_array (T *array, unsigned int len)
+{ return hb_array_t<T> (array, len); }
+
+template <typename Type>
+struct hb_sorted_array_t : hb_array_t<Type>
+{
+  inline hb_sorted_array_t (Type *array_, unsigned int len_) : hb_array_t<Type> (array_, len_) {}
+
+  template <typename T>
+  inline Type *bsearch (const T &x,
+			Type *not_found = nullptr)
+  {
+    unsigned int i;
+    return bfind (x, &i) ? &this->arrayZ[i] : not_found;
+  }
+  template <typename T>
+  inline const Type *bsearch (const T &x,
+			      const Type *not_found = nullptr) const
+  {
+    unsigned int i;
+    return bfind (x, &i) ? &this->arrayZ[i] : not_found;
+  }
+  template <typename T>
+  inline bool bfind (const T &x,
+		     unsigned int *i = nullptr) const
+  {
+    int min = 0, max = (int) this->len - 1;
+    const Type *array = this->arrayZ;
+    while (min <= max)
+    {
+      int mid = ((unsigned int) min + (unsigned int) max) / 2;
+      int c = array[mid].cmp (x);
+      if (c < 0)
+        max = mid - 1;
+      else if (c > 0)
+        min = mid + 1;
+      else
+      {
+	if (i)
+	  *i = mid;
+	return true;
+      }
+    }
+    if (i)
+    {
+      if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0))
+	max++;
+      *i = max;
+    }
+    return false;
+  }
+};
+template <typename T>
+inline hb_sorted_array_t<T> hb_sorted_array (T *array, unsigned int len)
+{ return hb_sorted_array_t<T> (array, len); }
 
 
 struct HbOpOr
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index 7fa38b1c..e384a7ad 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -370,8 +370,10 @@ struct UnsizedArrayOf
   inline unsigned int get_size (unsigned int len) const
   { return len * Type::static_size; }
 
-  inline hb_array_t<Type> as_array (unsigned int len) { return hb_array_t<Type> (arrayZ, len); }
-  inline hb_array_t<const Type> as_array (unsigned int len) const { return hb_array_t<const Type> (arrayZ, len); }
+  inline hb_array_t<Type> as_array (unsigned int len)
+  { return hb_array (arrayZ, len); }
+  inline hb_array_t<const Type> as_array (unsigned int len) const
+  { return hb_array (arrayZ, len); }
 
   inline bool sanitize (hb_sanitize_context_t *c, unsigned int count) const
   {
@@ -483,8 +485,10 @@ struct ArrayOf
   inline unsigned int get_size (void) const
   { return len.static_size + len * Type::static_size; }
 
-  inline hb_array_t<Type> as_array (void) { return hb_array_t<Type> (arrayZ, len); }
-  inline hb_array_t<const Type> as_array (void) const { return hb_array_t<const Type> (arrayZ, len); }
+  inline hb_array_t<Type> as_array (void)
+  { return hb_array (arrayZ, len); }
+  inline hb_array_t<const Type> as_array (void) const
+  { return hb_array (arrayZ, len); }
 
   inline bool serialize (hb_serialize_context_t *c,
 			 unsigned int items_len)
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index d1fb65df..14967a41 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -91,8 +91,15 @@ struct hb_vector_t
     return arrayZ()[i];
   }
 
-  inline hb_array_t<Type> as_array (void) { return hb_array_t<Type> (arrayZ(), len); }
-  inline hb_array_t<const Type> as_array (void) const { return hb_array_t<const Type> (arrayZ(), len); }
+  inline hb_array_t<Type> as_array (void)
+  { return hb_array (arrayZ(), len); }
+  inline hb_array_t<const Type> as_array (void) const
+  { return hb_array (arrayZ(), len); }
+
+  inline hb_sorted_array_t<Type> as_sorted_array (void)
+  { return hb_sorted_array (arrayZ(), len); }
+  inline hb_sorted_array_t<const Type> as_sorted_array (void) const
+  { return hb_sorted_array (arrayZ(), len); }
 
   template <typename T> inline operator  T * (void) { return arrayZ(); }
   template <typename T> inline operator const T * (void) const { return arrayZ(); }
@@ -229,55 +236,35 @@ struct hb_vector_t
   }
 
   template <typename T>
-  inline Type *lsearch (const T &x, Type *not_found = nullptr)
+  inline Type *lsearch (const T &x,
+			Type *not_found = nullptr)
   {
     return as_array ().lsearch (x, not_found);
   }
   template <typename T>
-  inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
+  inline const Type *lsearch (const T &x,
+			      const Type *not_found = nullptr) const
   {
     return as_array ().lsearch (x, not_found);
   }
 
   template <typename T>
-  inline Type *bsearch (const T &x)
+  inline Type *bsearch (const T &x,
+			Type *not_found = nullptr)
   {
-    unsigned int i;
-    return bfind (x, &i) ? &arrayZ()[i] : nullptr;
+    return as_sorted_array ().bsearch (x, not_found);
   }
   template <typename T>
-  inline const Type *bsearch (const T &x) const
+  inline const Type *bsearch (const T &x,
+			      const Type *not_found = nullptr) const
   {
-    unsigned int i;
-    return bfind (x, &i) ? &arrayZ()[i] : nullptr;
+    return as_sorted_array ().bsearch (x, not_found);
   }
   template <typename T>
-  inline bool bfind (const T &x, unsigned int *i = nullptr) const
+  inline bool bfind (const T &x,
+		     unsigned int *i = nullptr) const
   {
-    int min = 0, max = (int) this->len - 1;
-    const Type *array = this->arrayZ();
-    while (min <= max)
-    {
-      int mid = ((unsigned int) min + (unsigned int) max) / 2;
-      int c = array[mid].cmp (x);
-      if (c < 0)
-        max = mid - 1;
-      else if (c > 0)
-        min = mid + 1;
-      else
-      {
-	if (i)
-	  *i = mid;
-	return true;
-      }
-    }
-    if (i)
-    {
-      if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0))
-	max++;
-      *i = max;
-    }
-    return false;
+    return as_sorted_array ().bfind (x, i);
   }
 };
 
commit 268eca24921e85eda98f4f0cce05d40c7235ba62
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:11:12 2018 -0500

    [arrays] Port (unused) ArrayOf.lsearch() to hb_array_t's

diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index 4e0c033f..7fa38b1c 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -548,20 +548,12 @@ struct ArrayOf
   template <typename T>
   inline Type &lsearch (const T &x)
   {
-    unsigned int count = len;
-    for (unsigned int i = 0; i < count; i++)
-      if (!this->arrayZ[i].cmp (x))
-	return this->arrayZ[i];
-    return Crap (T);
+    return *as_array ().lsearch (x, &Crap (T));
   }
   template <typename T>
   inline const Type &lsearch (const T &x) const
   {
-    unsigned int count = len;
-    for (unsigned int i = 0; i < count; i++)
-      if (!this->arrayZ[i].cmp (x))
-	return this->arrayZ[i];
-    return Null (T);
+    return *as_array ().lsearch (x, &Null (T));
   }
 
   inline void qsort (void)
commit 830856ba6b9454bf507e00416f9d45e9975fb7dc
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:09:28 2018 -0500

    [arrays] Port hb_vector_t.lsearch() to hb_array_t's

diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index c133a532..1d5bb352 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -575,9 +575,24 @@ struct hb_array_t
 
   inline unsigned int get_size (void) const { return len * sizeof (Type); }
 
-  template <typename hb_sanitize_context_t>
-  inline bool sanitize (hb_sanitize_context_t *c) const
-  { return c->check_array (arrayZ, len); }
+  template <typename T>
+  inline Type *lsearch (const T &x, Type *not_found = nullptr)
+  {
+    unsigned int count = len;
+    for (unsigned int i = 0; i < count; i++)
+      if (!this->arrayZ[i].cmp (x))
+	return &this->arrayZ[i];
+    return not_found;
+  }
+  template <typename T>
+  inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
+  {
+    unsigned int count = len;
+    for (unsigned int i = 0; i < count; i++)
+      if (!this->arrayZ[i].cmp (x))
+	return &this->arrayZ[i];
+    return not_found;
+  }
 
   template <typename T> inline operator  T * (void) const { return arrayZ; }
 
@@ -601,6 +616,11 @@ struct hb_array_t
 
   inline void free (void) { ::free ((void *) arrayZ); arrayZ = nullptr; len = 0; }
 
+  template <typename hb_sanitize_context_t>
+  inline bool sanitize (hb_sanitize_context_t *c) const
+  { return c->check_array (arrayZ, len); }
+
+  public:
   Type *arrayZ;
   unsigned int len;
 };
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index e17f8897..d1fb65df 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -229,22 +229,14 @@ struct hb_vector_t
   }
 
   template <typename T>
-  inline Type *lsearch (const T &x)
+  inline Type *lsearch (const T &x, Type *not_found = nullptr)
   {
-    Type *array = arrayZ();
-    for (unsigned int i = 0; i < len; i++)
-      if (0 == array[i].cmp (x))
-	return &array[i];
-    return nullptr;
+    return as_array ().lsearch (x, not_found);
   }
   template <typename T>
-  inline const Type *lsearch (const T &x) const
+  inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
   {
-    const Type *array = arrayZ();
-    for (unsigned int i = 0; i < len; i++)
-      if (0 == array[i].cmp (x))
-	return &array[i];
-    return nullptr;
+    return as_array ().lsearch (x, not_found);
   }
 
   template <typename T>
commit 96cf0889804b7d72a96274b25641bb18f7dd2e1e
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 01:07:15 2018 -0500

    [arrays] More

diff --git a/src/hb-face.cc b/src/hb-face.cc
index e23842fb..5b33784f 100644
--- a/src/hb-face.cc
+++ b/src/hb-face.cc
@@ -588,10 +588,10 @@ struct hb_face_builder_data_t
 {
   struct table_entry_t
   {
-    inline int cmp (const hb_tag_t *t) const
+    inline int cmp (hb_tag_t t) const
     {
-      if (*t < tag) return -1;
-      if (*t > tag) return -1;
+      if (t < tag) return -1;
+      if (t > tag) return -1;
       return 0;
     }
 
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index 436b92bb..e17f8897 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -233,7 +233,7 @@ struct hb_vector_t
   {
     Type *array = arrayZ();
     for (unsigned int i = 0; i < len; i++)
-      if (0 == array[i].cmp (&x))
+      if (0 == array[i].cmp (x))
 	return &array[i];
     return nullptr;
   }
@@ -242,7 +242,7 @@ struct hb_vector_t
   {
     const Type *array = arrayZ();
     for (unsigned int i = 0; i < len; i++)
-      if (0 == array[i].cmp (&x))
+      if (0 == array[i].cmp (x))
 	return &array[i];
     return nullptr;
   }
commit 3e26c8d2b10fc08642c25c7f13aef68b0b1008f6
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 00:58:44 2018 -0500

    [arrays] Update ArrayOf.lsearch()
    
    Currently unused apparently

diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index f255ea12..4e0c033f 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -545,14 +545,23 @@ struct ArrayOf
     return_trace (true);
   }
 
-  template <typename SearchType>
-  inline int lsearch (const SearchType &x) const
+  template <typename T>
+  inline Type &lsearch (const T &x)
+  {
+    unsigned int count = len;
+    for (unsigned int i = 0; i < count; i++)
+      if (!this->arrayZ[i].cmp (x))
+	return this->arrayZ[i];
+    return Crap (T);
+  }
+  template <typename T>
+  inline const Type &lsearch (const T &x) const
   {
     unsigned int count = len;
     for (unsigned int i = 0; i < count; i++)
       if (!this->arrayZ[i].cmp (x))
-	return i;
-    return -1;
+	return this->arrayZ[i];
+    return Null (T);
   }
 
   inline void qsort (void)
commit 22e1857b01c71714245ddca05cb3fa0127bf7da2
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Sat Nov 24 00:53:19 2018 -0500

    [arrays] Change argument type of cmp called by hb_vector_t.bsearch()
    
    Towards consolidating all array bsearch/...

diff --git a/src/hb-aat-map.hh b/src/hb-aat-map.hh
index 07454b2c..7d85c7c3 100644
--- a/src/hb-aat-map.hh
+++ b/src/hb-aat-map.hh
@@ -77,9 +77,9 @@ struct hb_aat_map_builder_t
 	     (a->seq < b->seq ? -1 : a->seq > b->seq ? 1 : 0);
     }
 
-    int cmp (const short unsigned int *ty) const
+    int cmp (unsigned int ty) const
     {
-      return (type != *ty) ? (type < *ty ? -1 : 1) : 0;
+      return (type != ty) ? (type < ty ? -1 : 1) : 0;
     }
   };
 
diff --git a/src/hb-ot-map.hh b/src/hb-ot-map.hh
index 8e1f5aa8..0a5a4fbc 100644
--- a/src/hb-ot-map.hh
+++ b/src/hb-ot-map.hh
@@ -57,8 +57,8 @@ struct hb_ot_map_t
     unsigned int auto_zwj : 1;
     unsigned int random : 1;
 
-    inline int cmp (const hb_tag_t *tag_) const
-    { return *tag_ < tag ? -1 : *tag_ > tag ? 1 : 0; }
+    inline int cmp (const hb_tag_t tag_) const
+    { return tag_ < tag ? -1 : tag_ > tag ? 1 : 0; }
   };
 
   struct lookup_map_t {
diff --git a/src/hb-set.hh b/src/hb-set.hh
index cc061a7c..a464131d 100644
--- a/src/hb-set.hh
+++ b/src/hb-set.hh
@@ -45,7 +45,7 @@ struct hb_set_t
 
   struct page_map_t
   {
-    inline int cmp (const page_map_t *o) const { return (int) o->major - (int) major; }
+    inline int cmp (const page_map_t &o) const { return (int) o.major - (int) major; }
 
     uint32_t major;
     uint32_t index;
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index b30511d8..436b92bb 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -267,7 +267,7 @@ struct hb_vector_t
     while (min <= max)
     {
       int mid = ((unsigned int) min + (unsigned int) max) / 2;
-      int c = array[mid].cmp (&x);
+      int c = array[mid].cmp (x);
       if (c < 0)
         max = mid - 1;
       else if (c > 0)
@@ -281,7 +281,7 @@ struct hb_vector_t
     }
     if (i)
     {
-      if (max < 0 || (max < (int) this->len && array[max].cmp (&x) > 0))
+      if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0))
 	max++;
       *i = max;
     }
commit 5fdf7b724eb3cb5ac60cd7f90d3250877ad7ca06
Author: Nathan Willis <nwillis at glyphography.com>
Date:   Thu Nov 15 17:40:21 2018 -0600

    Usermanual: clusters chapter; add brief grapheme definition and clarify monotonous cluster handling.

diff --git a/docs/usermanual-clusters.xml b/docs/usermanual-clusters.xml
index c59818fc..f48e89c2 100644
--- a/docs/usermanual-clusters.xml
+++ b/docs/usermanual-clusters.xml
@@ -14,15 +14,29 @@
       unit.
     </para>
     <para>
-      During the shaping process, some shaping operations may
-      merge adjacent characters (for example, when two code points form
-      a ligature and are replaced by a single glyph) or split one
-      character into several (for example, when performing the Unicode
-      canonical decomposition of a code point).
+      A cluster is distinct from a <emphasis>grapheme</emphasis>,
+      which is the smallest unit of a writing system or script,
+      because clusters are only relevant for script shaping and the
+      layout of glyphs.
+    </para>
+    <para>
+      For example, a grapheme may be a letter, a number, a logogram,
+      or a symbol. When two letters form a ligature, however, they
+      combine into a single glyph. They are therefore part of the same
+      cluster and are treated as a unit — even though the two
+      original, underlying letters are separate graphemes.
+    </para>
+    <para>
+      During the shaping process, there are several shaping operations
+      that may merge adjacent characters (for example, when two code
+      points form a ligature or a conjunct form and are replaced by a
+      single glyph) or split one character into several (for example,
+      when decomposing a code point through the
+      <literal>ccmp</literal> feature).
     </para>
     <para>
       HarfBuzz tracks clusters independently from how these
-      shaping operations alter the individual glyphs that comprise the
+      shaping operations affect the individual glyphs that comprise the
       output HarfBuzz returns in a buffer. Consequently,
       a client program using HarfBuzz can utilize the cluster
       information to implement features such as:
@@ -69,15 +83,15 @@
       </listitem>
     </itemizedlist>
     <para>
-      When you add text to a HarfBuzz buffer, each code point is assigned
-      a <emphasis>cluster value</emphasis>.
+      When you add text to a HarfBuzz buffer, each code point must be
+      assigned a <emphasis>cluster value</emphasis>.
     </para>
     <para>
       This cluster value is an arbitrary number; HarfBuzz uses it only
       to distinguish between clusters. Many client programs will use
       the index of each code point in the input text stream as the
-      cluster value, for the sake of convenience; the actual value does
-      not matter.
+      cluster value. This is for the sake of convenience; the actual
+      value does not matter.
     </para>
     <para>
       Client programs can choose how HarfBuzz handles clusters during
@@ -100,7 +114,7 @@
 	  as well as the <emphasis>Zero Width Joiner</emphasis> and
 	  <emphasis>Zero Width Non-Joiner</emphasis> code points, are
 	  assigned the cluster value of the closest preceding code
-	  point from <emphasis>diferent</emphasis> category. 
+	  point from <emphasis>different</emphasis> category. 
 	</para>
 	<para>
 	  In essence, whenever a base character is followed by a mark
@@ -161,22 +175,30 @@
       </listitem>
     </itemizedlist>
     <para>
+      As mentioned earlier, client programs using HarfBuzz often
+      assign initial cluster values in a buffer by reusing the indices
+      of the code points in the input text. This gives a sequence of
+      cluster values that is monotonically increasing (for example,
+      0,1,2,3,4,5). 
+    </para>
+    <para>
       It is not <emphasis>required</emphasis> that the cluster values
       in a buffer be monotonically increasing. However, if the initial
       cluster values in a buffer are monotonic and the buffer is
-      configured to use clustering level 0 or 1, then HarfBuzz
+      configured to use cluster level 0 or 1, then HarfBuzz
       guarantees that the final cluster values in the shaped buffer
       will also be monotonic. No such guarantee is made for cluster
       level 2.
     </para>
     <para>
-      In levels 0 and 1, HarfBuzz implements the following conceptual model for
-      cluster values:
+      In levels 0 and 1, HarfBuzz implements the following conceptual
+      model for cluster values:
     </para>
     <itemizedlist spacing="compact">
       <listitem>
 	<para>
-          The sequence of cluster values will always remain monotonic.
+          If the sequence of input cluster values is monotonic, the
+	  sequence of cluster values will remain monotonic.
 	</para>
       </listitem>
       <listitem>
@@ -231,7 +253,7 @@
     </programlisting>
     <para>
       During shaping, HarfBuzz maps these characters to glyphs from
-      the font. For simplicity, let's assume that each character maps
+      the font. For simplicity, let us assume that each character maps
       to the corresponding, identical-looking glyph:
     </para>
     <programlisting>
@@ -297,7 +319,7 @@
       0,1,2,3,4
     </programlisting>
     <para>
-      If <literal>D</literal> is reordered before <literal>B</literal>,
+      If <literal>D</literal> is reordered to before <literal>B</literal>,
       then HarfBuzz merges the <literal>B</literal>,
       <literal>C</literal>, and <literal>D</literal> clusters, and we
       get:
commit 939220e57da613e090d247aa1af2396c28370af4
Author: Nathan Willis <nwillis at glyphography.com>
Date:   Thu Nov 15 15:47:03 2018 -0600

    Usermanual: clusters chapter, minor updates.

diff --git a/docs/usermanual-clusters.xml b/docs/usermanual-clusters.xml
index f7db0f59..c59818fc 100644
--- a/docs/usermanual-clusters.xml
+++ b/docs/usermanual-clusters.xml
@@ -30,20 +30,21 @@
     <itemizedlist>
       <listitem>
 	<para>
-	  Correctly positioning the cursor between two characters that
-	  have combined into a single glyph by forming a ligature.
+	  Correctly positioning the cursor within a shaped text run,
+	  even when characters have formed ligatures, composed or
+	  decomposed, reordered, or undergone other shaping operations.
 	</para>
       </listitem>
       <listitem>
 	<para>
 	  Correctly highlighting a text selection that includes some,
-	  but not all, of the characters comprising a ligature. 
+	  but not all, of the characters in a word. 
 	</para>
       </listitem>
       <listitem>
 	<para>
 	  Applying text attributes (such as color or underlining) to
-	  part, but not all, of a composed base-and-mark combination.
+	  part, but not all, of a word.
 	</para>
       </listitem>
       <listitem>
@@ -54,6 +55,12 @@
       </listitem>
       <listitem>
 	<para>
+	  Determining the mapping between input characters and output
+	  glyphs, such as which glyphs are ligatures.
+	</para>
+      </listitem>
+      <listitem>
+	<para>
 	  Performing line-breaking, justification, and other
 	  line-level or paragraph-level operations that must be done
 	  after shaping is complete, but which require character-level
@@ -69,7 +76,7 @@
       This cluster value is an arbitrary number; HarfBuzz uses it only
       to distinguish between clusters. Many client programs will use
       the index of each code point in the input text stream as the
-      cluster value, as a matter of convenience; the actual value does
+      cluster value, for the sake of convenience; the actual value does
       not matter.
     </para>
     <para>
@@ -122,10 +129,10 @@
 	  Level 1 differs from level 0 by not merging the 
 	  clusters of marks and other modifier code points with the
 	  preceding "base" code point's cluster. By preserving the
-	  cluster values of these marks and modifier code points,
-	  script shaping can perform additional operations that might
-	  lead to improved results (for example, reordering a sequence
-	  of marks).
+	  separate cluster values of these marks and modifier code
+	  points, script shapers can perform additional operations
+	  that might lead to improved results (for example, reordering
+	  a sequence of marks).
 	</para>
 	<para>
 	  Client programs can specify level 1 behavior for a buffer by
commit 53ac46e974cf0ee8720b40ef394714eb97ff53b9
Author: Nathan Willis <nwillis at glyphography.com>
Date:   Mon Nov 12 12:17:06 2018 -0600

    Usermanual: expand clusters chapter.

diff --git a/docs/usermanual-clusters.xml b/docs/usermanual-clusters.xml
index 7b2c7adc..f7db0f59 100644
--- a/docs/usermanual-clusters.xml
+++ b/docs/usermanual-clusters.xml
@@ -5,306 +5,509 @@
   <!ENTITY version SYSTEM "version.xml">
 ]>
 <chapter id="clusters">
-<sect1 id="clusters">
   <title>Clusters</title>
-  <para>
-    In shaping text, a <emphasis>cluster</emphasis> is a sequence of
-    code points that needs to be treated as a single, indivisible unit.
-  </para>
-  <para>
-    When you add text to a HB buffer, each character is associated with
-    a <emphasis>cluster value</emphasis>. This is an arbitrary number as
-    far as HB is concerned.
-  </para>
-  <para>
-    Most clients will use UTF-8, UTF-16, or UTF-32 indices, but the
-    actual number does not matter. Moreover, it is not required for the
-    cluster values to be monotonically increasing, but pretty much all
-    of HB's tests are performed on monotonically increasing cluster
-    numbers. Nevertheless, there is no such assumption in the code
-    itself. With that in mind, let's examine what happens with cluster
-    values during shaping under each cluster-level.
-  </para>
-  <para>
-    HarfBuzz provides three <emphasis>levels</emphasis> of clustering
-    support. Level 0 is the default behavior and reproduces the behavior
-    of the old HarfBuzz library. Level 1 tweaks this behavior slightly
-    to produce better results, so level 1 clustering is recommended for
-    code that is not required to implement backward compatibility with
-    the old HarfBuzz.
-  </para>
-  <para>
-    Level 2 differs significantly in how it treats cluster values.
-    Levels 0 and 1 both process ligatures and glyph decomposition by
-    merging clusters; level 2 does not.
-  </para>
-  <para>
-    The conceptual model for what the cluster values mean, in levels 0
-    and 1, is this:
-  </para>
-  <itemizedlist spacing="compact">
-    <listitem>
-      <para>
-        the sequence of cluster values will always remain monotone
-      </para>
-    </listitem>
-    <listitem>
-      <para>
-        each value represents a single cluster
-      </para>
-    </listitem>
-    <listitem>
-      <para>
-        each cluster contains one or more glyphs and one or more
-        characters
-      </para>
-    </listitem>
-  </itemizedlist>
-  <para>
-    Assuming that initial cluster numbers were monotonically increasing
-    and distinct, then all adjacent glyphs having the same cluster
-    number belong to the same cluster, and all characters belong to the
-    cluster that has the highest number not larger than their initial
-    cluster number. This will become clearer with an example.
-  </para>
-</sect1>
-<sect1 id="a-clustering-example-for-levels-0-and-1">
-  <title>A clustering example for levels 0 and 1</title>
-  <para>
-    Let's say we start with the following character sequence and cluster
-    values:
-  </para>
-  <programlisting>
-   A,B,C,D,E
-   0,1,2,3,4
-</programlisting>
-  <para>
-    We then map the characters to glyphs. For simplicity, let's assume
-    that each character maps to the corresponding, identical-looking
-    glyph:
-  </para>
-  <programlisting>
-   A,B,C,D,E
-   0,1,2,3,4
-</programlisting>
-  <para>
-    Now if, for example, <literal>B</literal> and <literal>C</literal>
-    ligate, then the clusters to which they belong "merge".
-    This merged cluster takes for its cluster number the minimum of all
-    the cluster numbers of the clusters that went in. In this case, we
-    get:
-  </para>
-  <programlisting>
-   A,BC,D,E
-   0,1 ,3,4
-</programlisting>
-  <para>
-    Now let's assume that the <literal>BC</literal> glyph decomposes
-    into three components, and <literal>D</literal> also decomposes into
-    two. The components each inherit the cluster value of their parent:
-  </para>
-  <programlisting>
-   A,BC0,BC1,BC2,D0,D1,E
-   0,1  ,1  ,1  ,3 ,3 ,4
-</programlisting>
-  <para>
-    Now if <literal>BC2</literal> and <literal>D0</literal> ligate, then
-    their clusters (numbers 1 and 3) merge into
-    <literal>min(1,3) = 1</literal>:
-  </para>
-  <programlisting>
-   A,BC0,BC1,BC2D0,D1,E
-   0,1  ,1  ,1    ,1 ,4
-</programlisting>
-  <para>
-    At this point, cluster 1 means: the character sequence
-    <literal>BCD</literal> is represented by glyphs
-    <literal>BC0,BC1,BC2D0,D1</literal> and cannot be broken down any
-    further.
-  </para>
-</sect1>
-<sect1 id="reordering-in-levels-0-and-1">
-  <title>Reordering in levels 0 and 1</title>
-  <para>
-    Another common operation in the more complex shapers is when things
-    reorder. In those cases, to maintain monotone clusters, HB merges
-    the clusters of everything in the reordering sequence. For example,
-    let's again start with the character sequence:
-  </para>
-  <programlisting>
-   A,B,C,D,E
-   0,1,2,3,4
-</programlisting>
-  <para>
-    If <literal>D</literal> is reordered before <literal>B</literal>,
-    then the <literal>B</literal>, <literal>C</literal>, and
-    <literal>D</literal> clusters merge, and we get:
-  </para>
-  <programlisting>
-   A,D,B,C,E
-   0,1,1,1,4
-</programlisting>
-  <para>
-    This is clearly not ideal, but it is the only sensible way to
-    maintain monotone indices and retain the true relationship between
-    glyphs and characters.
-  </para>
-</sect1>
-<sect1 id="the-distinction-between-levels-0-and-1">
-  <title>The distinction between levels 0 and 1</title>
-  <para>
-    So, the above is pretty much what cluster levels 0 and 1 do. The
-    only difference between the two is this: in level 0, at the very
-    beginning of the shaping process, we also merge clusters between
-    base characters and all Unicode marks (combining or not) following
-    them. E.g.:
-  </para>
-  <programlisting>
-  A,acute,B
-  0,1    ,2
-</programlisting>
-  <para>
-    will become:
-  </para>
-  <programlisting>
-  A,acute,B
-  0,0    ,2
-</programlisting>
-  <para>
-    This is the default behavior. We do it because Windows did it and
-    old HarfBuzz did it, so this remained the default. But this behavior
-    makes it impossible to color diacritic marks differently from their
-    base characters. That's why in level 1 we do not perform this
-    initial merging step.
-  </para>
-  <para>
-    For clients, level 0 is more convenient if they rely on HarfBuzz
-    clusters for cursor positioning. But that's wrong anyway: cursor
-    positions should be determined based on Unicode grapheme boundaries,
-    NOT shaping clusters. As such, level 1 clusters are preferred.
-  </para>
-  <para>
-    One last note about levels 0 and 1. We currently don't allow a
-    <literal>MultipleSubst</literal> lookup to replace a glyph with zero
-    glyphs (i.e., to delete a glyph). But in some other situations,
-    glyphs can be deleted. In those cases, if the glyph being deleted is
-    the last glyph of its cluster, we make sure to merge the cluster
-    with a neighboring cluster.
-  </para>
-  <para>
-    This is, primarily, to make sure that the starting cluster of the
-    text always has the cluster index pointing to the start of the text
-    for the run; more than one client currently relies on this
-    guarantee.
-  </para>
-  <para>
-    Incidentally, Apple's CoreText does something else to maintain the
-    same promise: it inserts a glyph with id 65535 at the beginning of
-    the glyph string if the glyph corresponding to the first character
-    in the run was deleted. HarfBuzz might do something similar in the
-    future.
-  </para>
-</sect1>
-<sect1 id="level-2">
-  <title>Level 2</title>
-  <para>
-    Level 2 is a different beast from levels 0 and 1. It is simple to
-    describe, but hard to make sense of. It simply doesn't do any
-    cluster merging whatsoever. When things ligate or otherwise multiple
-    glyphs turn into one, the cluster value of the first glyph is
-    retained.
-  </para>
-  <para>
-    Here are a few examples of why processing cluster values produced at
-    this level might be tricky:
-  </para>
-  <sect2 id="ligatures-with-combining-marks">
-    <title>Ligatures with combining marks</title>
-    <para>
-      Imagine capital letters are bases and lower case letters are
-      combining marks. With an input sequence like this:
+  <section id="clusters">
+    <title>Clusters</title>
+    <para>
+      In text shaping, a <emphasis>cluster</emphasis> is a sequence of
+      characters that needs to be treated as a single, indivisible
+      unit.
     </para>
-    <programlisting>
-  A,a,B,b,C,c
-  0,1,2,3,4,5
-</programlisting>
     <para>
-      if <literal>A,B,C</literal> ligate, then here are the cluster
-      values one would get under the various levels:
+      During the shaping process, some shaping operations may
+      merge adjacent characters (for example, when two code points form
+      a ligature and are replaced by a single glyph) or split one
+      character into several (for example, when performing the Unicode
+      canonical decomposition of a code point).
     </para>
     <para>
-      level 0:
+      HarfBuzz tracks clusters independently from how these
+      shaping operations alter the individual glyphs that comprise the
+      output HarfBuzz returns in a buffer. Consequently,
+      a client program using HarfBuzz can utilize the cluster
+      information to implement features such as:
+    </para>
+    <itemizedlist>
+      <listitem>
+	<para>
+	  Correctly positioning the cursor between two characters that
+	  have combined into a single glyph by forming a ligature.
+	</para>
+      </listitem>
+      <listitem>
+	<para>
+	  Correctly highlighting a text selection that includes some,
+	  but not all, of the characters comprising a ligature. 
+	</para>
+      </listitem>
+      <listitem>
+	<para>
+	  Applying text attributes (such as color or underlining) to
+	  part, but not all, of a composed base-and-mark combination.
+	</para>
+      </listitem>
+      <listitem>
+	<para>
+	  Generating output document formats (such as PDF) with
+	  embedded text that can be fully extracted.
+	</para>
+      </listitem>
+      <listitem>
+	<para>
+	  Performing line-breaking, justification, and other
+	  line-level or paragraph-level operations that must be done
+	  after shaping is complete, but which require character-level
+	  properties.
+	</para>
+      </listitem>
+    </itemizedlist>
+    <para>
+      When you add text to a HarfBuzz buffer, each code point is assigned
+      a <emphasis>cluster value</emphasis>.
+    </para>
+    <para>
+      This cluster value is an arbitrary number; HarfBuzz uses it only
+      to distinguish between clusters. Many client programs will use
+      the index of each code point in the input text stream as the
+      cluster value, as a matter of convenience; the actual value does
+      not matter.
+    </para>
+    <para>
+      Client programs can choose how HarfBuzz handles clusters during
+      shaping by setting the
+      <literal>cluster_level</literal> of the
+      buffer. HarfBuzz offers three <emphasis>levels</emphasis> of
+      clustering support for this property:
+    </para>
+    <itemizedlist>
+      <listitem>
+	<para><emphasis>Level 0</emphasis> is the default and
+	reproduces the behavior of the old HarfBuzz library.
+	</para>
+	<para>
+	  The distinguishing feature of level 0 behavior is that, at
+	  the beginning of processing the buffer, all code points that
+	  are categorized as <emphasis>marks</emphasis>,
+	  <emphasis>modifier symbols</emphasis>, or
+	  <emphasis>Emoji extended pictographic</emphasis> modifiers,
+	  as well as the <emphasis>Zero Width Joiner</emphasis> and
+	  <emphasis>Zero Width Non-Joiner</emphasis> code points, are
+	  assigned the cluster value of the closest preceding code
+	  point from <emphasis>diferent</emphasis> category. 
+	</para>
+	<para>
+	  In essence, whenever a base character is followed by a mark
+	  character or a sequence of mark characters, those marks are
+	  reassigned to the same initial cluster value as the base
+	  character. This reassignment is referred to as
+	  "merging" the affected clusters. This behavior is based on
+	  the Grapheme Cluster Boundary specification in <ulink
+	  url="https://www.unicode.org/reports/tr29/#Regex_Definitions">Unicode
+	  Technical Report 29</ulink>.
+	</para>
+	<para>
+	  Client programs can specify level 0 behavior for a buffer by
+	  setting its <literal>cluster_level</literal> to
+	  <literal>HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES</literal>. 
+	</para>
+      </listitem>
+      <listitem>
+	<para>
+	  <emphasis>Level 1</emphasis> tweaks the old behavior
+	  slightly to produce better results. Therefore, level 1
+	  clustering is recommended for code that is not required to
+	  implement backward compatibility with the old HarfBuzz.
+	</para>
+	<para>
+	  Level 1 differs from level 0 by not merging the 
+	  clusters of marks and other modifier code points with the
+	  preceding "base" code point's cluster. By preserving the
+	  cluster values of these marks and modifier code points,
+	  script shaping can perform additional operations that might
+	  lead to improved results (for example, reordering a sequence
+	  of marks).
+	</para>
+	<para>
+	  Client programs can specify level 1 behavior for a buffer by
+	  setting its <literal>cluster_level</literal> to
+	  <literal>HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS</literal>. 
+	</para>
+      </listitem>
+      <listitem>
+	<para>
+	  <emphasis>Level 2</emphasis> differs significantly in how it
+	  treats cluster values. In level 2, HarfBuzz never merges
+	  clusters.
+	</para>
+	<para>
+	  This difference can be seen most clearly when HarfBuzz processes
+	  ligature substitutions and glyph decompositions. In level 0 
+	  and level 1, ligatures and glyph decomposition both involve
+	  merging clusters; in level 2, neither of these operations
+	  triggers a merge.
+	</para>
+	<para>
+	  Client programs can specify level 2 behavior for a buffer by
+	  setting its <literal>cluster_level</literal> to
+	  <literal>HB_BUFFER_CLUSTER_LEVEL_CHARACTERS</literal>. 
+	</para>
+      </listitem>
+    </itemizedlist>
+    <para>
+      It is not <emphasis>required</emphasis> that the cluster values
+      in a buffer be monotonically increasing. However, if the initial
+      cluster values in a buffer are monotonic and the buffer is
+      configured to use clustering level 0 or 1, then HarfBuzz
+      guarantees that the final cluster values in the shaped buffer
+      will also be monotonic. No such guarantee is made for cluster
+      level 2.
+    </para>
+    <para>
+      In levels 0 and 1, HarfBuzz implements the following conceptual model for
+      cluster values:
+    </para>
+    <itemizedlist spacing="compact">
+      <listitem>
+	<para>
+          The sequence of cluster values will always remain monotonic.
+	</para>
+      </listitem>
+      <listitem>
+	<para>
+          Each cluster value represents a single cluster.
+	</para>
+      </listitem>
+      <listitem>
+	<para>
+          Each cluster contains one or more glyphs and one or more
+          characters.
+	</para>
+      </listitem>
+    </itemizedlist>
+    <para>
+      In practice, this model offers several benefits. Assuming that
+      the initial cluster values were monotonically increasing
+      and distinct before shaping began, then, in the final output:
+    </para>
+    <itemizedlist spacing="compact">
+      <listitem>
+	<para>
+	  All adjacent glyphs having the same final cluster
+	  value belong to the same cluster.
+	</para>
+      </listitem>
+      <listitem>
+	<para>
+          Each character belongs to the cluster that has the highest
+	  cluster value <emphasis>not larger than</emphasis> its
+	  initial cluster value.
+	</para>
+      </listitem>
+    </itemizedlist>
+	
+  </section>
+  <section id="a-clustering-example-for-levels-0-and-1">
+    <title>A clustering example for levels 0 and 1</title>
+    <para>
+      The guarantees and benefits of level 0 and level 1 can be seen
+      with some examples. First, let us examine what happens with cluster
+      values when shaping involves cluster merging with ligatures and
+      decomposition.
+    </para>
+    <para>
+      Let's say we start with the following character sequence (top row) and
+      initial cluster values (bottom row):
     </para>
     <programlisting>
-  ABC,a,b,c
-  0  ,0,0,0
-</programlisting>
+      A,B,C,D,E
+      0,1,2,3,4
+    </programlisting>
     <para>
-      level 1:
+      During shaping, HarfBuzz maps these characters to glyphs from
+      the font. For simplicity, let's assume that each character maps
+      to the corresponding, identical-looking glyph:
     </para>
     <programlisting>
-  ABC,a,b,c
-  0  ,0,0,5
-</programlisting>
+      A,B,C,D,E
+      0,1,2,3,4
+    </programlisting>
     <para>
-      level 2:
+      Now if, for example, <literal>B</literal> and <literal>C</literal>
+      form a ligature, then the clusters to which they belong
+      "merge". This merged cluster takes for its cluster
+      value the minimum of all the cluster values of the clusters that
+      went in to the ligature. In this case, we get:
     </para>
     <programlisting>
-  ABC,a,b,c
-  0  ,1,3,5
-</programlisting>
+      A,BC,D,E
+      0,1 ,3,4
+    </programlisting>
+    <para>
+      because 1 is the minimum of the set {1,2}, which were the
+      cluster values of <literal>B</literal> and
+      <literal>C</literal>. 
+    </para>
     <para>
-      Making sense of the last example is the hardest for a client,
-      because there is nothing in the cluster values to suggest that
-      <literal>B</literal> and <literal>C</literal> ligated with
-      <literal>A</literal>.
+      Next, let us say that the <literal>BC</literal> ligature glyph
+      decomposes into three components, and <literal>D</literal> also
+      decomposes into two components. These components each inherit the
+      cluster value of their parent: 
     </para>
-  </sect2>
-  <sect2 id="reordering">
-    <title>Reordering</title>
+    <programlisting>
+      A,BC0,BC1,BC2,D0,D1,E
+      0,1  ,1  ,1  ,3 ,3 ,4
+    </programlisting>
     <para>
-      Another tricky case is when things reorder. Under level 2:
+      Next, if <literal>BC2</literal> and <literal>D0</literal> form a
+      ligature, then their clusters (cluster values 1 and 3) merge into
+      <literal>min(1,3) = 1</literal>:
     </para>
     <programlisting>
-  A,B,C,D,E
-  0,1,2,3,4
-</programlisting>
+      A,BC0,BC1,BC2D0,D1,E
+      0,1  ,1  ,1    ,1 ,4
+    </programlisting>
+    <para>
+      At this point, cluster 1 means: the character sequence
+      <literal>BCD</literal> is represented by glyphs
+      <literal>BC0,BC1,BC2D0,D1</literal> and cannot be broken down any
+      further.
+    </para>
+  </section>
+  <section id="reordering-in-levels-0-and-1">
+    <title>Reordering in levels 0 and 1</title>
+    <para>
+      Another common operation in the more complex shapers is glyph
+      reordering. In order to maintain a monotonic cluster sequence
+      when glyph reordering takes place, HarfBuzz merges the clusters
+      of everything in the reordering sequence.
+    </para>
     <para>
-      Now imagine <literal>D</literal> moves before
-      <literal>B</literal>:
+      For example, let us again start with the character sequence (top
+      row) and initial cluster values (bottom row):
     </para>
     <programlisting>
-  A,D,B,C,E
-  0,3,1,2,4
-</programlisting>
+      A,B,C,D,E
+      0,1,2,3,4
+    </programlisting>
     <para>
-      Now, if <literal>D</literal> ligates with <literal>B</literal>, we
+      If <literal>D</literal> is reordered before <literal>B</literal>,
+      then HarfBuzz merges the <literal>B</literal>,
+      <literal>C</literal>, and <literal>D</literal> clusters, and we
       get:
     </para>
     <programlisting>
-  A,DB,C,E
-  0,3 ,2,4
-</programlisting>
+      A,D,B,C,E
+      0,1,1,1,4
+    </programlisting>
     <para>
-      In a different scenario, <literal>A</literal> and
-      <literal>B</literal> could have ligated
-      <emphasis>before</emphasis> <literal>D</literal> reordered; that
-      would have resulted in:
+      This is clearly not ideal, but it is the only sensible way to
+      maintain a monotonic sequence of cluster values and retain the
+      true relationship between glyphs and characters.
+    </para>
+  </section>
+  <section id="the-distinction-between-levels-0-and-1">
+    <title>The distinction between levels 0 and 1</title>
+    <para>
+      The preceding examples demonstrate the main effects of using
+      cluster levels 0 and 1. The only difference between the two
+      levels is this: in level 0, at the very beginning of the shaping
+      process, HarfBuzz also merges clusters between any base character
+      and all Unicode marks (combining or not) that follow it.
+    </para>
+    <para>
+      For example, let us start with the following character sequence
+      (top row) and accompanying initial cluster values (bottom row):
+    </para>
+    <programlisting>
+      A,acute,B
+      0,1    ,2
+    </programlisting>
+    <para>
+      The <literal>acute</literal> is a Unicode mark. If HarfBuzz is
+      using cluster level 0 on this sequence, then the
+      <literal>A</literal> and <literal>acute</literal> clusters will
+      merge, and the result will become:
     </para>
     <programlisting>
-  AB,D,C,E
-  0 ,3,2,4   
-</programlisting>
+      A,acute,B
+      0,0    ,2
+    </programlisting>
+    <para>
+      This initial cluster merging is the default behavior of the
+      Windows shaping engine, and the old HarfBuzz codebase copied
+      that behavior to maintain compatibility. Consequently, it has
+      remained the default behavior in the new HarfBuzz codebase.
+    </para>
+    <para>
+      But this initial cluster-merging behavior makes it impossible to
+      color diacritic marks differently from their base
+      characters. That is why, in level 1, HarfBuzz does not perform
+      the initial merging step.
+    </para>
+    <para>
+      For client programs that rely on HarfBuzz cluster values to
+      perform cursor positioning, level 0 is more convenient. But
+      relying on cluster boundaries for cursor positioning is wrong: cursor
+      positions should be determined based on Unicode grapheme
+      boundaries, not on shaping-cluster boundaries. As such, level 1
+      clusters are preferred. 
+    </para>
+    <para>
+      One last note about levels 0 and 1. HarfBuzz currently does not allow a
+      <literal>MultipleSubst</literal> lookup to replace a glyph with zero
+      glyphs (in other words, to delete a glyph). But, in some other situations,
+      glyphs can be deleted. In those cases, if the glyph being deleted is
+      the last glyph of its cluster, HarfBuzz makes sure to merge the cluster
+      with a neighboring cluster.
+    </para>
+    <para>
+      This is done primarily to make sure that the starting cluster of the
+      text always has the cluster index pointing to the start of the text
+      for the run; more than one client currently relies on this
+      guarantee.
+    </para>
+    <para>
+      Incidentally, Apple's CoreText does something else to maintain the
+      same promise: it inserts a glyph with id 65535 at the beginning of
+      the glyph string if the glyph corresponding to the first character
+      in the run was deleted. HarfBuzz might do something similar in the
+      future.
+    </para>
+  </section>
+  <section id="level-2">
+    <title>Level 2</title>
+    <para>
+      HarfBuzz's level 2 cluster behavior uses a significantly
+      different model than that of level 0 and level 1.
+    </para>
     <para>
-      There's no way to differentiate between these two scenarios based
-      on the cluster numbers alone.
+      The level 2 behavior is easy to describe, but it may be
+      difficult to understand in practical terms. In brief, level 2 
+      performs no merging of clusters whatsoever.
     </para>
     <para>
-      Another problem happens with ligatures under level 2 if the
-      direction of the text is forced to opposite of its natural
-      direction (e.g. left-to-right Arabic). But that's too much of a
-      corner case to worry about.
+      When glyphs form a ligature (or when some other feature
+      substitutes multiple glyphs with one glyph), the cluster value
+      of the first glyph is retained as the cluster value for the
+      ligature. However, no subsequent clusters — including
+      marks and modifiers — are affected.
     </para>
-  </sect2>
-</sect1>
+    <para>
+      Level 2 cluster behavior is less complex than level 0 or level
+      1, but there are a few cases in which processing cluster values
+      produced at level 2 may be tricky. 
+    </para>
+    <section id="ligatures-with-combining-marks-in-level-2">
+      <title>Ligatures with combining marks in level 2</title>
+      <para>
+	The first example of how HarfBuzz's level 2 cluster behavior
+	can be tricky is when the text to be shaped includes combining
+	marks attached to ligatures.
+      </para>
+      <para>
+	Let us start with an input sequence with the following
+	characters (top row) and initial cluster values (bottom row):
+      </para>
+      <programlisting>
+	A,acute,B,breve,C,circumflex
+	0,1    ,2,3    ,4,5
+      </programlisting>
+      <para>
+	If the sequence <literal>A,B,C</literal> forms a ligature,
+	then these are the cluster values HarfBuzz will return under
+	the various cluster levels:
+      </para>
+      <para>
+	Level 0:
+      </para>
+      <programlisting>
+	ABC,acute,breve,circumflex
+	0  ,0    ,0    ,0
+      </programlisting>
+      <para>
+	Level 1:
+      </para>
+      <programlisting>
+	ABC,acute,breve,circumflex
+	0  ,0    ,0    ,5
+      </programlisting>
+      <para>
+	Level 2:
+      </para>
+      <programlisting>
+	ABC,acute,breve,circumflex
+	0  ,1    ,3    ,5
+      </programlisting>
+      <para>
+	Making sense of the level 2 result is the hardest for a client
+	program, because there is nothing in the cluster values that
+	indicates that <literal>B</literal> and <literal>C</literal>
+	formed a ligature with <literal>A</literal>.
+      </para>
+      <para>
+	In contrast, the "merged" cluster values of the mark glyphs
+	that are seen in the level 0 and level 1 output are evidence
+	that a ligature substitution took place. 
+      </para>
+    </section>
+    <section id="reordering-in-level-2">
+      <title>Reordering in level 2</title>
+      <para>
+	Another example of how HarfBuzz's level 2 cluster behavior
+	can be tricky is when glyphs reorder. Consider an input sequence
+	with the following characters (top row) and initial cluster
+	values (bottom row):
+      </para>
+      <programlisting>
+	A,B,C,D,E
+	0,1,2,3,4
+      </programlisting>
+      <para>
+	Now imagine <literal>D</literal> moves before
+	<literal>B</literal> in a reordering operation. The cluster
+	values will then be:
+      </para>
+      <programlisting>
+	A,D,B,C,E
+	0,3,1,2,4
+      </programlisting>
+      <para>
+	Next, if <literal>D</literal> forms a ligature with
+	<literal>B</literal>, the output is:
+      </para>
+      <programlisting>
+	A,DB,C,E
+	0,3 ,2,4
+      </programlisting>
+      <para>
+	However, in a different scenario, in which the shaping rules
+	of the script instead caused <literal>A</literal> and
+	<literal>B</literal> to form a ligature
+	<emphasis>before</emphasis> the <literal>D</literal> reordered, the
+	result would be:
+      </para>
+      <programlisting>
+	AB,D,C,E
+	0 ,3,2,4   
+      </programlisting>
+      <para>
+	There is no way for a client program to differentiate between
+	these two scenarios based on the cluster values
+	alone. Consequently, client programs that use level 2 might
+	need to undertake additional work in order to manage cursor
+	positioning, text attributes, or other desired features.
+      </para>
+    </section>
+    <section id="other-considerations-in-level-2">
+      <title>Other considerations in level 2</title>
+      <para>
+	There may be other problems encountered with ligatures under
+	level 2, such as if the direction of the text is forced to
+	opposite of its natural direction (for example, left-to-right
+	Arabic). But, generally speaking, these other scenarios are
+	minor corner cases that are too obscure for most client
+	programs to need to worry about.
+      </para>
+    </section>
+  </section>
 </chapter>


More information about the HarfBuzz mailing list