[HarfBuzz] harfbuzz: Branch 'master' - 17 commits
Behdad Esfahbod
behdad at kemper.freedesktop.org
Sat Nov 24 14:48:15 UTC 2018
docs/usermanual-clusters.xml | 772 ++++++++++++++++++++++++++--------------
src/hb-aat-layout-kerx-table.hh | 5
src/hb-aat-layout-morx-table.hh | 5
src/hb-aat-map.hh | 4
src/hb-dsalgs.hh | 110 +++++
src/hb-face.cc | 6
src/hb-open-type.hh | 106 ++---
src/hb-ot-map.hh | 4
src/hb-set.hh | 2
src/hb-vector.hh | 88 +---
10 files changed, 698 insertions(+), 404 deletions(-)
New commits:
commit 8dcc1913a1670ede7b124f7b5b775d7ab8791386
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 09:47:45 2018 -0500
[kerx/morx] Make sure object length is sanitized before accessing it
diff --git a/src/hb-aat-layout-kerx-table.hh b/src/hb-aat-layout-kerx-table.hh
index 521c4c72..2d548932 100644
--- a/src/hb-aat-layout-kerx-table.hh
+++ b/src/hb-aat-layout-kerx-table.hh
@@ -962,6 +962,11 @@ struct KerxTable
unsigned int count = thiz()->tableCount;
for (unsigned int i = 0; i < count; i++)
{
+ if (unlikely (!st->u.header.sanitize (c)))
+ {
+ c->reset_object ();
+ return_trace (false);
+ }
/* OpenType kern table has 2-byte subtable lengths. That's limiting.
* MS implementation also only supports one subtable, of format 0,
* anyway. Certain versions of some fonts, like Calibry, contain
diff --git a/src/hb-aat-layout-morx-table.hh b/src/hb-aat-layout-morx-table.hh
index 7a39eea8..5b44a4cf 100644
--- a/src/hb-aat-layout-morx-table.hh
+++ b/src/hb-aat-layout-morx-table.hh
@@ -1061,6 +1061,11 @@ struct Chain
unsigned int count = subtableCount;
for (unsigned int i = 0; i < count; i++)
{
+ if (unlikely (!c->check_struct (subtable)))
+ {
+ c->reset_object ();
+ return_trace (false);
+ }
c->set_object (*subtable);
if (!subtable->sanitize (c))
return_trace (false);
commit 70d80c90fe2f4eca66bec3e1d313bbf7e4d0ab65
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:59:50 2018 -0500
[arrays] Port ArrayOf.qsort() and hb_vector_t.qsort() to hb_array_t
diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index 87606072..cc1a1d42 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -560,6 +560,9 @@ struct hb_bytes_t
};
template <typename Type>
+struct hb_sorted_array_t;
+
+template <typename Type>
struct hb_array_t
{
static_assert ((bool) (unsigned) hb_static_size (Type), "");
@@ -600,13 +603,20 @@ struct hb_array_t
return not_found;
}
- inline void qsort (int (*cmp)(const void*, const void*))
+ inline hb_sorted_array_t<Type> qsort (int (*cmp)(const void*, const void*))
{
::qsort (arrayZ, len, sizeof (Type), cmp);
+ return hb_sorted_array_t<Type> (*this);
+ }
+ inline hb_sorted_array_t<Type> qsort (void)
+ {
+ ::qsort (arrayZ, len, sizeof (Type), Type::cmp);
+ return hb_sorted_array_t<Type> (*this);
}
- inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
+ inline void qsort (unsigned int start, unsigned int end)
{
end = MIN (end, len);
+ assert (start <= end);
::qsort (arrayZ + start, end - start, sizeof (Type), Type::cmp);
}
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index 4704861d..c7154c42 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -375,6 +375,16 @@ struct UnsizedArrayOf
inline hb_array_t<const Type> as_array (unsigned int len) const
{ return hb_array (arrayZ, len); }
+ template <typename T>
+ inline Type &lsearch (unsigned int len, const T &x)
+ { return *as_array (len).lsearch (x, &Crap (T)); }
+ template <typename T>
+ inline const Type &lsearch (unsigned int len, const T &x) const
+ { return *as_array (len).lsearch (x, &Null (T)); }
+
+ inline void qsort (unsigned int len, unsigned int start = 0, unsigned int end = (unsigned int) -1)
+ { as_array (len).qsort (start, end); }
+
inline bool sanitize (hb_sanitize_context_t *c, unsigned int count) const
{
TRACE_SANITIZE (this);
@@ -577,8 +587,8 @@ struct ArrayOf
inline const Type &lsearch (const T &x) const
{ return *as_array ().lsearch (x, &Null (T)); }
- inline void qsort (void)
- { as_array ().qsort (); }
+ inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
+ { as_array ().qsort (start, end); }
inline bool sanitize_shallow (hb_sanitize_context_t *c) const
{
commit 073d837aa2394d29dda72679802d583c559c3c5b
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:48:48 2018 -0500
[arrays] Port ArrayOf.qsort() to hb_array_t's
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index eaefc3bc..4704861d 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -578,9 +578,7 @@ struct ArrayOf
{ return *as_array ().lsearch (x, &Null (T)); }
inline void qsort (void)
- {
- ::qsort (arrayZ, len, sizeof (Type), Type::cmp);
- }
+ { as_array ().qsort (); }
inline bool sanitize_shallow (hb_sanitize_context_t *c) const
{
commit ad5c871d801b481f95dd32c8b65ecc70def597be
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:47:49 2018 -0500
[arrays] Add copy-constructor to hb_array_t and hb_sorted_array_t
diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index ef030732..87606072 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -564,6 +564,7 @@ struct hb_array_t
{
static_assert ((bool) (unsigned) hb_static_size (Type), "");
+ inline hb_array_t (const hb_array_t &o) : arrayZ (o.arrayZ), len (o.len) {}
inline hb_array_t (Type *array_, unsigned int len_) : arrayZ (array_), len (len_) {}
inline Type& operator [] (unsigned int i) const
@@ -642,6 +643,7 @@ inline hb_array_t<T> hb_array (T *array, unsigned int len)
template <typename Type>
struct hb_sorted_array_t : hb_array_t<Type>
{
+ inline hb_sorted_array_t (const hb_array_t<Type> &o) : hb_array_t<Type> (o) {}
inline hb_sorted_array_t (Type *array_, unsigned int len_) : hb_array_t<Type> (array_, len_) {}
template <typename T>
commit 61de55bf496c1edb120e4d096140eb1125552bbe
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:45:58 2018 -0500
[arrays] Port hb_vector_t.qsort() to hb_array_t's
diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index e66f2b4c..ef030732 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -572,6 +572,10 @@ struct hb_array_t
return arrayZ[i];
}
+ template <typename T> inline operator T * (void) const { return arrayZ; }
+
+ inline Type * operator & (void) const { return arrayZ; }
+
inline unsigned int get_size (void) const { return len * sizeof (Type); }
template <typename T>
@@ -595,9 +599,15 @@ struct hb_array_t
return not_found;
}
- template <typename T> inline operator T * (void) const { return arrayZ; }
-
- inline Type * operator & (void) const { return arrayZ; }
+ inline void qsort (int (*cmp)(const void*, const void*))
+ {
+ ::qsort (arrayZ, len, sizeof (Type), cmp);
+ }
+ inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
+ {
+ end = MIN (end, len);
+ ::qsort (arrayZ + start, end - start, sizeof (Type), Type::cmp);
+ }
inline hb_array_t<Type> sub_array (unsigned int start_offset, unsigned int seg_count) const
{
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index a5fa4142..a8c98d22 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -221,15 +221,9 @@ struct hb_vector_t
}
inline void qsort (int (*cmp)(const void*, const void*))
- {
- ::qsort (arrayZ(), len, sizeof (Type), cmp);
- }
-
+ { as_array ().qsort (cmp); }
inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
- {
- end = MIN (end, len);
- ::qsort (arrayZ() + start, end - start, sizeof (Type), Type::cmp);
- }
+ { as_array ().qsort (start, end); }
template <typename T>
inline Type *lsearch (const T &x, Type *not_found = nullptr)
commit e3face8e791d677f94154e8a7f3d787d0d69a02f
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:42:17 2018 -0500
[arrays] Remove one flavor of hb_vector_t.qsort()
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index 085523f8..a5fa4142 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -225,13 +225,9 @@ struct hb_vector_t
::qsort (arrayZ(), len, sizeof (Type), cmp);
}
- inline void qsort (void)
- {
- ::qsort (arrayZ(), len, sizeof (Type), Type::cmp);
- }
-
- inline void qsort (unsigned int start, unsigned int end)
+ inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
{
+ end = MIN (end, len);
::qsort (arrayZ() + start, end - start, sizeof (Type), Type::cmp);
}
commit 7c1600dcd9813ca560ecccc5c54877a5750caf4e
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:37:11 2018 -0500
[arrays] Add (unused) SortedUnsizedArrayOf<>
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index 0ea88431..eaefc3bc 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -449,6 +449,27 @@ struct UnsizedOffsetListOf : UnsizedOffsetArrayOf<Type, OffsetType, has_null>
}
};
+/* An array with sorted elements. Supports binary searching. */
+template <typename Type>
+struct SortedUnsizedArrayOf : UnsizedArrayOf<Type>
+{
+ inline hb_sorted_array_t<Type> as_array (unsigned int len)
+ { return hb_sorted_array (this->arrayZ, len); }
+ inline hb_sorted_array_t<const Type> as_array (unsigned int len) const
+ { return hb_sorted_array (this->arrayZ, len); }
+
+ template <typename T>
+ inline Type &bsearch (unsigned int len, const T &x)
+ { return *as_array (len).bsearch (x, &Crap (Type)); }
+ template <typename T>
+ inline const Type &bsearch (unsigned int len, const T &x) const
+ { return *as_array (len).bsearch (x, &Null (Type)); }
+ template <typename T>
+ inline bool bfind (unsigned int len, const T &x, unsigned int *i = nullptr) const
+ { return as_array (len).bfind (x, i); }
+};
+
+
/* An array with a number of elements. */
template <typename Type, typename LenType=HBUINT16>
struct ArrayOf
commit e700392f5cbf366f1e03dc7e7b1a2eb6c3027b92
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:31:00 2018 -0500
[arrays] Port SortedArrayOf.bsearch/bfind to hb_sorted_array_t's
diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index a7592828..e66f2b4c 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -635,22 +635,19 @@ struct hb_sorted_array_t : hb_array_t<Type>
inline hb_sorted_array_t (Type *array_, unsigned int len_) : hb_array_t<Type> (array_, len_) {}
template <typename T>
- inline Type *bsearch (const T &x,
- Type *not_found = nullptr)
+ inline Type *bsearch (const T &x, Type *not_found = nullptr)
{
unsigned int i;
return bfind (x, &i) ? &this->arrayZ[i] : not_found;
}
template <typename T>
- inline const Type *bsearch (const T &x,
- const Type *not_found = nullptr) const
+ inline const Type *bsearch (const T &x, const Type *not_found = nullptr) const
{
unsigned int i;
return bfind (x, &i) ? &this->arrayZ[i] : not_found;
}
template <typename T>
- inline bool bfind (const T &x,
- unsigned int *i = nullptr) const
+ inline bool bfind (const T &x, unsigned int *i = nullptr) const
{
int min = 0, max = (int) this->len - 1;
const Type *array = this->arrayZ;
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index e384a7ad..0ea88431 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -551,14 +551,10 @@ struct ArrayOf
template <typename T>
inline Type &lsearch (const T &x)
- {
- return *as_array ().lsearch (x, &Crap (T));
- }
+ { return *as_array ().lsearch (x, &Crap (T)); }
template <typename T>
inline const Type &lsearch (const T &x) const
- {
- return *as_array ().lsearch (x, &Null (T));
- }
+ { return *as_array ().lsearch (x, &Null (T)); }
inline void qsort (void)
{
@@ -745,46 +741,20 @@ struct ArrayOfM1
template <typename Type, typename LenType=HBUINT16>
struct SortedArrayOf : ArrayOf<Type, LenType>
{
+ inline hb_sorted_array_t<Type> as_array (void)
+ { return hb_sorted_array (this->arrayZ, this->len); }
+ inline hb_sorted_array_t<const Type> as_array (void) const
+ { return hb_sorted_array (this->arrayZ, this->len); }
+
template <typename T>
inline Type &bsearch (const T &x)
- {
- unsigned int i;
- return bfind (x, &i) ? this->arrayZ[i] : Crap(Type);
- }
+ { return *as_array ().bsearch (x, &Crap (Type)); }
template <typename T>
inline const Type &bsearch (const T &x) const
- {
- unsigned int i;
- return bfind (x, &i) ? this->arrayZ[i] : Null(Type);
- }
+ { return *as_array ().bsearch (x, &Null (Type)); }
template <typename T>
inline bool bfind (const T &x, unsigned int *i = nullptr) const
- {
- int min = 0, max = (int) this->len - 1;
- const Type *array = this->arrayZ;
- while (min <= max)
- {
- int mid = ((unsigned int) min + (unsigned int) max) / 2;
- int c = array[mid].cmp (x);
- if (c < 0)
- max = mid - 1;
- else if (c > 0)
- min = mid + 1;
- else
- {
- if (i)
- *i = mid;
- return true;
- }
- }
- if (i)
- {
- if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0))
- max++;
- *i = max;
- }
- return false;
- }
+ { return as_array ().bfind (x, i); }
};
/*
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index 14967a41..085523f8 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -236,36 +236,21 @@ struct hb_vector_t
}
template <typename T>
- inline Type *lsearch (const T &x,
- Type *not_found = nullptr)
- {
- return as_array ().lsearch (x, not_found);
- }
+ inline Type *lsearch (const T &x, Type *not_found = nullptr)
+ { return as_array ().lsearch (x, not_found); }
template <typename T>
- inline const Type *lsearch (const T &x,
- const Type *not_found = nullptr) const
- {
- return as_array ().lsearch (x, not_found);
- }
+ inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
+ { return as_array ().lsearch (x, not_found); }
template <typename T>
- inline Type *bsearch (const T &x,
- Type *not_found = nullptr)
- {
- return as_sorted_array ().bsearch (x, not_found);
- }
+ inline Type *bsearch (const T &x, Type *not_found = nullptr)
+ { return as_sorted_array ().bsearch (x, not_found); }
template <typename T>
- inline const Type *bsearch (const T &x,
- const Type *not_found = nullptr) const
- {
- return as_sorted_array ().bsearch (x, not_found);
- }
+ inline const Type *bsearch (const T &x, const Type *not_found = nullptr) const
+ { return as_sorted_array ().bsearch (x, not_found); }
template <typename T>
- inline bool bfind (const T &x,
- unsigned int *i = nullptr) const
- {
- return as_sorted_array ().bfind (x, i);
- }
+ inline bool bfind (const T &x, unsigned int *i = nullptr) const
+ { return as_sorted_array ().bfind (x, i); }
};
commit e604306f2829804e9016966c1378166253b19d29
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:24:48 2018 -0500
[arrays] Port hb_vector_t.bsearch/bfind to (new) hb_sorted_array_t's
diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index 1d5bb352..a7592828 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -564,7 +564,6 @@ struct hb_array_t
{
static_assert ((bool) (unsigned) hb_static_size (Type), "");
- inline hb_array_t (void) : arrayZ (nullptr), len (0) {}
inline hb_array_t (Type *array_, unsigned int len_) : arrayZ (array_), len (len_) {}
inline Type& operator [] (unsigned int i) const
@@ -576,7 +575,8 @@ struct hb_array_t
inline unsigned int get_size (void) const { return len * sizeof (Type); }
template <typename T>
- inline Type *lsearch (const T &x, Type *not_found = nullptr)
+ inline Type *lsearch (const T &x,
+ Type *not_found = nullptr)
{
unsigned int count = len;
for (unsigned int i = 0; i < count; i++)
@@ -585,7 +585,8 @@ struct hb_array_t
return not_found;
}
template <typename T>
- inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
+ inline const Type *lsearch (const T &x,
+ const Type *not_found = nullptr) const
{
unsigned int count = len;
for (unsigned int i = 0; i < count; i++)
@@ -625,7 +626,61 @@ struct hb_array_t
unsigned int len;
};
template <typename T>
-inline hb_array_t<T> hb_array (T *array, unsigned int len) { return hb_array_t<T> (array, len); }
+inline hb_array_t<T> hb_array (T *array, unsigned int len)
+{ return hb_array_t<T> (array, len); }
+
+template <typename Type>
+struct hb_sorted_array_t : hb_array_t<Type>
+{
+ inline hb_sorted_array_t (Type *array_, unsigned int len_) : hb_array_t<Type> (array_, len_) {}
+
+ template <typename T>
+ inline Type *bsearch (const T &x,
+ Type *not_found = nullptr)
+ {
+ unsigned int i;
+ return bfind (x, &i) ? &this->arrayZ[i] : not_found;
+ }
+ template <typename T>
+ inline const Type *bsearch (const T &x,
+ const Type *not_found = nullptr) const
+ {
+ unsigned int i;
+ return bfind (x, &i) ? &this->arrayZ[i] : not_found;
+ }
+ template <typename T>
+ inline bool bfind (const T &x,
+ unsigned int *i = nullptr) const
+ {
+ int min = 0, max = (int) this->len - 1;
+ const Type *array = this->arrayZ;
+ while (min <= max)
+ {
+ int mid = ((unsigned int) min + (unsigned int) max) / 2;
+ int c = array[mid].cmp (x);
+ if (c < 0)
+ max = mid - 1;
+ else if (c > 0)
+ min = mid + 1;
+ else
+ {
+ if (i)
+ *i = mid;
+ return true;
+ }
+ }
+ if (i)
+ {
+ if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0))
+ max++;
+ *i = max;
+ }
+ return false;
+ }
+};
+template <typename T>
+inline hb_sorted_array_t<T> hb_sorted_array (T *array, unsigned int len)
+{ return hb_sorted_array_t<T> (array, len); }
struct HbOpOr
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index 7fa38b1c..e384a7ad 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -370,8 +370,10 @@ struct UnsizedArrayOf
inline unsigned int get_size (unsigned int len) const
{ return len * Type::static_size; }
- inline hb_array_t<Type> as_array (unsigned int len) { return hb_array_t<Type> (arrayZ, len); }
- inline hb_array_t<const Type> as_array (unsigned int len) const { return hb_array_t<const Type> (arrayZ, len); }
+ inline hb_array_t<Type> as_array (unsigned int len)
+ { return hb_array (arrayZ, len); }
+ inline hb_array_t<const Type> as_array (unsigned int len) const
+ { return hb_array (arrayZ, len); }
inline bool sanitize (hb_sanitize_context_t *c, unsigned int count) const
{
@@ -483,8 +485,10 @@ struct ArrayOf
inline unsigned int get_size (void) const
{ return len.static_size + len * Type::static_size; }
- inline hb_array_t<Type> as_array (void) { return hb_array_t<Type> (arrayZ, len); }
- inline hb_array_t<const Type> as_array (void) const { return hb_array_t<const Type> (arrayZ, len); }
+ inline hb_array_t<Type> as_array (void)
+ { return hb_array (arrayZ, len); }
+ inline hb_array_t<const Type> as_array (void) const
+ { return hb_array (arrayZ, len); }
inline bool serialize (hb_serialize_context_t *c,
unsigned int items_len)
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index d1fb65df..14967a41 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -91,8 +91,15 @@ struct hb_vector_t
return arrayZ()[i];
}
- inline hb_array_t<Type> as_array (void) { return hb_array_t<Type> (arrayZ(), len); }
- inline hb_array_t<const Type> as_array (void) const { return hb_array_t<const Type> (arrayZ(), len); }
+ inline hb_array_t<Type> as_array (void)
+ { return hb_array (arrayZ(), len); }
+ inline hb_array_t<const Type> as_array (void) const
+ { return hb_array (arrayZ(), len); }
+
+ inline hb_sorted_array_t<Type> as_sorted_array (void)
+ { return hb_sorted_array (arrayZ(), len); }
+ inline hb_sorted_array_t<const Type> as_sorted_array (void) const
+ { return hb_sorted_array (arrayZ(), len); }
template <typename T> inline operator T * (void) { return arrayZ(); }
template <typename T> inline operator const T * (void) const { return arrayZ(); }
@@ -229,55 +236,35 @@ struct hb_vector_t
}
template <typename T>
- inline Type *lsearch (const T &x, Type *not_found = nullptr)
+ inline Type *lsearch (const T &x,
+ Type *not_found = nullptr)
{
return as_array ().lsearch (x, not_found);
}
template <typename T>
- inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
+ inline const Type *lsearch (const T &x,
+ const Type *not_found = nullptr) const
{
return as_array ().lsearch (x, not_found);
}
template <typename T>
- inline Type *bsearch (const T &x)
+ inline Type *bsearch (const T &x,
+ Type *not_found = nullptr)
{
- unsigned int i;
- return bfind (x, &i) ? &arrayZ()[i] : nullptr;
+ return as_sorted_array ().bsearch (x, not_found);
}
template <typename T>
- inline const Type *bsearch (const T &x) const
+ inline const Type *bsearch (const T &x,
+ const Type *not_found = nullptr) const
{
- unsigned int i;
- return bfind (x, &i) ? &arrayZ()[i] : nullptr;
+ return as_sorted_array ().bsearch (x, not_found);
}
template <typename T>
- inline bool bfind (const T &x, unsigned int *i = nullptr) const
+ inline bool bfind (const T &x,
+ unsigned int *i = nullptr) const
{
- int min = 0, max = (int) this->len - 1;
- const Type *array = this->arrayZ();
- while (min <= max)
- {
- int mid = ((unsigned int) min + (unsigned int) max) / 2;
- int c = array[mid].cmp (x);
- if (c < 0)
- max = mid - 1;
- else if (c > 0)
- min = mid + 1;
- else
- {
- if (i)
- *i = mid;
- return true;
- }
- }
- if (i)
- {
- if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0))
- max++;
- *i = max;
- }
- return false;
+ return as_sorted_array ().bfind (x, i);
}
};
commit 268eca24921e85eda98f4f0cce05d40c7235ba62
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:11:12 2018 -0500
[arrays] Port (unused) ArrayOf.lsearch() to hb_array_t's
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index 4e0c033f..7fa38b1c 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -548,20 +548,12 @@ struct ArrayOf
template <typename T>
inline Type &lsearch (const T &x)
{
- unsigned int count = len;
- for (unsigned int i = 0; i < count; i++)
- if (!this->arrayZ[i].cmp (x))
- return this->arrayZ[i];
- return Crap (T);
+ return *as_array ().lsearch (x, &Crap (T));
}
template <typename T>
inline const Type &lsearch (const T &x) const
{
- unsigned int count = len;
- for (unsigned int i = 0; i < count; i++)
- if (!this->arrayZ[i].cmp (x))
- return this->arrayZ[i];
- return Null (T);
+ return *as_array ().lsearch (x, &Null (T));
}
inline void qsort (void)
commit 830856ba6b9454bf507e00416f9d45e9975fb7dc
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:09:28 2018 -0500
[arrays] Port hb_vector_t.lsearch() to hb_array_t's
diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh
index c133a532..1d5bb352 100644
--- a/src/hb-dsalgs.hh
+++ b/src/hb-dsalgs.hh
@@ -575,9 +575,24 @@ struct hb_array_t
inline unsigned int get_size (void) const { return len * sizeof (Type); }
- template <typename hb_sanitize_context_t>
- inline bool sanitize (hb_sanitize_context_t *c) const
- { return c->check_array (arrayZ, len); }
+ template <typename T>
+ inline Type *lsearch (const T &x, Type *not_found = nullptr)
+ {
+ unsigned int count = len;
+ for (unsigned int i = 0; i < count; i++)
+ if (!this->arrayZ[i].cmp (x))
+ return &this->arrayZ[i];
+ return not_found;
+ }
+ template <typename T>
+ inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
+ {
+ unsigned int count = len;
+ for (unsigned int i = 0; i < count; i++)
+ if (!this->arrayZ[i].cmp (x))
+ return &this->arrayZ[i];
+ return not_found;
+ }
template <typename T> inline operator T * (void) const { return arrayZ; }
@@ -601,6 +616,11 @@ struct hb_array_t
inline void free (void) { ::free ((void *) arrayZ); arrayZ = nullptr; len = 0; }
+ template <typename hb_sanitize_context_t>
+ inline bool sanitize (hb_sanitize_context_t *c) const
+ { return c->check_array (arrayZ, len); }
+
+ public:
Type *arrayZ;
unsigned int len;
};
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index e17f8897..d1fb65df 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -229,22 +229,14 @@ struct hb_vector_t
}
template <typename T>
- inline Type *lsearch (const T &x)
+ inline Type *lsearch (const T &x, Type *not_found = nullptr)
{
- Type *array = arrayZ();
- for (unsigned int i = 0; i < len; i++)
- if (0 == array[i].cmp (x))
- return &array[i];
- return nullptr;
+ return as_array ().lsearch (x, not_found);
}
template <typename T>
- inline const Type *lsearch (const T &x) const
+ inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const
{
- const Type *array = arrayZ();
- for (unsigned int i = 0; i < len; i++)
- if (0 == array[i].cmp (x))
- return &array[i];
- return nullptr;
+ return as_array ().lsearch (x, not_found);
}
template <typename T>
commit 96cf0889804b7d72a96274b25641bb18f7dd2e1e
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 01:07:15 2018 -0500
[arrays] More
diff --git a/src/hb-face.cc b/src/hb-face.cc
index e23842fb..5b33784f 100644
--- a/src/hb-face.cc
+++ b/src/hb-face.cc
@@ -588,10 +588,10 @@ struct hb_face_builder_data_t
{
struct table_entry_t
{
- inline int cmp (const hb_tag_t *t) const
+ inline int cmp (hb_tag_t t) const
{
- if (*t < tag) return -1;
- if (*t > tag) return -1;
+ if (t < tag) return -1;
+ if (t > tag) return -1;
return 0;
}
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index 436b92bb..e17f8897 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -233,7 +233,7 @@ struct hb_vector_t
{
Type *array = arrayZ();
for (unsigned int i = 0; i < len; i++)
- if (0 == array[i].cmp (&x))
+ if (0 == array[i].cmp (x))
return &array[i];
return nullptr;
}
@@ -242,7 +242,7 @@ struct hb_vector_t
{
const Type *array = arrayZ();
for (unsigned int i = 0; i < len; i++)
- if (0 == array[i].cmp (&x))
+ if (0 == array[i].cmp (x))
return &array[i];
return nullptr;
}
commit 3e26c8d2b10fc08642c25c7f13aef68b0b1008f6
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 00:58:44 2018 -0500
[arrays] Update ArrayOf.lsearch()
Currently unused apparently
diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh
index f255ea12..4e0c033f 100644
--- a/src/hb-open-type.hh
+++ b/src/hb-open-type.hh
@@ -545,14 +545,23 @@ struct ArrayOf
return_trace (true);
}
- template <typename SearchType>
- inline int lsearch (const SearchType &x) const
+ template <typename T>
+ inline Type &lsearch (const T &x)
+ {
+ unsigned int count = len;
+ for (unsigned int i = 0; i < count; i++)
+ if (!this->arrayZ[i].cmp (x))
+ return this->arrayZ[i];
+ return Crap (T);
+ }
+ template <typename T>
+ inline const Type &lsearch (const T &x) const
{
unsigned int count = len;
for (unsigned int i = 0; i < count; i++)
if (!this->arrayZ[i].cmp (x))
- return i;
- return -1;
+ return this->arrayZ[i];
+ return Null (T);
}
inline void qsort (void)
commit 22e1857b01c71714245ddca05cb3fa0127bf7da2
Author: Behdad Esfahbod <behdad at behdad.org>
Date: Sat Nov 24 00:53:19 2018 -0500
[arrays] Change argument type of cmp called by hb_vector_t.bsearch()
Towards consolidating all array bsearch/...
diff --git a/src/hb-aat-map.hh b/src/hb-aat-map.hh
index 07454b2c..7d85c7c3 100644
--- a/src/hb-aat-map.hh
+++ b/src/hb-aat-map.hh
@@ -77,9 +77,9 @@ struct hb_aat_map_builder_t
(a->seq < b->seq ? -1 : a->seq > b->seq ? 1 : 0);
}
- int cmp (const short unsigned int *ty) const
+ int cmp (unsigned int ty) const
{
- return (type != *ty) ? (type < *ty ? -1 : 1) : 0;
+ return (type != ty) ? (type < ty ? -1 : 1) : 0;
}
};
diff --git a/src/hb-ot-map.hh b/src/hb-ot-map.hh
index 8e1f5aa8..0a5a4fbc 100644
--- a/src/hb-ot-map.hh
+++ b/src/hb-ot-map.hh
@@ -57,8 +57,8 @@ struct hb_ot_map_t
unsigned int auto_zwj : 1;
unsigned int random : 1;
- inline int cmp (const hb_tag_t *tag_) const
- { return *tag_ < tag ? -1 : *tag_ > tag ? 1 : 0; }
+ inline int cmp (const hb_tag_t tag_) const
+ { return tag_ < tag ? -1 : tag_ > tag ? 1 : 0; }
};
struct lookup_map_t {
diff --git a/src/hb-set.hh b/src/hb-set.hh
index cc061a7c..a464131d 100644
--- a/src/hb-set.hh
+++ b/src/hb-set.hh
@@ -45,7 +45,7 @@ struct hb_set_t
struct page_map_t
{
- inline int cmp (const page_map_t *o) const { return (int) o->major - (int) major; }
+ inline int cmp (const page_map_t &o) const { return (int) o.major - (int) major; }
uint32_t major;
uint32_t index;
diff --git a/src/hb-vector.hh b/src/hb-vector.hh
index b30511d8..436b92bb 100644
--- a/src/hb-vector.hh
+++ b/src/hb-vector.hh
@@ -267,7 +267,7 @@ struct hb_vector_t
while (min <= max)
{
int mid = ((unsigned int) min + (unsigned int) max) / 2;
- int c = array[mid].cmp (&x);
+ int c = array[mid].cmp (x);
if (c < 0)
max = mid - 1;
else if (c > 0)
@@ -281,7 +281,7 @@ struct hb_vector_t
}
if (i)
{
- if (max < 0 || (max < (int) this->len && array[max].cmp (&x) > 0))
+ if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0))
max++;
*i = max;
}
commit 5fdf7b724eb3cb5ac60cd7f90d3250877ad7ca06
Author: Nathan Willis <nwillis at glyphography.com>
Date: Thu Nov 15 17:40:21 2018 -0600
Usermanual: clusters chapter; add brief grapheme definition and clarify monotonous cluster handling.
diff --git a/docs/usermanual-clusters.xml b/docs/usermanual-clusters.xml
index c59818fc..f48e89c2 100644
--- a/docs/usermanual-clusters.xml
+++ b/docs/usermanual-clusters.xml
@@ -14,15 +14,29 @@
unit.
</para>
<para>
- During the shaping process, some shaping operations may
- merge adjacent characters (for example, when two code points form
- a ligature and are replaced by a single glyph) or split one
- character into several (for example, when performing the Unicode
- canonical decomposition of a code point).
+ A cluster is distinct from a <emphasis>grapheme</emphasis>,
+ which is the smallest unit of a writing system or script,
+ because clusters are only relevant for script shaping and the
+ layout of glyphs.
+ </para>
+ <para>
+ For example, a grapheme may be a letter, a number, a logogram,
+ or a symbol. When two letters form a ligature, however, they
+ combine into a single glyph. They are therefore part of the same
+ cluster and are treated as a unit — even though the two
+ original, underlying letters are separate graphemes.
+ </para>
+ <para>
+ During the shaping process, there are several shaping operations
+ that may merge adjacent characters (for example, when two code
+ points form a ligature or a conjunct form and are replaced by a
+ single glyph) or split one character into several (for example,
+ when decomposing a code point through the
+ <literal>ccmp</literal> feature).
</para>
<para>
HarfBuzz tracks clusters independently from how these
- shaping operations alter the individual glyphs that comprise the
+ shaping operations affect the individual glyphs that comprise the
output HarfBuzz returns in a buffer. Consequently,
a client program using HarfBuzz can utilize the cluster
information to implement features such as:
@@ -69,15 +83,15 @@
</listitem>
</itemizedlist>
<para>
- When you add text to a HarfBuzz buffer, each code point is assigned
- a <emphasis>cluster value</emphasis>.
+ When you add text to a HarfBuzz buffer, each code point must be
+ assigned a <emphasis>cluster value</emphasis>.
</para>
<para>
This cluster value is an arbitrary number; HarfBuzz uses it only
to distinguish between clusters. Many client programs will use
the index of each code point in the input text stream as the
- cluster value, for the sake of convenience; the actual value does
- not matter.
+ cluster value. This is for the sake of convenience; the actual
+ value does not matter.
</para>
<para>
Client programs can choose how HarfBuzz handles clusters during
@@ -100,7 +114,7 @@
as well as the <emphasis>Zero Width Joiner</emphasis> and
<emphasis>Zero Width Non-Joiner</emphasis> code points, are
assigned the cluster value of the closest preceding code
- point from <emphasis>diferent</emphasis> category.
+ point from <emphasis>different</emphasis> category.
</para>
<para>
In essence, whenever a base character is followed by a mark
@@ -161,22 +175,30 @@
</listitem>
</itemizedlist>
<para>
+ As mentioned earlier, client programs using HarfBuzz often
+ assign initial cluster values in a buffer by reusing the indices
+ of the code points in the input text. This gives a sequence of
+ cluster values that is monotonically increasing (for example,
+ 0,1,2,3,4,5).
+ </para>
+ <para>
It is not <emphasis>required</emphasis> that the cluster values
in a buffer be monotonically increasing. However, if the initial
cluster values in a buffer are monotonic and the buffer is
- configured to use clustering level 0 or 1, then HarfBuzz
+ configured to use cluster level 0 or 1, then HarfBuzz
guarantees that the final cluster values in the shaped buffer
will also be monotonic. No such guarantee is made for cluster
level 2.
</para>
<para>
- In levels 0 and 1, HarfBuzz implements the following conceptual model for
- cluster values:
+ In levels 0 and 1, HarfBuzz implements the following conceptual
+ model for cluster values:
</para>
<itemizedlist spacing="compact">
<listitem>
<para>
- The sequence of cluster values will always remain monotonic.
+ If the sequence of input cluster values is monotonic, the
+ sequence of cluster values will remain monotonic.
</para>
</listitem>
<listitem>
@@ -231,7 +253,7 @@
</programlisting>
<para>
During shaping, HarfBuzz maps these characters to glyphs from
- the font. For simplicity, let's assume that each character maps
+ the font. For simplicity, let us assume that each character maps
to the corresponding, identical-looking glyph:
</para>
<programlisting>
@@ -297,7 +319,7 @@
0,1,2,3,4
</programlisting>
<para>
- If <literal>D</literal> is reordered before <literal>B</literal>,
+ If <literal>D</literal> is reordered to before <literal>B</literal>,
then HarfBuzz merges the <literal>B</literal>,
<literal>C</literal>, and <literal>D</literal> clusters, and we
get:
commit 939220e57da613e090d247aa1af2396c28370af4
Author: Nathan Willis <nwillis at glyphography.com>
Date: Thu Nov 15 15:47:03 2018 -0600
Usermanual: clusters chapter, minor updates.
diff --git a/docs/usermanual-clusters.xml b/docs/usermanual-clusters.xml
index f7db0f59..c59818fc 100644
--- a/docs/usermanual-clusters.xml
+++ b/docs/usermanual-clusters.xml
@@ -30,20 +30,21 @@
<itemizedlist>
<listitem>
<para>
- Correctly positioning the cursor between two characters that
- have combined into a single glyph by forming a ligature.
+ Correctly positioning the cursor within a shaped text run,
+ even when characters have formed ligatures, composed or
+ decomposed, reordered, or undergone other shaping operations.
</para>
</listitem>
<listitem>
<para>
Correctly highlighting a text selection that includes some,
- but not all, of the characters comprising a ligature.
+ but not all, of the characters in a word.
</para>
</listitem>
<listitem>
<para>
Applying text attributes (such as color or underlining) to
- part, but not all, of a composed base-and-mark combination.
+ part, but not all, of a word.
</para>
</listitem>
<listitem>
@@ -54,6 +55,12 @@
</listitem>
<listitem>
<para>
+ Determining the mapping between input characters and output
+ glyphs, such as which glyphs are ligatures.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
Performing line-breaking, justification, and other
line-level or paragraph-level operations that must be done
after shaping is complete, but which require character-level
@@ -69,7 +76,7 @@
This cluster value is an arbitrary number; HarfBuzz uses it only
to distinguish between clusters. Many client programs will use
the index of each code point in the input text stream as the
- cluster value, as a matter of convenience; the actual value does
+ cluster value, for the sake of convenience; the actual value does
not matter.
</para>
<para>
@@ -122,10 +129,10 @@
Level 1 differs from level 0 by not merging the
clusters of marks and other modifier code points with the
preceding "base" code point's cluster. By preserving the
- cluster values of these marks and modifier code points,
- script shaping can perform additional operations that might
- lead to improved results (for example, reordering a sequence
- of marks).
+ separate cluster values of these marks and modifier code
+ points, script shapers can perform additional operations
+ that might lead to improved results (for example, reordering
+ a sequence of marks).
</para>
<para>
Client programs can specify level 1 behavior for a buffer by
commit 53ac46e974cf0ee8720b40ef394714eb97ff53b9
Author: Nathan Willis <nwillis at glyphography.com>
Date: Mon Nov 12 12:17:06 2018 -0600
Usermanual: expand clusters chapter.
diff --git a/docs/usermanual-clusters.xml b/docs/usermanual-clusters.xml
index 7b2c7adc..f7db0f59 100644
--- a/docs/usermanual-clusters.xml
+++ b/docs/usermanual-clusters.xml
@@ -5,306 +5,509 @@
<!ENTITY version SYSTEM "version.xml">
]>
<chapter id="clusters">
-<sect1 id="clusters">
<title>Clusters</title>
- <para>
- In shaping text, a <emphasis>cluster</emphasis> is a sequence of
- code points that needs to be treated as a single, indivisible unit.
- </para>
- <para>
- When you add text to a HB buffer, each character is associated with
- a <emphasis>cluster value</emphasis>. This is an arbitrary number as
- far as HB is concerned.
- </para>
- <para>
- Most clients will use UTF-8, UTF-16, or UTF-32 indices, but the
- actual number does not matter. Moreover, it is not required for the
- cluster values to be monotonically increasing, but pretty much all
- of HB's tests are performed on monotonically increasing cluster
- numbers. Nevertheless, there is no such assumption in the code
- itself. With that in mind, let's examine what happens with cluster
- values during shaping under each cluster-level.
- </para>
- <para>
- HarfBuzz provides three <emphasis>levels</emphasis> of clustering
- support. Level 0 is the default behavior and reproduces the behavior
- of the old HarfBuzz library. Level 1 tweaks this behavior slightly
- to produce better results, so level 1 clustering is recommended for
- code that is not required to implement backward compatibility with
- the old HarfBuzz.
- </para>
- <para>
- Level 2 differs significantly in how it treats cluster values.
- Levels 0 and 1 both process ligatures and glyph decomposition by
- merging clusters; level 2 does not.
- </para>
- <para>
- The conceptual model for what the cluster values mean, in levels 0
- and 1, is this:
- </para>
- <itemizedlist spacing="compact">
- <listitem>
- <para>
- the sequence of cluster values will always remain monotone
- </para>
- </listitem>
- <listitem>
- <para>
- each value represents a single cluster
- </para>
- </listitem>
- <listitem>
- <para>
- each cluster contains one or more glyphs and one or more
- characters
- </para>
- </listitem>
- </itemizedlist>
- <para>
- Assuming that initial cluster numbers were monotonically increasing
- and distinct, then all adjacent glyphs having the same cluster
- number belong to the same cluster, and all characters belong to the
- cluster that has the highest number not larger than their initial
- cluster number. This will become clearer with an example.
- </para>
-</sect1>
-<sect1 id="a-clustering-example-for-levels-0-and-1">
- <title>A clustering example for levels 0 and 1</title>
- <para>
- Let's say we start with the following character sequence and cluster
- values:
- </para>
- <programlisting>
- A,B,C,D,E
- 0,1,2,3,4
-</programlisting>
- <para>
- We then map the characters to glyphs. For simplicity, let's assume
- that each character maps to the corresponding, identical-looking
- glyph:
- </para>
- <programlisting>
- A,B,C,D,E
- 0,1,2,3,4
-</programlisting>
- <para>
- Now if, for example, <literal>B</literal> and <literal>C</literal>
- ligate, then the clusters to which they belong "merge".
- This merged cluster takes for its cluster number the minimum of all
- the cluster numbers of the clusters that went in. In this case, we
- get:
- </para>
- <programlisting>
- A,BC,D,E
- 0,1 ,3,4
-</programlisting>
- <para>
- Now let's assume that the <literal>BC</literal> glyph decomposes
- into three components, and <literal>D</literal> also decomposes into
- two. The components each inherit the cluster value of their parent:
- </para>
- <programlisting>
- A,BC0,BC1,BC2,D0,D1,E
- 0,1 ,1 ,1 ,3 ,3 ,4
-</programlisting>
- <para>
- Now if <literal>BC2</literal> and <literal>D0</literal> ligate, then
- their clusters (numbers 1 and 3) merge into
- <literal>min(1,3) = 1</literal>:
- </para>
- <programlisting>
- A,BC0,BC1,BC2D0,D1,E
- 0,1 ,1 ,1 ,1 ,4
-</programlisting>
- <para>
- At this point, cluster 1 means: the character sequence
- <literal>BCD</literal> is represented by glyphs
- <literal>BC0,BC1,BC2D0,D1</literal> and cannot be broken down any
- further.
- </para>
-</sect1>
-<sect1 id="reordering-in-levels-0-and-1">
- <title>Reordering in levels 0 and 1</title>
- <para>
- Another common operation in the more complex shapers is when things
- reorder. In those cases, to maintain monotone clusters, HB merges
- the clusters of everything in the reordering sequence. For example,
- let's again start with the character sequence:
- </para>
- <programlisting>
- A,B,C,D,E
- 0,1,2,3,4
-</programlisting>
- <para>
- If <literal>D</literal> is reordered before <literal>B</literal>,
- then the <literal>B</literal>, <literal>C</literal>, and
- <literal>D</literal> clusters merge, and we get:
- </para>
- <programlisting>
- A,D,B,C,E
- 0,1,1,1,4
-</programlisting>
- <para>
- This is clearly not ideal, but it is the only sensible way to
- maintain monotone indices and retain the true relationship between
- glyphs and characters.
- </para>
-</sect1>
-<sect1 id="the-distinction-between-levels-0-and-1">
- <title>The distinction between levels 0 and 1</title>
- <para>
- So, the above is pretty much what cluster levels 0 and 1 do. The
- only difference between the two is this: in level 0, at the very
- beginning of the shaping process, we also merge clusters between
- base characters and all Unicode marks (combining or not) following
- them. E.g.:
- </para>
- <programlisting>
- A,acute,B
- 0,1 ,2
-</programlisting>
- <para>
- will become:
- </para>
- <programlisting>
- A,acute,B
- 0,0 ,2
-</programlisting>
- <para>
- This is the default behavior. We do it because Windows did it and
- old HarfBuzz did it, so this remained the default. But this behavior
- makes it impossible to color diacritic marks differently from their
- base characters. That's why in level 1 we do not perform this
- initial merging step.
- </para>
- <para>
- For clients, level 0 is more convenient if they rely on HarfBuzz
- clusters for cursor positioning. But that's wrong anyway: cursor
- positions should be determined based on Unicode grapheme boundaries,
- NOT shaping clusters. As such, level 1 clusters are preferred.
- </para>
- <para>
- One last note about levels 0 and 1. We currently don't allow a
- <literal>MultipleSubst</literal> lookup to replace a glyph with zero
- glyphs (i.e., to delete a glyph). But in some other situations,
- glyphs can be deleted. In those cases, if the glyph being deleted is
- the last glyph of its cluster, we make sure to merge the cluster
- with a neighboring cluster.
- </para>
- <para>
- This is, primarily, to make sure that the starting cluster of the
- text always has the cluster index pointing to the start of the text
- for the run; more than one client currently relies on this
- guarantee.
- </para>
- <para>
- Incidentally, Apple's CoreText does something else to maintain the
- same promise: it inserts a glyph with id 65535 at the beginning of
- the glyph string if the glyph corresponding to the first character
- in the run was deleted. HarfBuzz might do something similar in the
- future.
- </para>
-</sect1>
-<sect1 id="level-2">
- <title>Level 2</title>
- <para>
- Level 2 is a different beast from levels 0 and 1. It is simple to
- describe, but hard to make sense of. It simply doesn't do any
- cluster merging whatsoever. When things ligate or otherwise multiple
- glyphs turn into one, the cluster value of the first glyph is
- retained.
- </para>
- <para>
- Here are a few examples of why processing cluster values produced at
- this level might be tricky:
- </para>
- <sect2 id="ligatures-with-combining-marks">
- <title>Ligatures with combining marks</title>
- <para>
- Imagine capital letters are bases and lower case letters are
- combining marks. With an input sequence like this:
+ <section id="clusters">
+ <title>Clusters</title>
+ <para>
+ In text shaping, a <emphasis>cluster</emphasis> is a sequence of
+ characters that needs to be treated as a single, indivisible
+ unit.
</para>
- <programlisting>
- A,a,B,b,C,c
- 0,1,2,3,4,5
-</programlisting>
<para>
- if <literal>A,B,C</literal> ligate, then here are the cluster
- values one would get under the various levels:
+ During the shaping process, some shaping operations may
+ merge adjacent characters (for example, when two code points form
+ a ligature and are replaced by a single glyph) or split one
+ character into several (for example, when performing the Unicode
+ canonical decomposition of a code point).
</para>
<para>
- level 0:
+ HarfBuzz tracks clusters independently from how these
+ shaping operations alter the individual glyphs that comprise the
+ output HarfBuzz returns in a buffer. Consequently,
+ a client program using HarfBuzz can utilize the cluster
+ information to implement features such as:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ Correctly positioning the cursor between two characters that
+ have combined into a single glyph by forming a ligature.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Correctly highlighting a text selection that includes some,
+ but not all, of the characters comprising a ligature.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Applying text attributes (such as color or underlining) to
+ part, but not all, of a composed base-and-mark combination.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Generating output document formats (such as PDF) with
+ embedded text that can be fully extracted.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Performing line-breaking, justification, and other
+ line-level or paragraph-level operations that must be done
+ after shaping is complete, but which require character-level
+ properties.
+ </para>
+ </listitem>
+ </itemizedlist>
+ <para>
+ When you add text to a HarfBuzz buffer, each code point is assigned
+ a <emphasis>cluster value</emphasis>.
+ </para>
+ <para>
+ This cluster value is an arbitrary number; HarfBuzz uses it only
+ to distinguish between clusters. Many client programs will use
+ the index of each code point in the input text stream as the
+ cluster value, as a matter of convenience; the actual value does
+ not matter.
+ </para>
+ <para>
+ Client programs can choose how HarfBuzz handles clusters during
+ shaping by setting the
+ <literal>cluster_level</literal> of the
+ buffer. HarfBuzz offers three <emphasis>levels</emphasis> of
+ clustering support for this property:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para><emphasis>Level 0</emphasis> is the default and
+ reproduces the behavior of the old HarfBuzz library.
+ </para>
+ <para>
+ The distinguishing feature of level 0 behavior is that, at
+ the beginning of processing the buffer, all code points that
+ are categorized as <emphasis>marks</emphasis>,
+ <emphasis>modifier symbols</emphasis>, or
+ <emphasis>Emoji extended pictographic</emphasis> modifiers,
+ as well as the <emphasis>Zero Width Joiner</emphasis> and
+ <emphasis>Zero Width Non-Joiner</emphasis> code points, are
+ assigned the cluster value of the closest preceding code
+ point from <emphasis>diferent</emphasis> category.
+ </para>
+ <para>
+ In essence, whenever a base character is followed by a mark
+ character or a sequence of mark characters, those marks are
+ reassigned to the same initial cluster value as the base
+ character. This reassignment is referred to as
+ "merging" the affected clusters. This behavior is based on
+ the Grapheme Cluster Boundary specification in <ulink
+ url="https://www.unicode.org/reports/tr29/#Regex_Definitions">Unicode
+ Technical Report 29</ulink>.
+ </para>
+ <para>
+ Client programs can specify level 0 behavior for a buffer by
+ setting its <literal>cluster_level</literal> to
+ <literal>HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES</literal>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <emphasis>Level 1</emphasis> tweaks the old behavior
+ slightly to produce better results. Therefore, level 1
+ clustering is recommended for code that is not required to
+ implement backward compatibility with the old HarfBuzz.
+ </para>
+ <para>
+ Level 1 differs from level 0 by not merging the
+ clusters of marks and other modifier code points with the
+ preceding "base" code point's cluster. By preserving the
+ cluster values of these marks and modifier code points,
+ script shaping can perform additional operations that might
+ lead to improved results (for example, reordering a sequence
+ of marks).
+ </para>
+ <para>
+ Client programs can specify level 1 behavior for a buffer by
+ setting its <literal>cluster_level</literal> to
+ <literal>HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS</literal>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <emphasis>Level 2</emphasis> differs significantly in how it
+ treats cluster values. In level 2, HarfBuzz never merges
+ clusters.
+ </para>
+ <para>
+ This difference can be seen most clearly when HarfBuzz processes
+ ligature substitutions and glyph decompositions. In level 0
+ and level 1, ligatures and glyph decomposition both involve
+ merging clusters; in level 2, neither of these operations
+ triggers a merge.
+ </para>
+ <para>
+ Client programs can specify level 2 behavior for a buffer by
+ setting its <literal>cluster_level</literal> to
+ <literal>HB_BUFFER_CLUSTER_LEVEL_CHARACTERS</literal>.
+ </para>
+ </listitem>
+ </itemizedlist>
+ <para>
+ It is not <emphasis>required</emphasis> that the cluster values
+ in a buffer be monotonically increasing. However, if the initial
+ cluster values in a buffer are monotonic and the buffer is
+ configured to use clustering level 0 or 1, then HarfBuzz
+ guarantees that the final cluster values in the shaped buffer
+ will also be monotonic. No such guarantee is made for cluster
+ level 2.
+ </para>
+ <para>
+ In levels 0 and 1, HarfBuzz implements the following conceptual model for
+ cluster values:
+ </para>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ The sequence of cluster values will always remain monotonic.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Each cluster value represents a single cluster.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Each cluster contains one or more glyphs and one or more
+ characters.
+ </para>
+ </listitem>
+ </itemizedlist>
+ <para>
+ In practice, this model offers several benefits. Assuming that
+ the initial cluster values were monotonically increasing
+ and distinct before shaping began, then, in the final output:
+ </para>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ All adjacent glyphs having the same final cluster
+ value belong to the same cluster.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Each character belongs to the cluster that has the highest
+ cluster value <emphasis>not larger than</emphasis> its
+ initial cluster value.
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ </section>
+ <section id="a-clustering-example-for-levels-0-and-1">
+ <title>A clustering example for levels 0 and 1</title>
+ <para>
+ The guarantees and benefits of level 0 and level 1 can be seen
+ with some examples. First, let us examine what happens with cluster
+ values when shaping involves cluster merging with ligatures and
+ decomposition.
+ </para>
+ <para>
+ Let's say we start with the following character sequence (top row) and
+ initial cluster values (bottom row):
</para>
<programlisting>
- ABC,a,b,c
- 0 ,0,0,0
-</programlisting>
+ A,B,C,D,E
+ 0,1,2,3,4
+ </programlisting>
<para>
- level 1:
+ During shaping, HarfBuzz maps these characters to glyphs from
+ the font. For simplicity, let's assume that each character maps
+ to the corresponding, identical-looking glyph:
</para>
<programlisting>
- ABC,a,b,c
- 0 ,0,0,5
-</programlisting>
+ A,B,C,D,E
+ 0,1,2,3,4
+ </programlisting>
<para>
- level 2:
+ Now if, for example, <literal>B</literal> and <literal>C</literal>
+ form a ligature, then the clusters to which they belong
+ "merge". This merged cluster takes for its cluster
+ value the minimum of all the cluster values of the clusters that
+ went in to the ligature. In this case, we get:
</para>
<programlisting>
- ABC,a,b,c
- 0 ,1,3,5
-</programlisting>
+ A,BC,D,E
+ 0,1 ,3,4
+ </programlisting>
+ <para>
+ because 1 is the minimum of the set {1,2}, which were the
+ cluster values of <literal>B</literal> and
+ <literal>C</literal>.
+ </para>
<para>
- Making sense of the last example is the hardest for a client,
- because there is nothing in the cluster values to suggest that
- <literal>B</literal> and <literal>C</literal> ligated with
- <literal>A</literal>.
+ Next, let us say that the <literal>BC</literal> ligature glyph
+ decomposes into three components, and <literal>D</literal> also
+ decomposes into two components. These components each inherit the
+ cluster value of their parent:
</para>
- </sect2>
- <sect2 id="reordering">
- <title>Reordering</title>
+ <programlisting>
+ A,BC0,BC1,BC2,D0,D1,E
+ 0,1 ,1 ,1 ,3 ,3 ,4
+ </programlisting>
<para>
- Another tricky case is when things reorder. Under level 2:
+ Next, if <literal>BC2</literal> and <literal>D0</literal> form a
+ ligature, then their clusters (cluster values 1 and 3) merge into
+ <literal>min(1,3) = 1</literal>:
</para>
<programlisting>
- A,B,C,D,E
- 0,1,2,3,4
-</programlisting>
+ A,BC0,BC1,BC2D0,D1,E
+ 0,1 ,1 ,1 ,1 ,4
+ </programlisting>
+ <para>
+ At this point, cluster 1 means: the character sequence
+ <literal>BCD</literal> is represented by glyphs
+ <literal>BC0,BC1,BC2D0,D1</literal> and cannot be broken down any
+ further.
+ </para>
+ </section>
+ <section id="reordering-in-levels-0-and-1">
+ <title>Reordering in levels 0 and 1</title>
+ <para>
+ Another common operation in the more complex shapers is glyph
+ reordering. In order to maintain a monotonic cluster sequence
+ when glyph reordering takes place, HarfBuzz merges the clusters
+ of everything in the reordering sequence.
+ </para>
<para>
- Now imagine <literal>D</literal> moves before
- <literal>B</literal>:
+ For example, let us again start with the character sequence (top
+ row) and initial cluster values (bottom row):
</para>
<programlisting>
- A,D,B,C,E
- 0,3,1,2,4
-</programlisting>
+ A,B,C,D,E
+ 0,1,2,3,4
+ </programlisting>
<para>
- Now, if <literal>D</literal> ligates with <literal>B</literal>, we
+ If <literal>D</literal> is reordered before <literal>B</literal>,
+ then HarfBuzz merges the <literal>B</literal>,
+ <literal>C</literal>, and <literal>D</literal> clusters, and we
get:
</para>
<programlisting>
- A,DB,C,E
- 0,3 ,2,4
-</programlisting>
+ A,D,B,C,E
+ 0,1,1,1,4
+ </programlisting>
<para>
- In a different scenario, <literal>A</literal> and
- <literal>B</literal> could have ligated
- <emphasis>before</emphasis> <literal>D</literal> reordered; that
- would have resulted in:
+ This is clearly not ideal, but it is the only sensible way to
+ maintain a monotonic sequence of cluster values and retain the
+ true relationship between glyphs and characters.
+ </para>
+ </section>
+ <section id="the-distinction-between-levels-0-and-1">
+ <title>The distinction between levels 0 and 1</title>
+ <para>
+ The preceding examples demonstrate the main effects of using
+ cluster levels 0 and 1. The only difference between the two
+ levels is this: in level 0, at the very beginning of the shaping
+ process, HarfBuzz also merges clusters between any base character
+ and all Unicode marks (combining or not) that follow it.
+ </para>
+ <para>
+ For example, let us start with the following character sequence
+ (top row) and accompanying initial cluster values (bottom row):
+ </para>
+ <programlisting>
+ A,acute,B
+ 0,1 ,2
+ </programlisting>
+ <para>
+ The <literal>acute</literal> is a Unicode mark. If HarfBuzz is
+ using cluster level 0 on this sequence, then the
+ <literal>A</literal> and <literal>acute</literal> clusters will
+ merge, and the result will become:
</para>
<programlisting>
- AB,D,C,E
- 0 ,3,2,4
-</programlisting>
+ A,acute,B
+ 0,0 ,2
+ </programlisting>
+ <para>
+ This initial cluster merging is the default behavior of the
+ Windows shaping engine, and the old HarfBuzz codebase copied
+ that behavior to maintain compatibility. Consequently, it has
+ remained the default behavior in the new HarfBuzz codebase.
+ </para>
+ <para>
+ But this initial cluster-merging behavior makes it impossible to
+ color diacritic marks differently from their base
+ characters. That is why, in level 1, HarfBuzz does not perform
+ the initial merging step.
+ </para>
+ <para>
+ For client programs that rely on HarfBuzz cluster values to
+ perform cursor positioning, level 0 is more convenient. But
+ relying on cluster boundaries for cursor positioning is wrong: cursor
+ positions should be determined based on Unicode grapheme
+ boundaries, not on shaping-cluster boundaries. As such, level 1
+ clusters are preferred.
+ </para>
+ <para>
+ One last note about levels 0 and 1. HarfBuzz currently does not allow a
+ <literal>MultipleSubst</literal> lookup to replace a glyph with zero
+ glyphs (in other words, to delete a glyph). But, in some other situations,
+ glyphs can be deleted. In those cases, if the glyph being deleted is
+ the last glyph of its cluster, HarfBuzz makes sure to merge the cluster
+ with a neighboring cluster.
+ </para>
+ <para>
+ This is done primarily to make sure that the starting cluster of the
+ text always has the cluster index pointing to the start of the text
+ for the run; more than one client currently relies on this
+ guarantee.
+ </para>
+ <para>
+ Incidentally, Apple's CoreText does something else to maintain the
+ same promise: it inserts a glyph with id 65535 at the beginning of
+ the glyph string if the glyph corresponding to the first character
+ in the run was deleted. HarfBuzz might do something similar in the
+ future.
+ </para>
+ </section>
+ <section id="level-2">
+ <title>Level 2</title>
+ <para>
+ HarfBuzz's level 2 cluster behavior uses a significantly
+ different model than that of level 0 and level 1.
+ </para>
<para>
- There's no way to differentiate between these two scenarios based
- on the cluster numbers alone.
+ The level 2 behavior is easy to describe, but it may be
+ difficult to understand in practical terms. In brief, level 2
+ performs no merging of clusters whatsoever.
</para>
<para>
- Another problem happens with ligatures under level 2 if the
- direction of the text is forced to opposite of its natural
- direction (e.g. left-to-right Arabic). But that's too much of a
- corner case to worry about.
+ When glyphs form a ligature (or when some other feature
+ substitutes multiple glyphs with one glyph), the cluster value
+ of the first glyph is retained as the cluster value for the
+ ligature. However, no subsequent clusters — including
+ marks and modifiers — are affected.
</para>
- </sect2>
-</sect1>
+ <para>
+ Level 2 cluster behavior is less complex than level 0 or level
+ 1, but there are a few cases in which processing cluster values
+ produced at level 2 may be tricky.
+ </para>
+ <section id="ligatures-with-combining-marks-in-level-2">
+ <title>Ligatures with combining marks in level 2</title>
+ <para>
+ The first example of how HarfBuzz's level 2 cluster behavior
+ can be tricky is when the text to be shaped includes combining
+ marks attached to ligatures.
+ </para>
+ <para>
+ Let us start with an input sequence with the following
+ characters (top row) and initial cluster values (bottom row):
+ </para>
+ <programlisting>
+ A,acute,B,breve,C,circumflex
+ 0,1 ,2,3 ,4,5
+ </programlisting>
+ <para>
+ If the sequence <literal>A,B,C</literal> forms a ligature,
+ then these are the cluster values HarfBuzz will return under
+ the various cluster levels:
+ </para>
+ <para>
+ Level 0:
+ </para>
+ <programlisting>
+ ABC,acute,breve,circumflex
+ 0 ,0 ,0 ,0
+ </programlisting>
+ <para>
+ Level 1:
+ </para>
+ <programlisting>
+ ABC,acute,breve,circumflex
+ 0 ,0 ,0 ,5
+ </programlisting>
+ <para>
+ Level 2:
+ </para>
+ <programlisting>
+ ABC,acute,breve,circumflex
+ 0 ,1 ,3 ,5
+ </programlisting>
+ <para>
+ Making sense of the level 2 result is the hardest for a client
+ program, because there is nothing in the cluster values that
+ indicates that <literal>B</literal> and <literal>C</literal>
+ formed a ligature with <literal>A</literal>.
+ </para>
+ <para>
+ In contrast, the "merged" cluster values of the mark glyphs
+ that are seen in the level 0 and level 1 output are evidence
+ that a ligature substitution took place.
+ </para>
+ </section>
+ <section id="reordering-in-level-2">
+ <title>Reordering in level 2</title>
+ <para>
+ Another example of how HarfBuzz's level 2 cluster behavior
+ can be tricky is when glyphs reorder. Consider an input sequence
+ with the following characters (top row) and initial cluster
+ values (bottom row):
+ </para>
+ <programlisting>
+ A,B,C,D,E
+ 0,1,2,3,4
+ </programlisting>
+ <para>
+ Now imagine <literal>D</literal> moves before
+ <literal>B</literal> in a reordering operation. The cluster
+ values will then be:
+ </para>
+ <programlisting>
+ A,D,B,C,E
+ 0,3,1,2,4
+ </programlisting>
+ <para>
+ Next, if <literal>D</literal> forms a ligature with
+ <literal>B</literal>, the output is:
+ </para>
+ <programlisting>
+ A,DB,C,E
+ 0,3 ,2,4
+ </programlisting>
+ <para>
+ However, in a different scenario, in which the shaping rules
+ of the script instead caused <literal>A</literal> and
+ <literal>B</literal> to form a ligature
+ <emphasis>before</emphasis> the <literal>D</literal> reordered, the
+ result would be:
+ </para>
+ <programlisting>
+ AB,D,C,E
+ 0 ,3,2,4
+ </programlisting>
+ <para>
+ There is no way for a client program to differentiate between
+ these two scenarios based on the cluster values
+ alone. Consequently, client programs that use level 2 might
+ need to undertake additional work in order to manage cursor
+ positioning, text attributes, or other desired features.
+ </para>
+ </section>
+ <section id="other-considerations-in-level-2">
+ <title>Other considerations in level 2</title>
+ <para>
+ There may be other problems encountered with ligatures under
+ level 2, such as if the direction of the text is forced to
+ opposite of its natural direction (for example, left-to-right
+ Arabic). But, generally speaking, these other scenarios are
+ minor corner cases that are too obscure for most client
+ programs to need to worry about.
+ </para>
+ </section>
+ </section>
</chapter>
More information about the HarfBuzz
mailing list