[HarfBuzz] harfbuzz: Branch 'master' - 3 commits
Simon Hausmann
hausmann at kemper.freedesktop.org
Tue Oct 23 07:05:50 PDT 2007
src/harfbuzz-external.h | 48 +++++++++++++
src/harfbuzz-shaper.cpp | 165 +++++++++++++++++++++++++++++++++++++++++++++---
src/harfbuzz-shaper.h | 20 ++++-
3 files changed, 222 insertions(+), 11 deletions(-)
New commits:
commit 60cb4777337c09fde74a6a67fb2904bd3d71d948
Author: Lars Knoll <lars at trolltech.com>
Date: Tue Oct 23 15:36:41 2007 +0200
added API for finding word and sentence boundaries.
Signed-off-by: Simon Hausmann <shausman at trolltech.com>
diff --git a/src/harfbuzz-external.h b/src/harfbuzz-external.h
index 520c571..cb76b0c 100644
--- a/src/harfbuzz-external.h
+++ b/src/harfbuzz-external.h
@@ -91,7 +91,40 @@ typedef enum
HB_Grapheme_LVT
} HB_GraphemeClass;
+
+typedef enum /* Word-break class of a character, as returned by HB_GetWordClass().
+ * The values index both dimensions of wordbreakTable in harfbuzz-shaper.cpp, so
+ * their order has to stay in sync with that table.
+ * NOTE(review): the names look like the Word_Break property values of Unicode
+ * UAX #29 - confirm against the implementation of HB_GetWordClass(). */
+{
+ HB_Word_Other,
+ HB_Word_Format, /* as the current character it is handled by HB_GetWordBoundaries before any table lookup */
+ HB_Word_Katakana,
+ HB_Word_ALetter,
+ HB_Word_MidLetter,
+ HB_Word_MidNum,
+ HB_Word_Numeric,
+ HB_Word_ExtendNumLet /* last value: used to size wordbreakTable */
+} HB_WordClass;
+
+
+typedef enum /* Sentence-break class of a character, as returned by HB_GetSentenceClass().
+ * The values index the columns of sentenceBreakTable in harfbuzz-shaper.cpp, so
+ * their order has to stay in sync with that table.
+ * NOTE(review): the names look like the Sentence_Break property values of
+ * Unicode UAX #29 - confirm against the implementation of HB_GetSentenceClass(). */
+{
+ HB_Sentence_Other,
+ HB_Sentence_Sep, /* also consulted by HB_GetWordBoundaries when it meets a Format character */
+ HB_Sentence_Format,
+ HB_Sentence_Sp,
+ HB_Sentence_Lower,
+ HB_Sentence_Upper,
+ HB_Sentence_OLetter,
+ HB_Sentence_Numeric,
+ HB_Sentence_ATerm,
+ HB_Sentence_STerm,
+ HB_Sentence_Close /* last value: used to size sentenceBreakTable */
+} HB_SentenceClass;
+
+HB_GraphemeClass HB_GetGraphemeClass(HB_UChar32 ch);
+HB_WordClass HB_GetWordClass(HB_UChar32 ch);
+HB_SentenceClass HB_GetSentenceClass(HB_UChar32 ch);
HB_LineBreakClass HB_GetLineBreakClass(HB_UChar32 ch);
+
void HB_GetGraphemeAndLineBreakClass(HB_UChar32 ch, HB_GraphemeClass *grapheme, HB_LineBreakClass *lineBreak);
void HB_GetUnicodeCharProperties(HB_UChar32 ch, HB_CharCategory *category, int *combiningClass);
HB_CharCategory HB_GetUnicodeCharCategory(HB_UChar32 ch);
diff --git a/src/harfbuzz-shaper.cpp b/src/harfbuzz-shaper.cpp
index f775762..33e4951 100644
--- a/src/harfbuzz-shaper.cpp
+++ b/src/harfbuzz-shaper.cpp
@@ -646,6 +646,137 @@ void HB_GetCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,
}
}
+
+enum BreakRule { NoBreak = 0, Break = 1, Middle = 2 }; // Entries of wordbreakTable. Middle means: break, unless the next character that is not a skipped Format character has the same class as the previous one (decided by lookahead in HB_GetWordBoundaries).
+
+static const hb_uint8 wordbreakTable[HB_Word_ExtendNumLet + 1][HB_Word_ExtendNumLet + 1] = { // [class of previous character][class of current character] -> BreakRule; rows and columns follow the order of HB_WordClass
+// Other Format Katakana ALetter MidLetter MidNum Numeric ExtendNumLet
+ { Break, Break, Break, Break, Break, Break, Break, Break }, // Other
+ { Break, Break, Break, Break, Break, Break, Break, Break }, // Format
+ { Break, Break, NoBreak, Break, Break, Break, Break, NoBreak }, // Katakana
+ { Break, Break, Break, NoBreak, Middle, Break, NoBreak, NoBreak }, // ALetter
+ { Break, Break, Break, Break, Break, Break, Break, Break }, // MidLetter
+ { Break, Break, Break, Break, Break, Break, Break, Break }, // MidNum
+ { Break, Break, Break, NoBreak, Break, Middle, NoBreak, NoBreak }, // Numeric
+ { Break, Break, NoBreak, NoBreak, Break, Break, NoBreak, NoBreak }, // ExtendNumLet
+};
+
+// Fills in attributes[i].wordBoundary for every index of a UTF-16 string.
+//
+// string, stringLength: the text to analyse; an empty string is a no-op.
+// items, numItems:      unused.
+// attributes:           one entry per UTF-16 unit. The charStop flags are read,
+//                       so they must already be filled in (the header asks for
+//                       HB_GetCharAttributes to be called first).
+//
+// Index 0 is always a boundary. For every later character stop the decision is
+// wordbreakTable[class of previous character][class of current character].
+void HB_GetWordBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
+                          const HB_ScriptItem * /*items*/, hb_uint32 /*numItems*/,
+                          HB_CharAttributes *attributes)
+{
+    if (stringLength == 0)
+        return;
+    // hb_uint32 instead of the non-standard 'uint' typedef: it matches the other
+    // class variables below and does not depend on a platform header.
+    hb_uint32 brk = HB_GetWordClass(string[0]);
+    attributes[0].wordBoundary = true;
+    for (hb_uint32 i = 1; i < stringLength; ++i) {
+        if (!attributes[i].charStop) {
+            // Not a character stop, so no word boundary can fall here.
+            attributes[i].wordBoundary = false;
+            continue;
+        }
+        hb_uint32 nbrk = HB_GetWordClass(string[i]);
+        if (nbrk == HB_Word_Format) {
+            // Format characters leave brk untouched; they are a boundary only
+            // when the preceding UTF-16 unit is a sentence separator.
+            attributes[i].wordBoundary = (HB_GetSentenceClass(string[i-1]) == HB_Sentence_Sep);
+            continue;
+        }
+        BreakRule rule = (BreakRule)wordbreakTable[brk][nbrk];
+        if (rule == Middle) {
+            // Break unless the next character that is not a skipped Format
+            // character has the same class as the previous one
+            // (e.g. ALetter MidLetter ALetter).
+            rule = Break;
+            hb_uint32 lookahead = i + 1;
+            while (lookahead < stringLength) {
+                hb_uint32 testbrk = HB_GetWordClass(string[lookahead]);
+                if (testbrk == HB_Word_Format && HB_GetSentenceClass(string[lookahead]) != HB_Sentence_Sep) {
+                    ++lookahead;
+                    continue;
+                }
+                if (testbrk == brk) {
+                    rule = NoBreak;
+                    // Nothing between here and the matching character is a
+                    // boundary; i is advanced onto that character.
+                    while (i < lookahead)
+                        attributes[i++].wordBoundary = false;
+                    nbrk = testbrk;
+                }
+                break;
+            }
+        }
+        attributes[i].wordBoundary = (rule == Break);
+        brk = nbrk;
+    }
+}
+
+
+enum SentenceBreakStates { // States of the sentence-boundary state machine run by HB_GetSentenceBoundaries; SB_Initial..SB_BAfter are the rows of sentenceBreakTable, in this order.
+ SB_Initial,
+ SB_Upper,
+ SB_UpATerm,
+ SB_ATerm,
+ SB_ATermC,
+ SB_ACS,
+ SB_STerm,
+ SB_STermC,
+ SB_SCS,
+ SB_BAfter, // reached on a Sep character; its table row yields SB_Break for every following class
+ SB_Break, // result only (no table row): boundary before the current character, after which the machine restarts from SB_Initial
+ SB_Look // result only (no table row): the decision needs a scan of the following characters
+};
+
+// Transition table of the sentence-boundary state machine:
+// sentenceBreakTable[current state][sentence class of next character] gives the
+// next state. Only SB_Initial..SB_BAfter have rows: HB_GetSentenceBoundaries
+// resolves SB_Break and SB_Look before the next lookup. The row count is
+// therefore SB_BAfter + 1; it used to be declared as HB_Sentence_Close + 1,
+// which silently added an unused zero-filled row.
+static const hb_uint8 sentenceBreakTable[SB_BAfter + 1][HB_Sentence_Close + 1] = {
+//    Other       Sep         Format      Sp          Lower       Upper       OLetter     Numeric     ATerm       STerm       Close
+    { SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper  , SB_Initial, SB_Initial, SB_ATerm  , SB_STerm  , SB_Initial }, // SB_Initial
+    { SB_Initial, SB_BAfter , SB_Upper  , SB_Initial, SB_Initial, SB_Upper  , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm  , SB_Initial }, // SB_Upper
+
+    { SB_Look   , SB_BAfter , SB_UpATerm, SB_ACS    , SB_Initial, SB_Upper  , SB_Break  , SB_Initial, SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_UpATerm
+    { SB_Look   , SB_BAfter , SB_ATerm  , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Initial, SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_ATerm
+    { SB_Look   , SB_BAfter , SB_ATermC , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Look   , SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_ATermC
+    { SB_Look   , SB_BAfter , SB_ACS    , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Look   , SB_ATerm  , SB_STerm  , SB_Look    }, // SB_ACS
+
+    { SB_Break  , SB_BAfter , SB_STerm  , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_STermC  }, // SB_STerm
+    { SB_Break  , SB_BAfter , SB_STermC , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_STermC  }, // SB_STermC
+    { SB_Break  , SB_BAfter , SB_SCS    , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_Break   }, // SB_SCS
+    { SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break   }, // SB_BAfter
+};
+
+// Fills in attributes[i].sentenceBoundary for every index of a UTF-16 string.
+//
+// string, stringLength: the text to analyse; an empty string is a no-op.
+// items, numItems:      unused.
+// attributes:           one entry per UTF-16 unit. The charStop flags are read,
+//                       so they must already be filled in (the header asks for
+//                       HB_GetCharAttributes to be called first).
+//
+// Index 0 is always a boundary. After that the function runs the state machine
+// described by sentenceBreakTable over every character stop.
+void HB_GetSentenceBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
+                              const HB_ScriptItem * /*items*/, hb_uint32 /*numItems*/,
+                              HB_CharAttributes *attributes)
+{
+    if (stringLength == 0)
+        return;
+    hb_uint32 brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[0])];
+    attributes[0].sentenceBoundary = true;
+    for (hb_uint32 i = 1; i < stringLength; ++i) {
+        if (!attributes[i].charStop) {
+            // Not a character stop: never a boundary, and the state is kept.
+            attributes[i].sentenceBoundary = false;
+            continue;
+        }
+        brk = sentenceBreakTable[brk][HB_GetSentenceClass(string[i])];
+        if (brk == SB_Look) {
+            // Break here unless a Lower character follows after a run of
+            // Other/Numeric/Close characters.
+            brk = SB_Break;
+            hb_uint32 lookahead = i + 1;
+            while (lookahead < stringLength) {
+                hb_uint32 sbrk = HB_GetSentenceClass(string[lookahead]);
+                // Lower has to be tested first. It is itself none of
+                // Other/Numeric/Close, so the stop test below would otherwise
+                // end the scan before a lowercase continuation is ever seen.
+                if (sbrk == HB_Sentence_Lower) {
+                    brk = SB_Initial;
+                    break;
+                }
+                if (sbrk != HB_Sentence_Other && sbrk != HB_Sentence_Numeric && sbrk != HB_Sentence_Close)
+                    break;
+                ++lookahead;
+            }
+            if (brk == SB_Initial) {
+                // The sentence continues: nothing up to the Lower character is
+                // a boundary, and i is advanced onto that character.
+                while (i < lookahead)
+                    attributes[i++].sentenceBoundary = false;
+            }
+        }
+        if (brk == SB_Break) {
+            attributes[i].sentenceBoundary = true;
+            // Restart the state machine with the character that begins the
+            // new sentence.
+            brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[i])];
+        } else {
+            attributes[i].sentenceBoundary = false;
+        }
+    }
+}
+
+
static inline char *tag_to_string(HB_UInt tag)
{
static char string[5];
diff --git a/src/harfbuzz-shaper.h b/src/harfbuzz-shaper.h
index 108566a..864c3f4 100644
--- a/src/harfbuzz-shaper.h
+++ b/src/harfbuzz-shaper.h
@@ -107,16 +107,28 @@ typedef enum {
typedef struct {
- /*HB_LineBreakType*/ unsigned int lineBreakType :2;
- /*HB_Bool*/ unsigned int whiteSpace :1; /* A unicode whitespace character, except NBSP, ZWNBSP */
- /*HB_Bool*/ unsigned int charStop :1; /* Valid cursor position (for left/right arrow) */
- unsigned int unused :4;
+ /*HB_LineBreakType*/ hb_uint8 lineBreakType :2; /* NOTE(review): in C, bit-fields of a type other than int, unsigned int or _Bool are implementation-defined - confirm every target compiler accepts hb_uint8 here */
+ /*HB_Bool*/ hb_uint8 whiteSpace :1; /* A unicode whitespace character, except NBSP, ZWNBSP */
+ /*HB_Bool*/ hb_uint8 charStop :1; /* Valid cursor position (for left/right arrow) */
+ /*HB_Bool*/ hb_uint8 wordBoundary :1; /* Written by HB_GetWordBoundaries(): a word boundary lies directly before this position */
+ /*HB_Bool*/ hb_uint8 sentenceBoundary :1; /* Written by HB_GetSentenceBoundaries(): a sentence boundary lies directly before this position */
+ hb_uint8 unused :2; /* Spare bits; the declared widths add up to 8 */
} HB_CharAttributes;
void HB_GetCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,
const HB_ScriptItem *items, hb_uint32 numItems,
HB_CharAttributes *attributes);
+/* Sets attributes[i].wordBoundary for every index of string (stringLength
+ * entries). Requires HB_GetCharAttributes to have been called on the same
+ * arrays before, because the charStop flags are read. items and numItems are
+ * not used by the current implementation. */
+void HB_GetWordBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
+ const HB_ScriptItem *items, hb_uint32 numItems,
+ HB_CharAttributes *attributes);
+
+/* Sets attributes[i].sentenceBoundary for every index of string (stringLength
+ * entries). Requires HB_GetCharAttributes to have been called on the same
+ * arrays before, because the charStop flags are read. items and numItems are
+ * not used by the current implementation. */
+void HB_GetSentenceBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
+ const HB_ScriptItem *items, hb_uint32 numItems,
+ HB_CharAttributes *attributes);
+
typedef enum {
HB_LeftToRight = 0,
commit 6cf5502bbef801ff2b5dc4169584e54237dc764e
Author: Lars Knoll <lars at trolltech.com>
Date: Tue Oct 23 14:48:33 2007 +0200
correctly set grapheme boundaries for hangul.
Signed-off-by: Simon Hausmann <shausman at trolltech.com>
diff --git a/src/harfbuzz-shaper.cpp b/src/harfbuzz-shaper.cpp
index 5f951b5..f775762 100644
--- a/src/harfbuzz-shaper.cpp
+++ b/src/harfbuzz-shaper.cpp
@@ -110,7 +110,7 @@ static const hb_uint8 graphemeTable[HB_Grapheme_LVT + 1][HB_Grapheme_LVT + 1] =
{ false, true , true , true , false, false, false, false, false, false }, // Extend,
{ true , true , true , true , true , false, true , true , true , true }, // L,
{ true , true , true , true , true , false, false, true , false, true }, // V,
- { true , true , true , true , true , false, false, false, false, false }, // T,
+ { true , true , true , true , true , true , false, false, false, false }, // T,
{ true , true , true , true , true , false, true , true , true , true }, // LV,
{ true , true , true , true , true , false, true , true , true , true }, // LVT
};
commit 04278e1188744c6ddb7c6c8fa00cdfb955d04c7e
Author: Lars Knoll <lars at trolltech.com>
Date: Tue Oct 23 12:16:06 2007 +0200
modification to harfbuzz to get proper support for grapheme boundaries in there.
Signed-off-by: Simon Hausmann <shausman at trolltech.com>
diff --git a/src/harfbuzz-external.h b/src/harfbuzz-external.h
index db887f2..520c571 100644
--- a/src/harfbuzz-external.h
+++ b/src/harfbuzz-external.h
@@ -77,7 +77,22 @@ typedef enum
HB_Symbol_Other /* So */
} HB_CharCategory;
+typedef enum /* Grapheme class of a character, as reported by
+ * HB_GetGraphemeAndLineBreakClass(). The values index both dimensions of
+ * graphemeTable in harfbuzz-shaper.cpp, so their order has to stay in sync
+ * with that table.
+ * NOTE(review): the names look like the Grapheme_Cluster_Break property values
+ * of Unicode UAX #29 - confirm against the property lookup implementation. */
+{
+ HB_Grapheme_Other,
+ HB_Grapheme_CR,
+ HB_Grapheme_LF,
+ HB_Grapheme_Control,
+ HB_Grapheme_Extend,
+ HB_Grapheme_L,
+ HB_Grapheme_V,
+ HB_Grapheme_T,
+ HB_Grapheme_LV,
+ HB_Grapheme_LVT /* last value: used to size graphemeTable */
+} HB_GraphemeClass;
+
HB_LineBreakClass HB_GetLineBreakClass(HB_UChar32 ch);
+void HB_GetGraphemeAndLineBreakClass(HB_UChar32 ch, HB_GraphemeClass *grapheme, HB_LineBreakClass *lineBreak);
void HB_GetUnicodeCharProperties(HB_UChar32 ch, HB_CharCategory *category, int *combiningClass);
HB_CharCategory HB_GetUnicodeCharCategory(HB_UChar32 ch);
int HB_GetUnicodeCharCombiningClass(HB_UChar32 ch);
diff --git a/src/harfbuzz-shaper.cpp b/src/harfbuzz-shaper.cpp
index a08d570..5f951b5 100644
--- a/src/harfbuzz-shaper.cpp
+++ b/src/harfbuzz-shaper.cpp
@@ -100,14 +100,30 @@ static const hb_uint8 breakTable[HB_LineBreak_JT+1][HB_LineBreak_JT+1] =
#undef CP
#undef PB
-
+static const hb_uint8 graphemeTable[HB_Grapheme_LVT + 1][HB_Grapheme_LVT + 1] = // [class of current character][class of previous character] -> value stored in charStop of the current character by calcLineBreaks
+{
+// Other, CR, LF, Control,Extend,L, V, T, LV, LVT
+ { true , true , true , true , true , true , true , true , true , true }, // Other,
+ { true , true , true , true , true , true , true , true , true , true }, // CR,
+ { true , false, true , true , true , true , true , true , true , true }, // LF,
+ { true , true , true , true , true , true , true , true , true , true }, // Control,
+ { false, true , true , true , false, false, false, false, false, false }, // Extend,
+ { true , true , true , true , true , false, true , true , true , true }, // L,
+ { true , true , true , true , true , false, false, true , false, true }, // V,
+ { true , true , true , true , true , false, false, false, false, false }, // T, (the L column of this row is changed to true by commit 6cf5502b, "correctly set grapheme boundaries for hangul")
+ { true , true , true , true , true , false, true , true , true , true }, // LV,
+ { true , true , true , true , true , false, true , true , true , true }, // LVT
+};
+
static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttributes *charAttributes)
{
if (!len)
return;
// ##### can this fail if the first char is a surrogate?
- int cls = HB_GetLineBreakClass(*uc);
+ HB_LineBreakClass cls;
+ HB_GraphemeClass grapheme;
+ HB_GetGraphemeAndLineBreakClass(*uc, &grapheme, &cls);
// handle case where input starts with an LF
if (cls == HB_LineBreak_LF)
cls = HB_LineBreak_BK;
@@ -120,14 +136,17 @@ static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttribute
charAttributes[i].whiteSpace = false;
charAttributes[i].charStop = true;
- int ncls = HB_GetLineBreakClass(uc[i]);
+ HB_UChar32 code = uc[i];
+ HB_GraphemeClass ngrapheme;
+ HB_LineBreakClass ncls;
+ HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls);
// handle surrogates
if (ncls == HB_LineBreak_SG) {
if (HB_IsHighSurrogate(uc[i]) && i < len - 1 && HB_IsLowSurrogate(uc[i+1])) {
continue;
} else if (HB_IsLowSurrogate(uc[i]) && HB_IsHighSurrogate(uc[i-1])) {
- HB_UChar32 code = HB_SurrogateToUcs4(uc[i-1], uc[i]);
- ncls = HB_GetLineBreakClass(code);
+ code = HB_SurrogateToUcs4(uc[i-1], uc[i]);
+ HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls);
charAttributes[i].charStop = false;
} else {
ncls = HB_LineBreak_AL;
@@ -137,8 +156,8 @@ static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttribute
// set white space and char stop flag
if (ncls >= HB_LineBreak_SP)
charAttributes[i].whiteSpace = true;
- if (ncls == HB_LineBreak_CM)
- charAttributes[i].charStop = false;
+
+ charAttributes[i].charStop = graphemeTable[ngrapheme][grapheme];
HB_LineBreakType lineBreakType = HB_NoBreak;
if (cls >= HB_LineBreak_LF) {
@@ -197,6 +216,7 @@ static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttribute
cls = ncls;
next_no_cls_update:
lcls = ncls;
+ grapheme = ngrapheme;
charAttributes[i-1].lineBreakType = lineBreakType;
}
charAttributes[len-1].lineBreakType = HB_ForcedBreak;
More information about the HarfBuzz
mailing list