[HarfBuzz] harfbuzz: Branch 'master' - 3 commits

Tue Oct 23 07:05:50 PDT 2007

src/harfbuzz-external.h |   48 +++++++++++++
 src/harfbuzz-shaper.cpp |  165 +++++++++++++++++++++++++++++++++++++++++++++---
 src/harfbuzz-shaper.h   |   20 ++++-
 3 files changed, 222 insertions(+), 11 deletions(-)

New commits:
commit 60cb4777337c09fde74a6a67fb2904bd3d71d948
Author: Lars Knoll <lars at trolltech.com>
Date:   Tue Oct 23 15:36:41 2007 +0200

    added API for finding word and sentence boundaries.
    
    Signed-off-by: Simon Hausmann <shausman at trolltech.com>

diff --git a/src/harfbuzz-external.h b/src/harfbuzz-external.h
index 520c571..cb76b0c 100644
--- a/src/harfbuzz-external.h
+++ b/src/harfbuzz-external.h
@@ -91,7 +91,40 @@ typedef enum
     HB_Grapheme_LVT
 } HB_GraphemeClass;
 
+
+typedef enum
+{
+    HB_Word_Other,
+    HB_Word_Format,
+    HB_Word_Katakana,
+    HB_Word_ALetter,
+    HB_Word_MidLetter,
+    HB_Word_MidNum,
+    HB_Word_Numeric,
+    HB_Word_ExtendNumLet
+} HB_WordClass;
+
+
+typedef enum
+{
+    HB_Sentence_Other,
+    HB_Sentence_Sep,
+    HB_Sentence_Format,
+    HB_Sentence_Sp,
+    HB_Sentence_Lower,
+    HB_Sentence_Upper,
+    HB_Sentence_OLetter,
+    HB_Sentence_Numeric,
+    HB_Sentence_ATerm,
+    HB_Sentence_STerm,
+    HB_Sentence_Close
+} HB_SentenceClass;
+
+HB_GraphemeClass HB_GetGraphemeClass(HB_UChar32 ch);
+HB_WordClass HB_GetWordClass(HB_UChar32 ch);
+HB_SentenceClass HB_GetSentenceClass(HB_UChar32 ch);
 HB_LineBreakClass HB_GetLineBreakClass(HB_UChar32 ch);
+
 void HB_GetGraphemeAndLineBreakClass(HB_UChar32 ch, HB_GraphemeClass *grapheme, HB_LineBreakClass *lineBreak);
 void HB_GetUnicodeCharProperties(HB_UChar32 ch, HB_CharCategory *category, int *combiningClass);
 HB_CharCategory HB_GetUnicodeCharCategory(HB_UChar32 ch);
diff --git a/src/harfbuzz-shaper.cpp b/src/harfbuzz-shaper.cpp
index f775762..33e4951 100644
--- a/src/harfbuzz-shaper.cpp
+++ b/src/harfbuzz-shaper.cpp
@@ -646,6 +646,137 @@ void HB_GetCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,
     }
 }
 
+
+enum BreakRule { NoBreak = 0, Break = 1, Middle = 2 };
+
+static const hb_uint8 wordbreakTable[HB_Word_ExtendNumLet + 1][HB_Word_ExtendNumLet + 1] = {
+//        Other    Format   Katakana ALetter  MidLetter MidNum  Numeric  ExtendNumLet
+    {   Break,   Break,   Break,   Break,   Break,   Break,   Break,   Break }, // Other
+    {   Break,   Break,   Break,   Break,   Break,   Break,   Break,   Break }, // Format 
+    {   Break,   Break, NoBreak,   Break,   Break,   Break,   Break, NoBreak }, // Katakana
+    {   Break,   Break,   Break, NoBreak,  Middle,   Break, NoBreak, NoBreak }, // ALetter
+    {   Break,   Break,   Break,   Break,   Break,   Break,   Break,   Break }, // MidLetter
+    {   Break,   Break,   Break,   Break,   Break,   Break,   Break,   Break }, // MidNum
+    {   Break,   Break,   Break, NoBreak,   Break,  Middle, NoBreak, NoBreak }, // Numeric
+    {   Break,   Break, NoBreak, NoBreak,   Break,   Break, NoBreak, NoBreak }, // ExtendNumLet
+};
+
+void HB_GetWordBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
+                          const HB_ScriptItem * /*items*/, hb_uint32 /*numItems*/,
+                          HB_CharAttributes *attributes)
+{
+    if (stringLength == 0)
+        return;
+    uint brk = HB_GetWordClass(string[0]);
+    attributes[0].wordBoundary = true;
+    for (hb_uint32 i = 1; i < stringLength; ++i) {
+        if (!attributes[i].charStop) {
+            attributes[i].wordBoundary = false;
+            continue;
+        }
+        hb_uint32 nbrk = HB_GetWordClass(string[i]);
+        if (nbrk == HB_Word_Format) {
+            attributes[i].wordBoundary = (HB_GetSentenceClass(string[i-1]) == HB_Sentence_Sep);
+            continue;
+        }
+        BreakRule rule = (BreakRule)wordbreakTable[brk][nbrk];
+        if (rule == Middle) {
+            rule = Break;
+            hb_uint32 lookahead = i + 1;
+            while (lookahead < stringLength) {
+                hb_uint32 testbrk = HB_GetWordClass(string[lookahead]);
+                if (testbrk == HB_Word_Format && HB_GetSentenceClass(string[lookahead]) != HB_Sentence_Sep) {
+                    ++lookahead;
+                    continue;
+                }
+                if (testbrk == brk) {
+                    rule = NoBreak;
+                    while (i < lookahead)
+                        attributes[i++].wordBoundary = false;
+                    nbrk = testbrk;
+                }
+                break;
+            }
+        }
+        attributes[i].wordBoundary = (rule == Break);
+        brk = nbrk;
+    }
+}
+
+
+enum SentenceBreakStates {
+    SB_Initial,
+    SB_Upper,
+    SB_UpATerm, 
+    SB_ATerm,
+    SB_ATermC, 
+    SB_ACS, 
+    SB_STerm, 
+    SB_STermC, 
+    SB_SCS,
+    SB_BAfter, 
+    SB_Break,
+    SB_Look
+};
+
+static const hb_uint8 sentenceBreakTable[HB_Sentence_Close + 1][HB_Sentence_Close + 1] = {
+//        Other       Sep         Format      Sp          Lower       Upper       OLetter     Numeric     ATerm       STerm       Close
+      { SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper  , SB_Initial, SB_Initial, SB_ATerm  , SB_STerm  , SB_Initial }, // SB_Initial,
+      { SB_Initial, SB_BAfter , SB_Upper  , SB_Initial, SB_Initial, SB_Upper  , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm  , SB_Initial }, // SB_Upper
+      
+      { SB_Look   , SB_BAfter , SB_UpATerm, SB_ACS    , SB_Initial, SB_Upper  , SB_Break  , SB_Initial, SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_UpATerm
+      { SB_Look   , SB_BAfter , SB_ATerm  , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Initial, SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_ATerm
+      { SB_Look   , SB_BAfter , SB_ATermC , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Look   , SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_ATermC,
+      { SB_Look   , SB_BAfter , SB_ACS    , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Look   , SB_ATerm  , SB_STerm  , SB_Look    }, // SB_ACS,
+      
+      { SB_Break  , SB_BAfter , SB_STerm  , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_STermC  }, // SB_STerm,
+      { SB_Break  , SB_BAfter , SB_STermC , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_STermC  }, // SB_STermC,
+      { SB_Break  , SB_BAfter , SB_SCS    , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_Break   }, // SB_SCS,
+      { SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break   }, // SB_BAfter,
+};
+
+void HB_GetSentenceBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
+                              const HB_ScriptItem */*items*/, hb_uint32 /*numItems*/,
+                              HB_CharAttributes *attributes)
+{
+    if (stringLength == 0)
+        return;
+    hb_uint32 brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[0])];
+    attributes[0].sentenceBoundary = true;
+    for (hb_uint32 i = 1; i < stringLength; ++i) {
+        if (!attributes[i].charStop) {
+            attributes[i].sentenceBoundary = false;
+            continue;
+        }
+        brk = sentenceBreakTable[brk][HB_GetSentenceClass(string[i])];
+        if (brk == SB_Look) {
+            brk = SB_Break;
+            hb_uint32 lookahead = i + 1;
+            while (lookahead < stringLength) {
+                hb_uint32 sbrk = HB_GetSentenceClass(string[lookahead]);
+                if (sbrk != HB_Sentence_Other && sbrk != HB_Sentence_Numeric && sbrk != HB_Sentence_Close) {
+                    break;
+                } else if (sbrk == HB_Sentence_Lower) {
+                    brk = SB_Initial;
+                    break;
+                }
+                ++lookahead;
+            }
+            if (brk == SB_Initial) {
+                while (i < lookahead)
+                    attributes[i++].sentenceBoundary = false;
+            }
+        }
+        if (brk == SB_Break) {
+            attributes[i].sentenceBoundary = true;
+            brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[i])];
+        } else {
+            attributes[i].sentenceBoundary = false;
+        }
+    }
+}
+
+
 static inline char *tag_to_string(HB_UInt tag)
 {
     static char string[5];
diff --git a/src/harfbuzz-shaper.h b/src/harfbuzz-shaper.h
index 108566a..864c3f4 100644
--- a/src/harfbuzz-shaper.h
+++ b/src/harfbuzz-shaper.h
@@ -107,16 +107,28 @@ typedef enum {
 
 
 typedef struct {
-    /*HB_LineBreakType*/ unsigned int lineBreakType  :2;
-    /*HB_Bool*/ unsigned int whiteSpace              :1;     /* A unicode whitespace character, except NBSP, ZWNBSP */
-    /*HB_Bool*/ unsigned int charStop                :1;     /* Valid cursor position (for left/right arrow) */
-    unsigned int unused                  :4;
+    /*HB_LineBreakType*/ hb_uint8 lineBreakType  :2;
+    /*HB_Bool*/ hb_uint8 whiteSpace              :1;     /* A unicode whitespace character, except NBSP, ZWNBSP */
+    /*HB_Bool*/ hb_uint8 charStop                :1;     /* Valid cursor position (for left/right arrow) */
+    /*HB_Bool*/ hb_uint8 wordBoundary            :1;
+    /*HB_Bool*/ hb_uint8 sentenceBoundary        :1;
+    hb_uint8 unused                  :2;
 } HB_CharAttributes;
 
 void HB_GetCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,
                           const HB_ScriptItem *items, hb_uint32 numItems,
                           HB_CharAttributes *attributes);
 
+/* requires HB_GetCharAttributes to be called before */
+void HB_GetWordBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
+                          const HB_ScriptItem *items, hb_uint32 numItems,
+                          HB_CharAttributes *attributes);
+
+/* requires HB_GetCharAttributes to be called before */
+void HB_GetSentenceBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
+                              const HB_ScriptItem *items, hb_uint32 numItems,
+                              HB_CharAttributes *attributes);
+
 
 typedef enum {
     HB_LeftToRight = 0,
commit 6cf5502bbef801ff2b5dc4169584e54237dc764e
Author: Lars Knoll <lars at trolltech.com>
Date:   Tue Oct 23 14:48:33 2007 +0200

    correctly set grapheme boundaries for hangul.
    
    Signed-off-by: Simon Hausmann <shausman at trolltech.com>

diff --git a/src/harfbuzz-shaper.cpp b/src/harfbuzz-shaper.cpp
index 5f951b5..f775762 100644
--- a/src/harfbuzz-shaper.cpp
+++ b/src/harfbuzz-shaper.cpp
@@ -110,7 +110,7 @@ static const hb_uint8 graphemeTable[HB_Grapheme_LVT + 1][HB_Grapheme_LVT + 1] =
     { false, true , true , true , false, false, false, false, false, false }, // Extend,
     { true , true , true , true , true , false, true , true , true , true  }, // L, 
     { true , true , true , true , true , false, false, true , false, true  }, // V, 
-    { true , true , true , true , true , false, false, false, false, false }, // T, 
+    { true , true , true , true , true , true , false, false, false, false }, // T, 
     { true , true , true , true , true , false, true , true , true , true  }, // LV, 
     { true , true , true , true , true , false, true , true , true , true  }, // LVT
 };
commit 04278e1188744c6ddb7c6c8fa00cdfb955d04c7e
Author: Lars Knoll <lars at trolltech.com>
Date:   Tue Oct 23 12:16:06 2007 +0200

    modification to harfbuzz to get proper support for grapheme boundaries in there.
    
    Signed-off-by: Simon Hausmann <shausman at trolltech.com>

diff --git a/src/harfbuzz-external.h b/src/harfbuzz-external.h
index db887f2..520c571 100644
--- a/src/harfbuzz-external.h
+++ b/src/harfbuzz-external.h
@@ -77,7 +77,22 @@ typedef enum
     HB_Symbol_Other              /*   So */
 } HB_CharCategory;
 
+typedef enum
+{
+    HB_Grapheme_Other, 
+    HB_Grapheme_CR,
+    HB_Grapheme_LF,
+    HB_Grapheme_Control,
+    HB_Grapheme_Extend,
+    HB_Grapheme_L, 
+    HB_Grapheme_V, 
+    HB_Grapheme_T, 
+    HB_Grapheme_LV, 
+    HB_Grapheme_LVT
+} HB_GraphemeClass;
+
 HB_LineBreakClass HB_GetLineBreakClass(HB_UChar32 ch);
+void HB_GetGraphemeAndLineBreakClass(HB_UChar32 ch, HB_GraphemeClass *grapheme, HB_LineBreakClass *lineBreak);
 void HB_GetUnicodeCharProperties(HB_UChar32 ch, HB_CharCategory *category, int *combiningClass);
 HB_CharCategory HB_GetUnicodeCharCategory(HB_UChar32 ch);
 int HB_GetUnicodeCharCombiningClass(HB_UChar32 ch);
diff --git a/src/harfbuzz-shaper.cpp b/src/harfbuzz-shaper.cpp
index a08d570..5f951b5 100644
--- a/src/harfbuzz-shaper.cpp
+++ b/src/harfbuzz-shaper.cpp
@@ -100,14 +100,30 @@ static const hb_uint8 breakTable[HB_LineBreak_JT+1][HB_LineBreak_JT+1] =
 #undef CP
 #undef PB
 
-
+static const hb_uint8 graphemeTable[HB_Grapheme_LVT + 1][HB_Grapheme_LVT + 1] =
+{
+//      Other, CR,    LF,    Control,Extend,L,    V,     T,     LV,    LVT
+    { true , true , true , true , true , true , true , true , true , true  }, // Other, 
+    { true , true , true , true , true , true , true , true , true , true  }, // CR,
+    { true , false, true , true , true , true , true , true , true , true  }, // LF,
+    { true , true , true , true , true , true , true , true , true , true  }, // Control,
+    { false, true , true , true , false, false, false, false, false, false }, // Extend,
+    { true , true , true , true , true , false, true , true , true , true  }, // L, 
+    { true , true , true , true , true , false, false, true , false, true  }, // V, 
+    { true , true , true , true , true , false, false, false, false, false }, // T, 
+    { true , true , true , true , true , false, true , true , true , true  }, // LV, 
+    { true , true , true , true , true , false, true , true , true , true  }, // LVT
+};
+    
 static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttributes *charAttributes)
 {
     if (!len)
         return;
 
     // ##### can this fail if the first char is a surrogate?
-    int cls = HB_GetLineBreakClass(*uc);
+    HB_LineBreakClass cls;
+    HB_GraphemeClass grapheme;
+    HB_GetGraphemeAndLineBreakClass(*uc, &grapheme, &cls);
     // handle case where input starts with an LF
     if (cls == HB_LineBreak_LF)
         cls = HB_LineBreak_BK;
@@ -120,14 +136,17 @@ static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttribute
         charAttributes[i].whiteSpace = false;
         charAttributes[i].charStop = true;
 
-        int ncls = HB_GetLineBreakClass(uc[i]);
+        HB_UChar32 code = uc[i];
+        HB_GraphemeClass ngrapheme;
+        HB_LineBreakClass ncls;
+        HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls);
         // handle surrogates
         if (ncls == HB_LineBreak_SG) {
             if (HB_IsHighSurrogate(uc[i]) && i < len - 1 && HB_IsLowSurrogate(uc[i+1])) {
                 continue;
             } else if (HB_IsLowSurrogate(uc[i]) && HB_IsHighSurrogate(uc[i-1])) {
-                HB_UChar32 code = HB_SurrogateToUcs4(uc[i-1], uc[i]);
-                ncls = HB_GetLineBreakClass(code);
+                code = HB_SurrogateToUcs4(uc[i-1], uc[i]);
+                HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls);
                 charAttributes[i].charStop = false;
             } else {
                 ncls = HB_LineBreak_AL;
@@ -137,8 +156,8 @@ static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttribute
         // set white space and char stop flag
         if (ncls >= HB_LineBreak_SP)
             charAttributes[i].whiteSpace = true;
-        if (ncls == HB_LineBreak_CM)
-            charAttributes[i].charStop = false;
+
+        charAttributes[i].charStop = graphemeTable[ngrapheme][grapheme];
 
         HB_LineBreakType lineBreakType = HB_NoBreak;
         if (cls >= HB_LineBreak_LF) {
@@ -197,6 +216,7 @@ static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttribute
         cls = ncls;
     next_no_cls_update:
         lcls = ncls;
+        grapheme = ngrapheme;
         charAttributes[i-1].lineBreakType = lineBreakType;
     }
     charAttributes[len-1].lineBreakType = HB_ForcedBreak;