[Libreoffice-commits] .: Branch 'libreoffice-3-4-2' - i18npool/qa i18npool/source

Michael Meeks michael at kemper.freedesktop.org
Tue Jul 19 03:52:41 PDT 2011


 i18npool/qa/cppunit/test_breakiterator.cxx          |   51 ++++-
 i18npool/source/breakiterator/breakiteratorImpl.cxx |  200 ++++++++++++++------
 2 files changed, 191 insertions(+), 60 deletions(-)

New commits:
commit c0d66a599cf34b9b2410727b9015bd3aac58b024
Author: Caolán McNamara <caolanm at redhat.com>
Date:   Tue Jul 19 09:21:10 2011 +0100

    Resolves: fdo#38095 half&full width forms need to remain asian
    
    Signed-off-by: Fridrich Štrba <fridrich.strba at bluewin.ch>
    Signed-off-by: Michael Meeks <michael.meeks at novell.com>

diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 0f42fe7..7b4df32 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -57,11 +57,13 @@ public:
     void testLineBreaking();
     void testGraphemeIteration();
     void testWeak();
+    void testAsian();
 
     CPPUNIT_TEST_SUITE(TestBreakIterator);
     CPPUNIT_TEST(testLineBreaking);
     CPPUNIT_TEST(testGraphemeIteration);
     CPPUNIT_TEST(testWeak);
+    CPPUNIT_TEST(testAsian);
     CPPUNIT_TEST_SUITE_END();
 
 private:
@@ -104,7 +106,7 @@ void TestBreakIterator::testGraphemeIteration()
     aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("IN"));
 
     {
-        sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
+        const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
         ::rtl::OUString aTest1(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
 
         sal_Int32 nDone=0;
@@ -118,7 +120,7 @@ void TestBreakIterator::testGraphemeIteration()
     }
 
     {
-        sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
+        const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
         ::rtl::OUString aTest1(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
 
         sal_Int32 nDone=0;
@@ -132,7 +134,7 @@ void TestBreakIterator::testGraphemeIteration()
     }
 
     {
-        sal_Unicode TA_HALANT_MA_HALANT_YA  [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
+        const sal_Unicode TA_HALANT_MA_HALANT_YA  [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
         ::rtl::OUString aTest1(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
 
         sal_Int32 nDone=0;
@@ -156,10 +158,12 @@ void TestBreakIterator::testWeak()
     aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
 
     {
-        sal_Unicode WEAKS[] =
+        const sal_Unicode WEAKS[] =
         {
             0x0001, 0x0002,
             0x0020, 0x00A0,
+            0x2150, 0x215F, //Number Forms, fractions
+            0x2160, 0x2180, //Number Forms, roman numerals
             0x2200, 0x22FF, //Mathematical Operators
             0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
             0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
@@ -184,6 +188,45 @@ void TestBreakIterator::testWeak()
     }
 }
 
+//A test to ensure that certain ranges and codepoints that are categorized as
+//asian remain as asian, so that existing docs that depend on this don't silently
+//change font for those asian chars.
+//See https://bugs.freedesktop.org/show_bug.cgi?id=38095
+void TestBreakIterator::testAsian()
+{
+    lang::Locale aLocale;
+    aLocale.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
+    aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
+
+    {
+        const sal_Unicode ASIANS[] =
+        {
+            //some typical CJK chars
+            0x4E00, 0x62FF,
+            //The full HalfWidth and FullWidth block has historically been
+            //designated as taking the CJK font :-(
+            //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
+            //UAX24 as "Common" i.e. by that logic WEAK
+            0xFF10, 0xFF19,
+            //HalfWidth and FullWidth forms of ASCII A-z, categorized under
+            //UAX24 as "Latin", i.e. by that logic LATIN
+            0xFF21, 0xFF5A
+        };
+        ::rtl::OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
+
+        for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
+        {
+            sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
+            rtl::OStringBuffer aMsg;
+            aMsg.append(RTL_CONSTASCII_STRINGPARAM("Char 0x"));
+            aMsg.append(static_cast<sal_Int32>(aAsians.getStr()[i]), 16);
+            aMsg.append(RTL_CONSTASCII_STRINGPARAM(" should have been asian"));
+            CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
+                nScript == i18n::ScriptType::ASIAN);
+        }
+    }
+}
+
 TestBreakIterator::TestBreakIterator()
 {
     m_xContext = cppu::defaultBootstrap_InitialComponentContext();
diff --git a/i18npool/source/breakiterator/breakiteratorImpl.cxx b/i18npool/source/breakiterator/breakiteratorImpl.cxx
index 601acef..377e575 100644
--- a/i18npool/source/breakiterator/breakiteratorImpl.cxx
+++ b/i18npool/source/breakiterator/breakiteratorImpl.cxx
@@ -443,67 +443,155 @@ sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
         return 0;
 }
 
-static sal_Int16 scriptTypes[] = {
-    ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
-    ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
-// 15
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
-    ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
-// 30
-    ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-// 45
-    ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
-    ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-// 60
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
-// 75
-    ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-// 90
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
-// 105
-    ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
-// 120
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-// 135
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
-    ScriptType::COMPLEX,
-    ScriptType::WEAK};
-
-#define scriptListCount SAL_N_ELEMENTS(scriptTypes)
+namespace
+{
+    //See unicode/uscript.h
+    static sal_Int16 scriptTypes[] =
+    {
+        ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
+        ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
+    // 15
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
+        ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
+    // 30
+        ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+    // 45
+        ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
+        ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+    // 60
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
+    // 75
+        ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+    // 90
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
+    // 105
+        ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
+    // 120
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+    // 135
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
+        ScriptType::COMPLEX,
+        ScriptType::WEAK
+    };
+
+#   define scriptTypesCount SAL_N_ELEMENTS(scriptTypes)
+
+    sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
+    {
+        sal_Int16 nRet;
+        int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
+        if (script < 0)
+            nRet = ScriptType::WEAK;
+        else if (static_cast<size_t>(script) >= SAL_N_ELEMENTS(scriptTypes))
+            nRet = ScriptType::COMPLEX;         // anything new is going to be pretty wild
+        else
+            nRet = scriptTypes[script];
+        return nRet;
+    }
+
+    struct UBlock2Script
+    {
+        UBlockCode from;
+        UBlockCode to;
+        sal_Int16 script;
+    };
+
+    static UBlock2Script scriptList[] =
+    {
+        {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
+        {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN},
+        {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
+        {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
+        {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
+        {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
+        {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
+        {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
+        {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
+        {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
+        {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
+        {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
+        {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
+        {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
+        {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
+        {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
+        {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
+        {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
+        {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
+    };
+
+    #define scriptListCount SAL_N_ELEMENTS(scriptList) // entry count of scriptList (the block table), not scriptTypes
+
+    //always sets rScriptType
+    //
+    //returns true for characters historically explicitly assigned to
+    //latin/weak/asian
+    //
+    //returns false for characters that were historically implicitly assigned
+    //to weak as unknown
+    bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType)
+    {
+        bool bKnown = true;
+        //handle specific characters always as weak:
+        //  0x01 - this breaks a word
+        //  0x02 - this can be inside a word
+        //  0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
+        if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
+            rScriptType = ScriptType::WEAK;
+        // workaround for Coptic
+        else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
+            rScriptType = ScriptType::LATIN;
+        else
+        {
+            UBlockCode block=ublock_getCode(currentChar);
+            size_t i = 0;
+            while (i < scriptListCount)
+            {
+                if (block <= scriptList[i].to)
+                    break;
+                ++i;
+            }
+            if (i < scriptListCount && block >= scriptList[i].from)
+                rScriptType = scriptList[i].script;
+            else
+            {
+                rScriptType = ScriptType::WEAK;
+                bKnown = false;
+            }
+        }
+        return bKnown;
+    }
+}
 
 sal_Int16  BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
 {
-        static sal_uInt32 lastChar = 0;
-        static sal_Int16 nRet = 0;
+    static sal_uInt32 lastChar = 0;
+    static sal_Int16 nRet = 0;
 
-        if (currentChar != lastChar) {
-            lastChar = currentChar;
+    if (currentChar != lastChar)
+    {
+        lastChar = currentChar;
 
-            int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
-            if (script < 0)
-                nRet = ScriptType::WEAK;
-            else if (static_cast<size_t>(script) >= SAL_N_ELEMENTS(scriptTypes))
-                nRet = ScriptType::COMPLEX;         // anything new is going to be pretty wild
-            else
-                nRet = scriptTypes[script];
-        }
-        return nRet;
+        if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
+            nRet = getScriptClassByUAX24Script(currentChar);
+    }
+
+    return nRet;
 }
 
 static inline sal_Bool operator == (const Locale& l1, const Locale& l2) {


More information about the Libreoffice-commits mailing list