[Libreoffice-commits] core.git: sal/qa sal/textenc

Stephan Bergmann sbergman at redhat.com
Wed Sep 13 06:35:16 UTC 2017


 sal/qa/rtl/textenc/rtl_textcvt.cxx |  332 +++++++++++++++++++++++++++++++++----
 sal/textenc/tcvtutf8.cxx           |   73 +++++---
 2 files changed, 352 insertions(+), 53 deletions(-)

New commits:
commit 08e78607ec6bc820c52ab3df1a5d3738e049b90d
Author: Stephan Bergmann <sbergman at redhat.com>
Date:   Wed Sep 13 08:28:32 2017 +0200

    Make reading UTF-8 strict
    
    Consider non-shortest forms, surrogates, and representations of values larger
    than 0x10FFFF (which can even cover five or six bytes, for historical reasons)
    as "invalid" (they used to be considered as "undefined" instead).
    
    This is in response to fc670f637d4271246691904fd649358ce2e7be59 "svtools: HTML
    import: don't put lone surrogates in OUString" (which can now be reverted again
    in a follow-up commit).  My fear would have been that some places in the code
    rely on the original, relaxed handling, but at least 'make check' still
    succeeded for me.
    
    Change-Id: I017e6c04ed3c577c3694b417167f853987a1d1ce

diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx b/sal/qa/rtl/textenc/rtl_textcvt.cxx
index d698bc22cd74..3c36852bebfc 100644
--- a/sal/qa/rtl/textenc/rtl_textcvt.cxx
+++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx
@@ -453,6 +453,8 @@ public:
 
     void testComplexCut();
 
+    void testInvalidUtf8();
+
     void testSRCBUFFERTOSMALL();
 
     void testMime();
@@ -465,6 +467,7 @@ public:
     CPPUNIT_TEST(testSingleByte);
     CPPUNIT_TEST(testComplex);
     CPPUNIT_TEST(testComplexCut);
+    CPPUNIT_TEST(testInvalidUtf8);
     CPPUNIT_TEST(testSRCBUFFERTOSMALL);
     CPPUNIT_TEST(testMime);
     CPPUNIT_TEST(testWindows);
@@ -2330,35 +2333,6 @@ void Test::testComplex() {
               true,
               false,
               RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
-            { RTL_TEXTENCODING_UTF8,
-              RTL_CONSTASCII_STRINGPARAM(
-                  "\xC0\x80\xE0\x80\x81\xF0\x80\x80\x82\xF8\x80\x80\x80\x83"
-                  "\xFC\x80\x80\x80\x80\x84"),
-              { 0x0000,0x0001,0x0002,0x0003,0x0004 },
-              5,
-              false,
-              true,
-              false,
-              false,
-              RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
-            { RTL_TEXTENCODING_UTF8,
-              RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\xED\xB4\x93"),
-              { 0xD849,0xDD13 },
-              2,
-              false,
-              true,
-              false,
-              false,
-              RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
-            { RTL_TEXTENCODING_UTF8,
-              RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\x41"),
-              { 0xD849,0x0041 },
-              2,
-              false,
-              true,
-              false,
-              false,
-              RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
 
             // Test Java UTF-8:
 
@@ -2664,6 +2638,306 @@ void Test::testComplexCut() {
 #endif
 }
 
+void Test::testInvalidUtf8() {
+    // UTF-8, invalid bytes:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\x80\xBF\xFE\xFF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD\uFFFD\uFFFD\uFFFD"),
+            OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, non-shortest two-byte sequence:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xC0\x80"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(2), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, cut two-byte sequence:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xC0"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(0), size);
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL, info);
+        CPPUNIT_ASSERT(converted <= 1);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, non-shortest three-byte sequence:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x9F\xBF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(3), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, cut three-byte sequence:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x80"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(0), size);
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL, info);
+        CPPUNIT_ASSERT(converted <= 2);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, cut three-byte sequence followed by more:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x80."), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(2), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD."), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(3), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, surrogates:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr,
+            RTL_CONSTASCII_STRINGPARAM("\xED\xA0\x80\xED\xB0\x80"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(2), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(6), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, non-shortest four-byte sequence:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xF0\x8F\xBF\xBF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, too-large four-byte sequence:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xF4\x90\x80\x80"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, five-byte sequence:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr,
+            RTL_CONSTASCII_STRINGPARAM("\xFB\xBF\xBF\xBF\xBF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(5), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, six-byte sequence:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr,
+            RTL_CONSTASCII_STRINGPARAM("\xFD\xBF\xBF\xBF\xBF\xBF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(6), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // Java UTF-8, U+0000:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_JAVA_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\0"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // Java UTF-8, U+10000:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_JAVA_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM(u8"\U00010000"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+}
+
 void Test::testSRCBUFFERTOSMALL() {
     rtl_TextToUnicodeConverter cv = rtl_createTextToUnicodeConverter(
         RTL_TEXTENCODING_EUC_JP);
diff --git a/sal/textenc/tcvtutf8.cxx b/sal/textenc/tcvtutf8.cxx
index c1a6949c01c5..4943f6987a29 100644
--- a/sal/textenc/tcvtutf8.cxx
+++ b/sal/textenc/tcvtutf8.cxx
@@ -30,6 +30,7 @@
 struct ImplUtf8ToUnicodeContext
 {
     sal_uInt32 nUtf32;
+    int nBytes;
     int nShift;
     bool bCheckBom;
 };
@@ -65,18 +66,9 @@ sal_Size ImplConvertUtf8ToUnicode(
     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
 {
-    /*
-       This function is very liberal with the UTF-8 input.  Accepted are:
-       - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
-       - surrogates (e.g., ED A0 80 to represent U+D800)
-       - encodings with up to six bytes (everything outside the range
-         U+0000..10FFFF is considered "undefined")
-       The first two of these points allow this routine to translate from both
-       RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
-      */
-
     bool bJavaUtf8 = pData != nullptr;
     sal_uInt32 nUtf32 = 0;
+    int nBytes;
     int nShift = -1;
     bool bCheckBom = true;
     sal_uInt32 nInfo = 0;
@@ -88,19 +80,22 @@ sal_Size ImplConvertUtf8ToUnicode(
     if (pContext != nullptr)
     {
         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
+        nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
     }
 
     while (pSrcBufPtr < pSrcBufEnd)
     {
-        bool bUndefined = false;
         bool bConsume = true;
         sal_uInt32 nChar = *pSrcBufPtr++;
         if (nShift < 0)
+            // Allow (illegal) 5 and 6 byte sequences, so they are read as a
+            // single individual bad character:
             if (nChar <= 0x7F)
             {
                 nUtf32 = nChar;
+                nBytes = 1;
                 goto transform;
             }
             else if (nChar <= 0xBF)
@@ -108,26 +103,31 @@ sal_Size ImplConvertUtf8ToUnicode(
             else if (nChar <= 0xDF)
             {
                 nUtf32 = (nChar & 0x1F) << 6;
+                nBytes = 2;
                 nShift = 0;
             }
             else if (nChar <= 0xEF)
             {
                 nUtf32 = (nChar & 0x0F) << 12;
+                nBytes = 3;
                 nShift = 6;
             }
             else if (nChar <= 0xF7)
             {
                 nUtf32 = (nChar & 0x07) << 18;
+                nBytes = 4;
                 nShift = 12;
             }
             else if (nChar <= 0xFB)
             {
                 nUtf32 = (nChar & 0x03) << 24;
+                nBytes = 5;
                 nShift = 18;
             }
             else if (nChar <= 0xFD)
             {
                 nUtf32 = (nChar & 0x01) << 30;
+                nBytes = 6;
                 nShift = 24;
             }
             else
@@ -154,28 +154,52 @@ sal_Size ImplConvertUtf8ToUnicode(
         continue;
 
     transform:
-        if (!bCheckBom || nUtf32 != 0xFEFF
+        if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
             || bJavaUtf8)
         {
+            switch (nBytes) {
+            case 1:
+                if (bJavaUtf8 && nUtf32 == 0) {
+                    goto bad_input;
+                }
+                break;
+            case 2:
+                if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
+                    goto bad_input;
+                }
+                break;
+            case 3:
+                if (nUtf32 < 0x800
+                    || (!bJavaUtf8
+                        && (rtl::isHighSurrogate(nUtf32)
+                            || rtl::isLowSurrogate(nUtf32))))
+                {
+                    goto bad_input;
+                }
+                break;
+            case 4:
+                if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
+                    || bJavaUtf8)
+                {
+                    goto bad_input;
+                }
+                break;
+            default:
+                goto bad_input;
+            }
             if (nUtf32 <= 0xFFFF)
                 if (pDestBufPtr != pDestBufEnd)
                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
                 else
                     goto no_output;
-            else if (rtl::isUnicodeCodePoint(nUtf32))
-                if (pDestBufEnd - pDestBufPtr >= 2)
-                {
-                    *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
-                    *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
-                }
-                else
-                    goto no_output;
-            else
+            else if (pDestBufEnd - pDestBufPtr >= 2)
             {
-                bUndefined = true;
-                goto bad_input;
+                *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
+                *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
             }
+            else
+                goto no_output;
         }
         nShift = -1;
         bCheckBom = false;
@@ -183,7 +207,7 @@ sal_Size ImplConvertUtf8ToUnicode(
 
     bad_input:
         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
-                    bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
+                    false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                     &nInfo))
         {
         case sal::detail::textenc::BAD_INPUT_STOP:
@@ -238,6 +262,7 @@ sal_Size ImplConvertUtf8ToUnicode(
     if (pContext != nullptr)
     {
         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
+        static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
     }


More information about the Libreoffice-commits mailing list