[Libreoffice-commits] core.git: sc/qa sc/source

Dennis Francis (via logerrit) logerrit at kemper.freedesktop.org
Wed Aug 18 18:29:06 UTC 2021


 sc/qa/unit/subsequent_export-test.cxx |    4 -
 sc/source/filter/oox/richstring.cxx   |  112 +++++++++++++++++++++++++++++++++-
 2 files changed, 113 insertions(+), 3 deletions(-)

New commits:
commit 2d5ba784a341aea1b7b2403842d2521d1548ea8f
Author:     Dennis Francis <dennis.francis at collabora.com>
AuthorDate: Tue Aug 17 14:38:21 2021 +0530
Commit:     Andras Timar <andras.timar at collabora.com>
CommitDate: Wed Aug 18 20:28:28 2021 +0200

    tdf#118470: sc oox: recover escaped unicode chars in strings import
    
    according to OOX open spec 2.1.1742 Part 1 Section 22.9.2.19, ST_Xstring
    (Escaped String). In this implementation, some restrictions mentioned in
    this spec are not kept for simplicity.
    
    Change-Id: If27797a9625d49be54c600c8a864965f1101ceb1
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120665
    Tested-by: Jenkins
    Reviewed-by: Andras Timar <andras.timar at collabora.com>

diff --git a/sc/qa/unit/subsequent_export-test.cxx b/sc/qa/unit/subsequent_export-test.cxx
index 31c0aae3dd71..e1d3f78bb93b 100644
--- a/sc/qa/unit/subsequent_export-test.cxx
+++ b/sc/qa/unit/subsequent_export-test.cxx
@@ -3672,9 +3672,9 @@ void ScExportTest::testTdf80149()
     CPPUNIT_ASSERT_EQUAL(OUString("row 1"), rDoc.GetString(0, 0, 0));
 
     // Without the fix in place, this test would have failed with
-    // - Expected: Character 0x16 is here ->>_x0016_<<--
+    // - Expected: Character 0x16 is here ->><<--
     // - Actual  :
-    CPPUNIT_ASSERT_EQUAL(OUString("Character 0x16 is here ->>_x0016_<<--"), rDoc.GetString(1, 0, 0));
+    CPPUNIT_ASSERT_EQUAL(OUString("Character 0x16 is here ->><<--"), rDoc.GetString(1, 0, 0));
     CPPUNIT_ASSERT_EQUAL(OUString("File opens in libre office, but can't be saved as xlsx"), rDoc.GetString(2, 0, 0));
     CPPUNIT_ASSERT_EQUAL(OUString("row 2"), rDoc.GetString(0, 1, 0));
     CPPUNIT_ASSERT_EQUAL(OUString("Subsequent rows get truncated"), rDoc.GetString(1, 1, 0));
diff --git a/sc/source/filter/oox/richstring.cxx b/sc/source/filter/oox/richstring.cxx
index a9d058f75ba5..7f8809824caa 100644
--- a/sc/source/filter/oox/richstring.cxx
+++ b/sc/source/filter/oox/richstring.cxx
@@ -48,6 +48,116 @@ bool lclNeedsRichTextFormat( const oox::xls::Font* pFont )
     return pFont && pFont->needsRichTextFormat();
 }
 
+sal_Int32 lcl_getHexLetterValue(sal_Unicode nCode)
+{
+    // Value (0..15) of a single hex digit in either letter case, -1 for anything else.
+    sal_Int32 nValue = -1;
+    if ('0' <= nCode && nCode <= '9')
+        nValue = nCode - '0';
+    else if ('A' <= nCode && nCode <= 'F')
+        nValue = nCode - 'A' + 10;
+    else if ('a' <= nCode && nCode <= 'f')
+        nValue = nCode - 'a' + 10;
+
+    return nValue;
+}
+
+bool lcl_validEscape(sal_Unicode nCode)
+{
+    // CR, LF, TAB and '_' are valid XML chars that can still be escaped (ignoring the restrictions)
+    // as in the OOX open spec 2.1.1742 Part 1 Section 22.9.2.19, ST_Xstring (Escaped String).
+    const bool bEscapableXmlChar
+        = nCode == 0x0009 || nCode == 0x000A || nCode == 0x000D || nCode == 0x005F;
+
+    // Any other valid XML char in the basic multilingual plane cannot be escaped.
+    const bool bPlainXmlChar
+        = (nCode >= 0x0020 && nCode <= 0xD7FF) || (nCode >= 0xE000 && nCode <= 0xFFFD);
+
+    return bEscapableXmlChar || !bPlainXmlChar;
+}
+
+OUString lcl_unEscapeUnicodeChars(const OUString& rSrc)
+{
+    // Decodes "_xHHHH_" escapes in rSrc (1 to 4 hex digits are accepted), e.g. "_x000D_" -> 0x000D.
+    // Returns rSrc itself when no escape was replaced.
+    sal_Int32 nLen = rSrc.getLength();
+    if (!nLen)
+        return rSrc;
+
+    sal_Int32 nStart = 0;
+    bool bFound = false; // set once at least one escape has actually been replaced
+    const OUString aPrefix = "_x";
+    sal_Int32 nPrefixStart = rSrc.indexOf(aPrefix, nStart);
+
+    if (nPrefixStart == -1)
+        return rSrc;
+
+    OUStringBuffer aBuf(rSrc);
+    sal_Int32 nOffset = 0; // index offset in aBuf w.r.t. rSrc (chars removed so far).
+
+    do
+    {
+        sal_Int32 nEnd = -1;
+        sal_Unicode nCode = 0;
+        bool bFoundThis = false;
+        for (sal_Int32 nIdx = 0; nIdx < 5; ++nIdx) // up to 4 hex digits plus the closing '_'
+        {
+            sal_Int32 nThisIdx = nPrefixStart + nIdx + 2;
+            if (nThisIdx >= nLen)
+                break;
+
+            sal_Unicode nThisCode = rSrc[nThisIdx];
+            sal_Int32 nLetter = lcl_getHexLetterValue(nThisCode);
+            // The char right after "_x" has to be a hex digit.
+            if (!nIdx && nLetter < 0)
+                break;
+
+            if (nLetter >= 0)
+            {
+                nCode = (nCode << 4) + static_cast<sal_Unicode>(nLetter);
+            }
+            else if (nThisCode == '_')
+            {
+                nEnd = nThisIdx + 1;
+                bFoundThis = true;
+                break;
+            }
+            else
+            {
+                break;
+            }
+        }
+
+        if (bFoundThis)
+        {
+            // nEnd is already set inside the inner loop in this case.
+            if (lcl_validEscape(nCode))
+            {
+                bFound = true;
+                sal_Int32 nEscStrLen = nEnd - nPrefixStart;
+                aBuf.remove(nPrefixStart - nOffset, nEscStrLen);
+                aBuf.insert(nPrefixStart - nOffset, nCode);
+                // The whole escape sequence became a single char in aBuf.
+                nOffset += nEscStrLen - 1;
+            }
+        }
+        else
+        {
+            // Start the next search just after last "_x"
+            nEnd = nPrefixStart + 2;
+        }
+
+        nStart = nEnd;
+        nPrefixStart = rSrc.indexOf(aPrefix, nStart);
+    }
+    while (nPrefixStart != -1);
+
+    if (bFound)
+        return aBuf.makeStringAndClear();
+    // Nothing was replaced: return the unchanged input rather than a string rebuilt from aBuf.
+    return rSrc;
+}
+
 } // namespace
 
 RichStringPortion::RichStringPortion( const WorkbookHelper& rHelper ) :
@@ -59,7 +169,7 @@ RichStringPortion::RichStringPortion( const WorkbookHelper& rHelper ) :
 
 void RichStringPortion::setText( const OUString& rText )
 {
-    maText = rText;
+    maText = lcl_unEscapeUnicodeChars(rText);
 }
 
 FontRef const & RichStringPortion::createFont()


More information about the Libreoffice-commits mailing list