[Libreoffice-commits] core.git: sax/source

dante (via logerrit) logerrit at kemper.freedesktop.org
Tue Jan 19 07:20:25 UTC 2021


 sax/source/expatwrap/saxwriter.cxx |  242 ++++++++++++++++++++++++-------------
 1 file changed, 157 insertions(+), 85 deletions(-)

New commits:
commit ae5ec5b944cf2378806498072c50d473a3ac62ed
Author:     dante <dante19031999 at gmail.com>
AuthorDate: Fri Jan 1 19:42:37 2021 +0100
Commit:     Stephan Bergmann <sbergman at redhat.com>
CommitDate: Tue Jan 19 08:19:46 2021 +0100

    Use customized XML entities on XML export.
    
    This will be mainly used on MathML export for Unicode characters.
    It will be used mostly for MathML.
    
    Change-Id: I59b96d44facbd01fa517317a0ae54d64d29b0a19
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/108562
    Tested-by: Jenkins
    Reviewed-by: Stephan Bergmann <sbergman at redhat.com>

diff --git a/sax/source/expatwrap/saxwriter.cxx b/sax/source/expatwrap/saxwriter.cxx
index f6a58b0bb4cb..04fd90762cd9 100644
--- a/sax/source/expatwrap/saxwriter.cxx
+++ b/sax/source/expatwrap/saxwriter.cxx
@@ -117,6 +117,9 @@ private:
     /// @throws SAXException
     void FinishStartElement();
 
+    // Search for the correct replacement
+    const ReplacementPair* findXMLReplacement(const sal_Unicode* pStr, sal_Int32 nStrLen);
+
 public:
     explicit SaxWriterHelper(Reference<XOutputStream> const& m_TempOut)
         : m_out(m_TempOut)
@@ -193,6 +196,10 @@ public:
     void setCustomEntityNames(
         const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>&
             replacements);
+
+    // Calculate length for convertToXML
+    sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization,
+                                bool bNormalizeWhitespace);
 };
 
 const bool g_bValidCharsBelow32[32] = {
@@ -282,10 +289,10 @@ bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen,
 
     for (sal_Int32 i = 0; i < nStrLen; i++)
     {
-        sal_uInt16 c = pStr[i];
+        sal_Unicode c = pStr[i];
         if (IsInvalidChar(c))
             bRet = false;
-        else if ((c >= 0x0001) && (c <= 0x007F))
+        else if ((c >= 0x0001) && (c <= 0x007F)) // Deal with ascii
         {
             if (bDoNormalization)
             {
@@ -414,26 +421,80 @@ bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen,
                 rPos++;
             }
         }
-        else if (c >= 0xd800 && c < 0xdc00)
-        {
-            // 1. surrogate: save (until 2. surrogate)
-            OSL_ENSURE(nSurrogate == 0, "left-over Unicode surrogate");
-            nSurrogate = ((c & 0x03ff) + 0x0040);
-        }
-        else if (c >= 0xdc00 && c < 0xe000)
+        else
         {
-            // 2. surrogate: write as UTF-8
-            OSL_ENSURE(nSurrogate != 0, "lone 2nd Unicode surrogate");
+            // Deal with replacements
+            if (bDoNormalization && !m_Replacements.empty())
+            {
+                // search
+                const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i);
+
+                // replace
+                if (it != nullptr)
+                {
+                    OString name = ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8);
+                    if (rPos + name.getLength() > SEQUENCESIZE)
+                        AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>(name.getStr()),
+                                 name.getLength());
+                    else
+                    {
+                        memcpy(&(pTarget[rPos]), name.getStr(), name.getLength());
+                        rPos += name.getLength();
+                    }
+                    i += it->replacement.getLength() - 1;
+                    continue;
+                }
+            }
+
+            // Deal with other uniciode cases
+            if (c >= 0xd800 && c < 0xdc00)
+            {
+                // 1. surrogate: save (until 2. surrogate)
+                OSL_ENSURE(nSurrogate == 0, "left-over Unicode surrogate");
+                nSurrogate = ((c & 0x03ff) + 0x0040);
+            }
+            else if (c >= 0xdc00 && c < 0xe000)
+            {
+                // 2. surrogate: write as UTF-8
+                OSL_ENSURE(nSurrogate != 0, "lone 2nd Unicode surrogate");
+
+                nSurrogate = (nSurrogate << 10) | (c & 0x03ff);
+                if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000)
+                {
+                    sal_Int8 aBytes[] = { sal_Int8(0xF0 | ((nSurrogate >> 18) & 0x0F)),
+                                          sal_Int8(0x80 | ((nSurrogate >> 12) & 0x3F)),
+                                          sal_Int8(0x80 | ((nSurrogate >> 6) & 0x3F)),
+                                          sal_Int8(0x80 | ((nSurrogate >> 0) & 0x3F)) };
+                    if ((rPos + 4) > SEQUENCESIZE)
+                        AddBytes(pTarget, rPos, aBytes, 4);
+                    else
+                    {
+                        pTarget[rPos] = aBytes[0];
+                        rPos++;
+                        pTarget[rPos] = aBytes[1];
+                        rPos++;
+                        pTarget[rPos] = aBytes[2];
+                        rPos++;
+                        pTarget[rPos] = aBytes[3];
+                        rPos++;
+                    }
+                }
+                else
+                {
+                    OSL_FAIL("illegal Unicode character");
+                    bRet = false;
+                }
 
-            nSurrogate = (nSurrogate << 10) | (c & 0x03ff);
-            if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000)
+                // reset surrogate
+                nSurrogate = 0;
+            }
+            else if (c > 0x07FF)
             {
-                sal_Int8 aBytes[] = { sal_Int8(0xF0 | ((nSurrogate >> 18) & 0x0F)),
-                                      sal_Int8(0x80 | ((nSurrogate >> 12) & 0x3F)),
-                                      sal_Int8(0x80 | ((nSurrogate >> 6) & 0x3F)),
-                                      sal_Int8(0x80 | ((nSurrogate >> 0) & 0x3F)) };
-                if ((rPos + 4) > SEQUENCESIZE)
-                    AddBytes(pTarget, rPos, aBytes, 4);
+                sal_Int8 aBytes[]
+                    = { sal_Int8(0xE0 | ((c >> 12) & 0x0F)), sal_Int8(0x80 | ((c >> 6) & 0x3F)),
+                        sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
+                if ((rPos + 3) > SEQUENCESIZE)
+                    AddBytes(pTarget, rPos, aBytes, 3);
                 else
                 {
                     pTarget[rPos] = aBytes[0];
@@ -442,50 +503,24 @@ bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen,
                     rPos++;
                     pTarget[rPos] = aBytes[2];
                     rPos++;
-                    pTarget[rPos] = aBytes[3];
-                    rPos++;
                 }
             }
             else
             {
-                OSL_FAIL("illegal Unicode character");
-                bRet = false;
-            }
-
-            // reset surrogate
-            nSurrogate = 0;
-        }
-        else if (c > 0x07FF)
-        {
-            sal_Int8 aBytes[]
-                = { sal_Int8(0xE0 | ((c >> 12) & 0x0F)), sal_Int8(0x80 | ((c >> 6) & 0x3F)),
-                    sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
-            if ((rPos + 3) > SEQUENCESIZE)
-                AddBytes(pTarget, rPos, aBytes, 3);
-            else
-            {
-                pTarget[rPos] = aBytes[0];
-                rPos++;
-                pTarget[rPos] = aBytes[1];
-                rPos++;
-                pTarget[rPos] = aBytes[2];
-                rPos++;
-            }
-        }
-        else
-        {
-            sal_Int8 aBytes[]
-                = { sal_Int8(0xC0 | ((c >> 6) & 0x1F)), sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
-            if ((rPos + 2) > SEQUENCESIZE)
-                AddBytes(pTarget, rPos, aBytes, 2);
-            else
-            {
-                pTarget[rPos] = aBytes[0];
-                rPos++;
-                pTarget[rPos] = aBytes[1];
-                rPos++;
+                sal_Int8 aBytes[]
+                    = { sal_Int8(0xC0 | ((c >> 6) & 0x1F)), sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
+                if ((rPos + 2) > SEQUENCESIZE)
+                    AddBytes(pTarget, rPos, aBytes, 2);
+                else
+                {
+                    pTarget[rPos] = aBytes[0];
+                    rPos++;
+                    pTarget[rPos] = aBytes[1];
+                    rPos++;
+                }
             }
         }
+
         OSL_ENSURE(rPos <= SEQUENCESIZE, "not reset current position");
         if (rPos == SEQUENCESIZE)
             rPos = writeSequence();
@@ -848,7 +883,8 @@ bool SaxWriterHelper::comment(const OUString& rComment)
     return bRet;
 }
 
-sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, bool bNormalizeWhitespace)
+sal_Int32 SaxWriterHelper::calcXMLByteLength(const OUString& rStr, bool bDoNormalization,
+                                             bool bNormalizeWhitespace)
 {
     sal_Int32 nOutputLength = 0;
     sal_uInt32 nSurrogate = 0;
@@ -897,26 +933,45 @@ sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, bool bN
                 nOutputLength++;
             }
         }
-        else if (c >= 0xd800 && c < 0xdc00)
-        {
-            // save surrogate
-            nSurrogate = ((c & 0x03ff) + 0x0040);
-        }
-        else if (c >= 0xdc00 && c < 0xe000)
-        {
-            // 2. surrogate: write as UTF-8 (if range is OK
-            nSurrogate = (nSurrogate << 10) | (c & 0x03ff);
-            if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000)
-                nOutputLength += 4;
-            nSurrogate = 0;
-        }
-        else if (c > 0x07FF)
-        {
-            nOutputLength += 3;
-        }
         else
         {
-            nOutputLength += 2;
+            // Deal with replacements
+            if (bDoNormalization && !m_Replacements.empty())
+            {
+                // search
+                const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i);
+
+                if (it != nullptr)
+                {
+                    nOutputLength
+                        += ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8).getLength();
+                    i += it->replacement.getLength() - 1;
+                    continue;
+                }
+            }
+
+            // Deal with other unicode cases
+            if (c >= 0xd800 && c < 0xdc00)
+            {
+                // save surrogate
+                nSurrogate = ((c & 0x03ff) + 0x0040);
+            }
+            else if (c >= 0xdc00 && c < 0xe000)
+            {
+                // 2. surrogate: write as UTF-8 (if range is OK
+                nSurrogate = (nSurrogate << 10) | (c & 0x03ff);
+                if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000)
+                    nOutputLength += 4;
+                nSurrogate = 0;
+            }
+            else if (c > 0x07FF)
+            {
+                nOutputLength += 3;
+            }
+            else
+            {
+                nOutputLength += 2;
+            }
         }
 
         // surrogate processing
@@ -927,6 +982,23 @@ sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, bool bN
     return nOutputLength;
 }
 
+const ReplacementPair* SaxWriterHelper::findXMLReplacement(const sal_Unicode* pStr,
+                                                           sal_Int32 nStrLen)
+{
+    for (size_t iter = 0; iter < m_Replacements.size(); ++iter)
+    {
+        if (m_Replacements[iter].replacement.getLength() > nStrLen)
+            continue;
+        sal_Int32 matches = m_Replacements[iter].replacement.compareTo(
+            std::u16string_view(pStr, m_Replacements[iter].replacement.getLength()));
+        if (matches == 0)
+            return &m_Replacements[iter];
+        if (matches > 0)
+            return nullptr;
+    }
+    return nullptr;
+}
+
 /** returns position of first ascii 10 within the string, -1 when no 10 in string.
  */
 sal_Int32 getFirstLineBreak(const OUString& str) throw()
@@ -1115,7 +1187,7 @@ void SAXWriter::startElement(const OUString& aName, const Reference<XAttributeLi
         sal_Int32 nAttribCount = xAttribs.is() ? xAttribs->getLength() : 0;
 
         nLength++; // "<"
-        nLength += calcXMLByteLength(aName, false, false); // the tag name
+        nLength += m_pSaxWriterHelper->calcXMLByteLength(aName, false, false); // the tag name
 
         sal_Int16 n;
         for (n = 0; n < static_cast<sal_Int16>(nAttribCount); n++)
@@ -1123,13 +1195,13 @@ void SAXWriter::startElement(const OUString& aName, const Reference<XAttributeLi
             nLength++; // " "
             OUString tmp = xAttribs->getNameByIndex(n);
 
-            nLength += calcXMLByteLength(tmp, false, false);
+            nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, false, false);
 
             nLength += 2; // ="
 
             tmp = xAttribs->getValueByIndex(n);
 
-            nLength += calcXMLByteLength(tmp, true, true);
+            nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, true, true);
 
             nLength += 1; // "
         }
@@ -1191,7 +1263,7 @@ void SAXWriter::endElement(const OUString& aName)
         // only ascii chars allowed
         sal_Int32 nLength(0);
         if (m_bAllowLineBreak)
-            nLength = 3 + calcXMLByteLength(aName, false, false);
+            nLength = 3 + m_pSaxWriterHelper->calcXMLByteLength(aName, false, false);
         sal_Int32 nPrefix = getIndentPrefixLength(nLength);
 
         if (nPrefix >= 0)
@@ -1233,7 +1305,7 @@ void SAXWriter::characters(const OUString& aChars)
             {
                 sal_Int32 nFirstLineBreakOccurrence = getFirstLineBreak(aChars);
 
-                nLength = calcXMLByteLength(aChars, !m_bIsCDATA, false);
+                nLength = m_pSaxWriterHelper->calcXMLByteLength(aChars, !m_bIsCDATA, false);
                 nIndentPrefix = getIndentPrefixLength(
                     nFirstLineBreakOccurrence >= 0 ? nFirstLineBreakOccurrence : nLength);
             }
@@ -1280,11 +1352,11 @@ void SAXWriter::processingInstruction(const OUString& aTarget, const OUString& a
     if (m_bAllowLineBreak)
     {
         nLength = 2; // "<?"
-        nLength += calcXMLByteLength(aTarget, false, false);
+        nLength += m_pSaxWriterHelper->calcXMLByteLength(aTarget, false, false);
 
         nLength += 1; // " "
 
-        nLength += calcXMLByteLength(aData, false, false);
+        nLength += m_pSaxWriterHelper->calcXMLByteLength(aData, false, false);
 
         nLength += 2; // "?>"
     }
@@ -1355,7 +1427,7 @@ void SAXWriter::comment(const OUString& sComment)
     if (m_bAllowLineBreak)
     {
         nLength = 4; // "<!--"
-        nLength += calcXMLByteLength(sComment, false, false);
+        nLength += m_pSaxWriterHelper->calcXMLByteLength(sComment, false, false);
 
         nLength += 3;
     }
@@ -1398,7 +1470,7 @@ void SAXWriter::unknown(const OUString& sString)
 
     sal_Int32 nLength(0);
     if (m_bAllowLineBreak)
-        nLength = calcXMLByteLength(sString, false, false);
+        nLength = m_pSaxWriterHelper->calcXMLByteLength(sString, false, false);
 
     sal_Int32 nPrefix = getIndentPrefixLength(nLength);
     if (nPrefix >= 0)


More information about the Libreoffice-commits mailing list