[Libreoffice-commits] core.git: Branch 'distro/collabora/cd-5.3' - include/rtl sw/qa writerfilter/source

Mike Kaganski mike.kaganski at collabora.com
Wed Nov 22 15:27:36 UTC 2017


 include/rtl/ustring.h                                 |    4 +-
 include/rtl/ustring.hxx                               |    4 +-
 sw/qa/extras/ooxmlexport/data/tdf111964.docx          |binary
 sw/qa/extras/ooxmlexport/ooxmlexport9.cxx             |   10 ++++++
 writerfilter/source/ooxml/OOXMLFastContextHandler.cxx |   28 +++++++++++++++++-
 5 files changed, 43 insertions(+), 3 deletions(-)

New commits:
commit f3af5926f93f46e96caa6b85e08723f150c94463
Author: Mike Kaganski <mike.kaganski at collabora.com>
Date:   Wed Aug 23 09:09:57 2017 +0300

    tdf#111964: only trim XML whitespace
    
    OUString::trim() uses rtl_uString_newTrim, which relies upon
    rtl_ImplIsWhitespace. The latter treats as whitespace not only
    characters with values less than or equal to 32, but also Unicode
    General Punctuation area Space and some Control characters. Thus,
    using OUString::trim() is incorrect when the goal is to trim XML
    whitespace, which is defined as one of 0x09, 0x0A, 0x0D, 0x20.
    
    The comments for OUString::trim() and rtl_uString_newTrim are
    corrected to describe which characters are considered whitespace.
    
    A unit test is included.
    
    Change-Id: I45a132be923a52dcd5a4c35aeecb53d423b49fec
    Reviewed-on: https://gerrit.libreoffice.org/41444
    Reviewed-by: Mike Kaganski <mike.kaganski at collabora.com>
    Tested-by: Mike Kaganski <mike.kaganski at collabora.com>
    (cherry picked from commit 5b518ab051cc04e672ceb01da42b06625a1a4ce9)
    Reviewed-on: https://gerrit.libreoffice.org/44758
    Reviewed-by: Aron Budea <aron.budea at collabora.com>
    Tested-by: Aron Budea <aron.budea at collabora.com>

diff --git a/include/rtl/ustring.h b/include/rtl/ustring.h
index 831ecd66d9be..50dbd75a5ecc 100644
--- a/include/rtl/ustring.h
+++ b/include/rtl/ustring.h
@@ -2023,7 +2023,9 @@ SAL_DLLPUBLIC void SAL_CALL rtl_uString_newToAsciiUpperCase(
     string.
 
     The new string results from removing all characters with values less than
-    or equal to 32 (the space character) form both ends of str.
+    or equal to 32 (the space character), and also Unicode General Punctuation
+    area Space and some Control characters, from both ends of str (see
+    rtl_ImplIsWhitespace).
 
     This function cannot be used for language-specific conversion.  The new
     string does not necessarily have a reference count of 1 (in cases where
diff --git a/include/rtl/ustring.hxx b/include/rtl/ustring.hxx
index 337e8509a53d..bc87c2936eef 100644
--- a/include/rtl/ustring.hxx
+++ b/include/rtl/ustring.hxx
@@ -2635,7 +2635,9 @@ public:
       of the string.
 
       All characters that have codes less than or equal to
-      32 (the space character) are considered to be white space.
+      32 (the space character), and Unicode General Punctuation area Space
+      and some Control characters are considered to be white space (see
+      rtl_ImplIsWhitespace).
       If the string doesn't contain white spaces at both ends,
       then the new string is assigned with str.
 
diff --git a/sw/qa/extras/ooxmlexport/data/tdf111964.docx b/sw/qa/extras/ooxmlexport/data/tdf111964.docx
new file mode 100644
index 000000000000..7cb85a1d87df
Binary files /dev/null and b/sw/qa/extras/ooxmlexport/data/tdf111964.docx differ
diff --git a/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx b/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx
index b2c8a417109c..8c1537ffc268 100644
--- a/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx
+++ b/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx
@@ -384,6 +384,16 @@ DECLARE_OOXMLEXPORT_TEST(testTdf107684, "tdf107684.odt")
         assertXPath(pXmlDoc, "//w:style[@w:styleId='Heading1']/w:pPr/w:outlineLvl", 1);
 }
 
+DECLARE_OOXMLEXPORT_TEST(testTdf111964, "tdf111964.docx")
+{
+    xmlDocPtr pXmlDoc = parseExport("word/document.xml");
+    if (!pXmlDoc)
+        return;
+    // Unicode spaces that are not XML whitespace must not be trimmed
+    const sal_Unicode sWSReference [] { 0x2002, 0x2002, 0x2002, 0x2002, 0x2002, 0 };
+    assertXPathContent(pXmlDoc, "/w:document/w:body/w:p/w:r[4]/w:t", sWSReference);
+}
+
 CPPUNIT_PLUGIN_IMPLEMENT();
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
index bb59ed9bebdc..1485ce0ed177 100644
--- a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
+++ b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
@@ -620,6 +620,32 @@ void OOXMLFastContextHandler::endTxbxContent()
     mpParserState->endTxbxContent();
 }
 
+namespace {
+// XML schema defines white space as one of four characters:
+// #x9 (tab), #xA (line feed), #xD (carriage return), and #x20 (space)
+bool IsXMLWhitespace(sal_Unicode cChar)
+{
+    return cChar == 0x9 || cChar == 0xA || cChar == 0xD || cChar == 0x20;
+}
+
+OUString TrimXMLWhitespace(const OUString & sText)
+{
+    sal_Int32 nTrimmedStart = 0;
+    const sal_Int32 nLen = sText.getLength();
+    sal_Int32 nTrimmedEnd = nLen - 1;
+    while (nTrimmedStart < nLen && IsXMLWhitespace(sText[nTrimmedStart]))
+        ++nTrimmedStart;
+    while (nTrimmedStart <= nTrimmedEnd && IsXMLWhitespace(sText[nTrimmedEnd]))
+        --nTrimmedEnd;
+    if ((nTrimmedStart == 0) && (nTrimmedEnd == nLen - 1))
+        return sText;
+    else if (nTrimmedStart > nTrimmedEnd)
+        return OUString();
+    else
+        return sText.copy(nTrimmedStart, nTrimmedEnd-nTrimmedStart+1);
+}
+}
+
 void OOXMLFastContextHandler::text(const OUString & sText)
 {
     if (isForwardEvents())
@@ -631,7 +657,7 @@ void OOXMLFastContextHandler::text(const OUString & sText)
         // tabs are converted to spaces
         if (!IsPreserveSpace())
         {
-            sNormalizedText = sNormalizedText.trim().replaceAll("\t", " ");
+            sNormalizedText = TrimXMLWhitespace(sNormalizedText).replaceAll("\t", " ");
         }
         mpStream->utext(reinterpret_cast < const sal_uInt8 * >
                         (sNormalizedText.getStr()),


More information about the Libreoffice-commits mailing list