[Libreoffice-commits] core.git: sax/source

Eike Rathke erack at redhat.com
Fri Mar 3 16:27:51 UTC 2017


 sax/source/tools/fastserializer.cxx |  127 +++++++++++++++++++++++++++++++-----
 sax/source/tools/fastserializer.hxx |    3 
 2 files changed, 115 insertions(+), 15 deletions(-)

New commits:
commit 8b25b67d5268abbb260da968cc23b6f6c8dd31af
Author: Eike Rathke <erack at redhat.com>
Date:   Thu Mar 2 17:06:54 2017 +0100

    escape invalid XML characters with _xHHHH_ when writing escaped
    
    As defined in OOXML, see code comments.
    
    Change-Id: I8ce0075790f2d4ef6227a9474c68466e0793dce2
    Reviewed-on: https://gerrit.libreoffice.org/34824
    Reviewed-by: Eike Rathke <erack at redhat.com>
    Tested-by: Jenkins <ci at libreoffice.org>

diff --git a/sax/source/tools/fastserializer.cxx b/sax/source/tools/fastserializer.cxx
index a571829..1424d1e 100644
--- a/sax/source/tools/fastserializer.cxx
+++ b/sax/source/tools/fastserializer.cxx
@@ -59,6 +59,7 @@ namespace sax_fastparser {
         , mbMarkStackEmpty(true)
         , mpDoubleStr(nullptr)
         , mnDoubleStrCapacity(RTL_STR_MAX_VALUEOFDOUBLE)
+        , mbXescape(true)
     {
         rtl_string_new_WithLength(&mpDoubleStr, mnDoubleStrCapacity);
         mxFastTokenHandler = css::xml::sax::FastTokenHandler::create(
@@ -101,7 +102,6 @@ namespace sax_fastparser {
         write( sOutput.getStr(), sOutput.getLength(), bEscape );
     }
 
-#if OSL_DEBUG_LEVEL > 0
     /** Characters not allowed in XML 1.0
         XML 1.1 would exclude only U+0000
      */
@@ -119,7 +119,11 @@ namespace sax_fastparser {
         }
         return true;
     }
-#endif
+
+    bool isHexDigit( char c )
+    {
+        return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f');
+    }
 
     void FastSaxSerializer::write( const char* pStr, sal_Int32 nLen, bool bEscape )
     {
@@ -133,6 +137,9 @@ namespace sax_fastparser {
         }
 
         bool bGood = true;
+        const sal_Int32 kXescapeLen = 7;
+        char bufXescape[kXescapeLen+1];
+        sal_Int32 nNextXescape = 0;
         for (sal_Int32 i = 0; i < nLen; ++i)
         {
             char c = pStr[ i ];
@@ -143,24 +150,114 @@ namespace sax_fastparser {
                 case '&':   writeBytes( "&amp;", 5 );    break;
                 case '\'':  writeBytes( "&apos;", 6 );   break;
                 case '"':   writeBytes( "&quot;", 6 );   break;
-                case '\n':  writeBytes( "&#10;", 5 );    break;
-                case '\r':  writeBytes( "&#13;", 5 );    break;
+#if 0
+                case '\t':
+                            // Seems OOXML prefers the _xHHHH_ escape over the
+                            // entity in *some* cases, apparently in attribute
+                            // values but not in element data.
+                            // Would need to distinguish at a higher level.
+                            if (mbXescape)
+                            {
+                                snprintf( bufXescape, kXescapeLen+1, "_x%04x_",
+                                        static_cast<unsigned int>(static_cast<unsigned char>(c)));
+                                writeBytes( bufXescape, kXescapeLen);
+                            }
+                            else
+                            {
+                                // We did never write this, but literal tab
+                                // instead. Should we?
+                                writeBytes( "&#9;", 4 );
+                            }
+                break;
+#endif
+                case '\n':
+#if 0
+                            if (mbXescape)
+                            {
+                                snprintf( bufXescape, kXescapeLen+1, "_x%04x_",
+                                        static_cast<unsigned int>(static_cast<unsigned char>(c)));
+                                writeBytes( bufXescape, kXescapeLen);
+                            }
+                            else
+#endif
+                            {
+                                writeBytes( "&#10;", 5 );
+                            }
+                break;
+                case '\r':
+#if 0
+                            if (mbXescape)
+                            {
+                                snprintf( bufXescape, kXescapeLen+1, "_x%04x_",
+                                        static_cast<unsigned int>(static_cast<unsigned char>(c)));
+                                writeBytes( bufXescape, kXescapeLen);
+                            }
+                            else
+#endif
+                            {
+                                writeBytes( "&#13;", 5 );
+                            }
+                break;
                 default:
+                            if (mbXescape)
+                            {
+                                // Escape characters not valid in XML 1.0 as
+                                // _xHHHH_. A literal "_xHHHH_" has to be
+                                // escaped as _x005F_xHHHH_ (effectively
+                                // escaping the leading '_').
+                                // See ECMA-376-1:2016 page 3736,
+                                // 22.4.2.4 bstr (Basic String)
+                                // for reference.
+                                if (c == '_' && i >= nNextXescape && i <= nLen - kXescapeLen &&
+                                        pStr[i+6] == '_' &&
+                                        ((pStr[i+1] | 0x20) == 'x') &&
+                                        isHexDigit( pStr[i+2] ) &&
+                                        isHexDigit( pStr[i+3] ) &&
+                                        isHexDigit( pStr[i+4] ) &&
+                                        isHexDigit( pStr[i+5] ))
+                                {
+                                    // OOXML has the odd habit to write some
+                                    // names using this that when re-saving
+                                    // should *not* be escaped, specifically
+                                    // _x0020_ for blanks in w:xpath values.
+                                    if (strncmp( pStr+i+2, "0020", 4) != 0)
+                                    {
+                                        writeBytes( "_x005F_", kXescapeLen);
+                                        // Remember this escapement so in
+                                        // _xHHHH_xHHHH_ only the first '_' is
+                                        // escaped.
+                                        nNextXescape = i + kXescapeLen;
+                                        break;
+                                    }
+                                }
+                                if (invalidChar(c))
+                                {
+                                    snprintf( bufXescape, kXescapeLen+1, "_x%04x_",
+                                            static_cast<unsigned int>(static_cast<unsigned char>(c)));
+                                    writeBytes( bufXescape, kXescapeLen);
+                                    break;
+                                }
+                                /* TODO: also U+FFFE and U+FFFF are not allowed
+                                 * in XML 1.0, assuming we're writing UTF-8
+                                 * those should be escaped as well to be
+                                 * conformant. Likely that would involve
+                                 * scanning for both encoded sequences and
+                                 * write as _xHHHH_? */
+                            }
 #if OSL_DEBUG_LEVEL > 0
-                            /* FIXME: we should escape such invalid characters
-                             * in the _xHHHH_ form OOXML uses. Note that also a
-                             * literal "_x0008_" would have to be escaped then
-                             * as _x005F_x0008_ (where only the leading '_' is
-                             * escaped as _x005F_). */
-                            if (invalidChar(pStr[i]))
+                            else
                             {
-                                bGood = false;
-                                // The SAL_WARN() for the single character is
-                                // issued in writeBytes(), just gather for the
-                                // SAL_WARN_IF() below.
+                                if (bGood && invalidChar(pStr[i]))
+                                {
+                                    bGood = false;
+                                    // The SAL_WARN() for the single character is
+                                    // issued in writeBytes(), just gather for the
+                                    // SAL_WARN_IF() below.
+                                }
                             }
 #endif
-                            writeBytes( &c, 1 );         break;
+                            writeBytes( &c, 1 );
+                break;
             }
         }
         SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min<sal_Int32>(nLen,42)) << "'");
diff --git a/sax/source/tools/fastserializer.hxx b/sax/source/tools/fastserializer.hxx
index 482d10d..ca8b674 100644
--- a/sax/source/tools/fastserializer.hxx
+++ b/sax/source/tools/fastserializer.hxx
@@ -228,6 +228,9 @@ private:
     rtl_String *mpDoubleStr;
     sal_Int32 mnDoubleStrCapacity;
     TokenValueList maTokenValues;
+    bool mbXescape;     ///< whether to escape invalid XML characters as _xHHHH_ in write(const char*,sal_Int32,true)
+                        /* TODO: make that configurable from the outside for
+                         * some specific cases? */
 
 #ifdef DBG_UTIL
     std::stack<sal_Int32> m_DebugStartedElements;


More information about the Libreoffice-commits mailing list