[Libreoffice-commits] core.git: FastSaxSerializer: SAL_WARN() when writing invalid XML characters

Stephan Bergmann sbergman at redhat.com
Wed Mar 1 09:34:04 UTC 2017


(1)  If the input is assumed to be an arbitrary sequence of Unicode 
scalar values (i.e., may contain noncharacters, even despite the caveat 
that those should never be interchanged), the below invalidChar handling 
might want to also watch out for U+FFFE and U+FFFF.  (See 
writeValueContent in configmgr/source/writemodfile.cxx.  Somewhat oddly, 
XML 1.0 excludes those two noncharacters from its Char definition while 
not excluding any other Unicode noncharacters, U+FDD0..U+FDEF and 
U+nFFFE..U+nFFFF for n in 1..10 hex.)

On 02/28/2017 10:30 PM, Eike Rathke wrote:
> commit baca2ec8d5a457512e25b499c3cacc7a66ca853f
> Author: Eike Rathke <erack at redhat.com>
> Date:   Tue Feb 28 22:14:08 2017 +0100
>
>     FastSaxSerializer: SAL_WARN() when writing invalid XML characters
>
>     This catches things for OOXML, that could be escaped using _xHHHH_
>
>     Change-Id: I937f67dc5edd3c0e5727d74bebb736dc82bdc53d
>
> diff --git a/sax/source/tools/fastserializer.cxx b/sax/source/tools/fastserializer.cxx
> index 620fe68..a571829 100644
> --- a/sax/source/tools/fastserializer.cxx
> +++ b/sax/source/tools/fastserializer.cxx
> @@ -101,6 +101,26 @@ namespace sax_fastparser {
>          write( sOutput.getStr(), sOutput.getLength(), bEscape );
>      }
>
> +#if OSL_DEBUG_LEVEL > 0
> +    /** Characters not allowed in XML 1.0
> +        XML 1.1 would exclude only U+0000
> +     */
> +    bool invalidChar( char c )
> +    {
> +        if (static_cast<unsigned char>(c) >= 0x20)
> +            return false;
> +
> +        switch (c)
> +        {
> +            case 0x09:
> +            case 0x0a:
> +            case 0x0d:
> +                return false;
> +        }
> +        return true;
> +    }
> +#endif
> +
>      void FastSaxSerializer::write( const char* pStr, sal_Int32 nLen, bool bEscape )
>      {
>          if (nLen == -1)
> @@ -112,6 +132,7 @@ namespace sax_fastparser {
>              return;
>          }
>
> +        bool bGood = true;
>          for (sal_Int32 i = 0; i < nLen; ++i)
>          {
>              char c = pStr[ i ];
> @@ -124,9 +145,26 @@ namespace sax_fastparser {
>                  case '"':   writeBytes( "&quot;", 6 );   break;
>                  case '\n':  writeBytes( "&#10;", 5 );    break;
>                  case '\r':  writeBytes( "&#13;", 5 );    break;
> -                default:    writeBytes( &c, 1 );          break;
> +                default:
> +#if OSL_DEBUG_LEVEL > 0
> +                            /* FIXME: we should escape such invalid characters
> +                             * in the _xHHHH_ form OOXML uses. Note that also a
> +                             * literal "_x0008_" would have to be escaped then
> +                             * as _x005F_x0008_ (where only the leading '_' is
> +                             * escaped as _x005F_). */
> +                            if (invalidChar(pStr[i]))
> +                            {
> +                                bGood = false;
> +                                // The SAL_WARN() for the single character is
> +                                // issued in writeBytes(), just gather for the
> +                                // SAL_WARN_IF() below.
> +                            }
> +#endif
> +                            writeBytes( &c, 1 );         break;
>              }
>          }
> +        SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min<sal_Int32>(nLen,42)) << "'");
> +        (void)bGood;
>      }
>
>      void FastSaxSerializer::endDocument()
> @@ -496,6 +534,21 @@ namespace sax_fastparser {
>
>      void FastSaxSerializer::writeBytes( const char* pStr, size_t nLen )
>      {
> +#if OSL_DEBUG_LEVEL > 0
> +        {
> +            bool bGood = true;
> +            for (size_t i=0; i < nLen; ++i)
> +            {
> +                if (invalidChar(pStr[i]))
> +                {
> +                    bGood = false;
> +                    SAL_WARN("sax", "FastSaxSerializer::writeBytes - illegal XML character 0x" <<
> +                            std::hex << int(static_cast<unsigned char>(pStr[i])));
> +                }
> +            }
> +            SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min<sal_Int32>(nLen,42)) << "'");
> +        }
> +#endif
>          maCachedOutputStream.writeBytes( reinterpret_cast<const sal_Int8*>(pStr), nLen );
>      }
>



More information about the LibreOffice mailing list