[Libreoffice-commits] core.git: include/rtl include/svtools svtools/source sw/qa

Sat Feb 13 08:05:36 UTC 2016

include/rtl/character.hxx              |   13 ++++++++
 include/svtools/svparser.hxx           |    4 +-
 svtools/source/svhtml/parhtml.cxx      |   53 ++++++++++++++++++---------------
 svtools/source/svrtf/parrtf.cxx        |    6 +--
 svtools/source/svrtf/svparser.cxx      |   29 ++++++++++++------
 sw/qa/extras/htmlexport/data/extb.html |   10 ++++++
 sw/qa/extras/htmlexport/htmlexport.cxx |   13 ++++++++
 7 files changed, 90 insertions(+), 38 deletions(-)

New commits:
commit 4647e778993250b8c9431e2890750916fb986ecc
Author: Mark Hung <marklh9 at gmail.com>
Date:   Sun Dec 27 00:46:49 2015 +0800

    tdf#81129 Support reading non-BMP characters in HTML documents.
    
    1. Allow character entity ( &#nnnn; ) to exceed 0xffff in HTMLParser::ScanText()
    2. Return a character as sal_uInt32 ( utf32 ) instead of sal_Unicode ( utf16 )
       from SvParser::GetNextChar().
    
    Conflicts:
    	sw/qa/extras/htmlexport/htmlexport.cxx
    
    Change-Id: Ida455040970fae800f0f11471b27f53461fb78e4
    Reviewed-on: https://gerrit.libreoffice.org/21152
    Tested-by: Jenkins <ci at libreoffice.org>
    Reviewed-by: Mark Hung <marklh9 at gmail.com>

diff --git a/include/rtl/character.hxx b/include/rtl/character.hxx
index a3d09b9..49f6803 100644
--- a/include/rtl/character.hxx
+++ b/include/rtl/character.hxx
@@ -222,6 +222,19 @@ sal_uInt32 const surrogatesLowLast = 0xDFFF;
 }
 /// @endcond
 
+/** Check whether a value lies in the code space reachable via UTF-16 (cf. RFC 3629).
+
+    @param code  The value to test; it may be any code point, BMP or not.
+
+    @return  True if code <= 0x10FFFF. Surrogate values are not rejected here.
+
+    @since LibreOffice 5.2
+*/
+inline bool isValidCodePoint( sal_uInt32 code)
+{
+    return code <= 0x10FFFF;
+}
+
 /** Check for high surrogate.
 
     @param code  A Unicode code point.
diff --git a/include/svtools/svparser.hxx b/include/svtools/svparser.hxx
index 3f60a40..cfbd115 100644
--- a/include/svtools/svparser.hxx
+++ b/include/svtools/svparser.hxx
@@ -59,7 +59,7 @@ protected:
     rtl_TextEncoding    eSrcEnc;        // Source encoding
 
     sal_uLong nNextChPos;
-    sal_Unicode nNextCh;                // current character for the "lex"
+    sal_uInt32 nNextCh;                // current character codepoint in UTF32 for the "lex"
 
 
     bool                bDownloadingFile : 1; // true: An external file is
@@ -128,7 +128,7 @@ public:
     inline void         SetLineNr( sal_uLong nlNum );           // inline bottom
     inline void         SetLinePos( sal_uLong nlPos );          // inline bottom
 
-    sal_Unicode GetNextChar();
+    sal_uInt32 GetNextChar();   // Return next Unicode codepoint in UTF32.
     void RereadLookahead();
 
     inline bool IsParserWorking() const { return SVPAR_WORKING == eState; }
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
index 801e4e0..a8eff6d 100644
--- a/svtools/source/svhtml/parhtml.cxx
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -25,6 +25,7 @@
 #include <tools/color.hxx>
 #include <rtl/ustrbuf.hxx>
 #include <rtl/strbuf.hxx>
+#include <rtl/character.hxx>
 
 #include <tools/tenccvt.hxx>
 #include <tools/datetime.hxx>
@@ -429,7 +430,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
     OUStringBuffer sTmpBuffer( MAX_LEN );
     bool bContinue = true;
     bool bEqSignFound = false;
-    sal_Unicode cQuote = 0U;
+    sal_uInt32  cQuote = 0U;
 
     while( bContinue && IsParserWorking() )
     {
@@ -445,7 +446,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
                 sal_uLong nStreamPos = rInput.Tell();
                 sal_uLong nLinePos = GetLinePos();
 
-                sal_Unicode cChar = 0U;
+                sal_uInt32 cChar = 0U;
                 if( '#' == (nNextCh = GetNextChar()) )
                 {
                     nNextCh = GetNextChar();
@@ -460,10 +461,10 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
                             {
                                 cChar = cChar * 16U +
                                         ( nNextCh <= '9'
-                                          ? sal_Unicode( nNextCh - '0' )
+                                          ? sal_uInt32( nNextCh - '0' )
                                           : ( nNextCh <= 'F'
-                                              ? sal_Unicode( nNextCh - 'A' + 10 )
-                                              : sal_Unicode( nNextCh - 'a' + 10 ) ) );
+                                              ? sal_uInt32( nNextCh - 'A' + 10 )
+                                              : sal_uInt32( nNextCh - 'a' + 10 ) ) );
                                 nNextCh = GetNextChar();
                             }
                         }
@@ -471,7 +472,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
                         {
                             do
                             {
-                                cChar = cChar * 10U + sal_Unicode( nNextCh - '0');
+                                cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
                                 nNextCh = GetNextChar();
                             }
                             while( HTML_ISDIGIT(nNextCh) );
@@ -500,6 +501,9 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
                     }
                     else
                         nNextCh = 0U;
+
+                    if ( ! rtl::isValidCodePoint( cChar ) )
+                        cChar = '?';
                 }
                 else if( HTML_ISALPHA( nNextCh ) )
                 {
@@ -507,7 +511,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
                     sal_Int32 nPos = 0L;
                     do
                     {
-                        sEntityBuffer.append( nNextCh );
+                        sEntityBuffer.appendUtf32( nNextCh );
                         nPos++;
                         nNextCh = GetNextChar();
                     }
@@ -637,7 +641,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
                 if( IsParserWorking() )
                 {
                     if( cChar )
-                        sTmpBuffer.append( cChar );
+                        sTmpBuffer.appendUtf32( cChar );
                 }
                 else if( SVPAR_PENDING==eState && '>'!=cBreak )
                 {
@@ -661,7 +665,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
         case '=':
             if( '>'==cBreak && !cQuote )
                 bEqSignFound = true;
-            sTmpBuffer.append( nNextCh );
+            sTmpBuffer.appendUtf32( nNextCh );
             break;
 
         case '\\':
@@ -684,7 +688,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
                 else if( cQuote && (cQuote==nNextCh ) )
                     cQuote = 0U;
             }
-            sTmpBuffer.append( nNextCh );
+            sTmpBuffer.appendUtf32( nNextCh );
             bEqSignFound = false;
             break;
 
@@ -695,14 +699,15 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
             }
             else
             {
-                sTmpBuffer.append( nNextCh );
+                sTmpBuffer.appendUtf32( nNextCh );
             }
+
             break;
 
         case '<':
             bEqSignFound = false;
             if( '>'==cBreak )
-                sTmpBuffer.append( nNextCh );
+                sTmpBuffer.appendUtf32( nNextCh );
             else
                 bContinue = false;      // break, String zusammen
             break;
@@ -725,7 +730,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
             if( '>'==cBreak )
             {
                 // cr/lf in tag is handled in _GetNextToken()
-                sTmpBuffer.append( nNextCh );
+                sTmpBuffer.appendUtf32( nNextCh );
                 break;
             }
             else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
@@ -752,7 +757,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
             nNextCh = ' ';
             // no break;
         case ' ':
-            sTmpBuffer.append( nNextCh );
+            sTmpBuffer.appendUtf32( nNextCh );
             if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
                                 !bReadPRE && !bReadTextArea) )
             {
@@ -787,7 +792,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
             {
                 do {
                     // All remaining characters make their way into the text.
-                    sTmpBuffer.append( nNextCh );
+                    sTmpBuffer.appendUtf32( nNextCh );
                     if( MAX_LEN == sTmpBuffer.getLength() )
                     {
                         aToken += sTmpBuffer.makeStringAndClear();
@@ -864,7 +869,7 @@ int HTMLParser::_GetNextRawToken()
                 }
                 else if( '!' == nNextCh )
                 {
-                    sTmpBuffer.append( nNextCh );
+                    sTmpBuffer.appendUtf32( nNextCh );
                     nNextCh = GetNextChar();
                 }
 
@@ -872,7 +877,7 @@ int HTMLParser::_GetNextRawToken()
                 while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
                        IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
                 {
-                    sTmpBuffer.append( nNextCh );
+                    sTmpBuffer.appendUtf32( nNextCh );
                     nNextCh = GetNextChar();
                 }
 
@@ -959,7 +964,7 @@ int HTMLParser::_GetNextRawToken()
             }
             break;
         case '-':
-            sTmpBuffer.append( nNextCh );
+            sTmpBuffer.appendUtf32( nNextCh );
             if( bReadComment )
             {
                 bool bTwoMinus = false;
@@ -970,7 +975,7 @@ int HTMLParser::_GetNextRawToken()
 
                     if( MAX_LEN == sTmpBuffer.getLength() )
                         aToken += sTmpBuffer.makeStringAndClear();
-                    sTmpBuffer.append( nNextCh );
+                    sTmpBuffer.appendUtf32( nNextCh );
                     nNextCh = GetNextChar();
                 }
 
@@ -1015,7 +1020,7 @@ int HTMLParser::_GetNextRawToken()
             // no break
         default:
             // all remaining characters are appended to the buffer
-            sTmpBuffer.append( nNextCh );
+            sTmpBuffer.appendUtf32( nNextCh );
             break;
         }
 
@@ -1095,7 +1100,7 @@ int HTMLParser::_GetNextToken()
                 {
                     OUStringBuffer sTmpBuffer;
                     do {
-                        sTmpBuffer.append( nNextCh );
+                        sTmpBuffer.appendUtf32( nNextCh );
                         if( MAX_LEN == sTmpBuffer.getLength() )
                             aToken += sTmpBuffer.makeStringAndClear();
                         nNextCh = GetNextChar();
@@ -1166,10 +1171,10 @@ int HTMLParser::_GetNextToken()
                                 }
                                 bDone = aToken.endsWith( "--" );
                                 if( !bDone )
-                                aToken += OUString(nNextCh);
+                                aToken += OUString(&nNextCh,1);
                             }
                             else
-                                aToken += OUString(nNextCh);
+                                aToken += OUString(&nNextCh,1);
                             if( !bDone )
                                 nNextCh = GetNextChar();
                         }
@@ -1261,7 +1266,7 @@ int HTMLParser::_GetNextToken()
                             bDone = '>'==nNextCh && aToken.endsWith("%");
                             if( !bDone )
                             {
-                                aToken += OUString(nNextCh);
+                                aToken += OUString(&nNextCh,1);
                                 nNextCh = GetNextChar();
                             }
                         }
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx
index f6f75eb..bdc73d3 100644
--- a/svtools/source/svrtf/parrtf.cxx
+++ b/svtools/source/svrtf/parrtf.cxx
@@ -191,7 +191,7 @@ int SvRTFParser::_GetNextToken()
                                 // can be also \{, \}, \'88
                                 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                                 {
-                                    sal_Unicode cAnsi = nNextCh;
+                                    sal_uInt32 cAnsi = nNextCh;
                                     while( 0xD == cAnsi )
                                         cAnsi = GetNextChar();
                                     while( 0xA == cAnsi )
@@ -382,7 +382,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak )
                 case '}':
                 case '{':
                 case '+':       // I found in a RTF file
-                    aStrBuffer.append(nNextCh);
+                    aStrBuffer.append(sal_Unicode(nNextCh));
                     break;
                 case '~':       // nonbreaking space
                     aStrBuffer.append(static_cast< sal_Unicode >(0xA0));
@@ -484,7 +484,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak )
             {
                 do {
                     // all other characters end up in the text
-                    aStrBuffer.append(nNextCh);
+                    aStrBuffer.appendUtf32(nNextCh);
 
                     if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
                     {
diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx
index b5c377b..b862e66 100644
--- a/svtools/source/svrtf/svparser.cxx
+++ b/svtools/source/svrtf/svparser.cxx
@@ -22,6 +22,7 @@
 #include <tools/debug.hxx>
 #include <rtl/textcvt.h>
 #include <rtl/tencinfo.h>
+#include <rtl/character.hxx>
 
 #include <vector>
 
@@ -35,7 +36,7 @@ struct SvParser_Impl
     long            nTokenValue;        // extra value (RTF)
     bool        bTokenHasValue;     // indicates whether nTokenValue is valid
     int             nToken;             // actual Token
-    sal_Unicode     nNextCh;            // actual character
+    sal_uInt32      nNextCh;            // actual character
     int             nSaveToken;         // the token from Continue
 
     rtl_TextToUnicodeConverter hConv;
@@ -148,9 +149,9 @@ void SvParser::RereadLookahead()
     nNextCh = GetNextChar();
 }
 
-sal_Unicode SvParser::GetNextChar()
+sal_uInt32 SvParser::GetNextChar()
 {
-    sal_Unicode c = 0U;
+    sal_uInt32 c = 0U;
 
     // When reading multiple bytes, we don't have to care about the file
     // position when we run into the pending state. The file position is
@@ -257,7 +258,7 @@ sal_Unicode SvParser::GetNextChar()
                    )
                 {
                     // no convserion shall take place
-                    c = (sal_Unicode)c1;
+                    c = reinterpret_cast<sal_uChar&>( c1 );
                     nChars = 1;
                 }
                 else
@@ -280,6 +281,7 @@ sal_Unicode SvParser::GetNextChar()
                         // read enough characters.
                         if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) )
                         {
+                            sal_Unicode sCh[2];
                             while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL) != 0 )
                             {
                                 rInput.ReadChar( c1 );
@@ -289,7 +291,7 @@ sal_Unicode SvParser::GetNextChar()
 
                                 nChars = rtl_convertTextToUnicode(
                                             pImplData->hConv, pImplData->hContext,
-                                            &c1, 1, &cUC, 1,
+                                            &c1, 1, sCh , 2,
                                             RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
                                             RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
                                             RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
@@ -299,7 +301,11 @@ sal_Unicode SvParser::GetNextChar()
                             {
                                 if( 1 == nChars && 0 == nInfo )
                                 {
-                                    c = cUC;
+                                    c = sal_uInt32( sCh[0] );
+                                }
+                                else if( 2 == nChars && 0 == nInfo )
+                                {
+                                    c = rtl::combineSurrogates( sCh[0], sCh[1] );
                                 }
                                 else if( 0 != nChars || 0 != nInfo )
                                 {
@@ -311,7 +317,7 @@ sal_Unicode SvParser::GetNextChar()
                                        "there is a converted character, but an error" );
                                     // There are still errors, but nothing we can
                                     // do
-                                    c = (sal_Unicode)'?';
+                                    c = (sal_uInt32)'?';
                                     nChars = 1;
                                 }
                             }
@@ -356,7 +362,7 @@ sal_Unicode SvParser::GetNextChar()
 
                                     // There are still errors, so we use the first
                                     // character and restart after that.
-                                    c = (sal_Unicode)sBuffer[0];
+                                    c = reinterpret_cast<sal_uChar&>( sBuffer[0] );
                                     rInput.SeekRel( -(nLen-1) );
                                     nChars = 1;
                                 }
@@ -378,7 +384,7 @@ sal_Unicode SvParser::GetNextChar()
                                 "there is no converted character and no error" );
                         // #73398#: If the character could not be converted,
                         // because a conversion is not available, do no conversion at all.
-                        c = (sal_Unicode)c1;
+                        c = reinterpret_cast<sal_uChar&>( c1 );
                         nChars = 1;
 
                     }
@@ -387,6 +393,10 @@ sal_Unicode SvParser::GetNextChar()
         }
         while( 0 == nChars  && !bErr );
     }
+
+    if ( ! rtl::isValidCodePoint( c ) )
+        c = (sal_uInt32) '?' ;
+
     if( bErr )
     {
         if( ERRCODE_IO_PENDING == rInput.GetError() )
@@ -405,6 +415,7 @@ sal_Unicode SvParser::GetNextChar()
     }
     else
         IncLinePos();
+
     return c;
 }
 
diff --git a/sw/qa/extras/htmlexport/data/extb.html b/sw/qa/extras/htmlexport/data/extb.html
new file mode 100644
index 0000000..be73fea
--- /dev/null
+++ b/sw/qa/extras/htmlexport/data/extb.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="UTF-8"/>
+</head>
+<body>
+<p>𤭢</p>
+<p>&#x24b62;</p>
+</body>
+</html>
diff --git a/sw/qa/extras/htmlexport/htmlexport.cxx b/sw/qa/extras/htmlexport/htmlexport.cxx
index f951a0a..69b6b7d 100644
--- a/sw/qa/extras/htmlexport/htmlexport.cxx
+++ b/sw/qa/extras/htmlexport/htmlexport.cxx
@@ -272,6 +272,19 @@ DECLARE_HTMLEXPORT_TEST(testTdf83890, "tdf83890.odt")
     assertXPath(pDoc, "/html/body/ol[2]/ol", "start", "2");
 }
 
+DECLARE_HTMLEXPORT_TEST(testExtbChars, "extb.html")
+{
+    sal_uInt32  nCh = 0x24b62; // code point above 0xFFFF, i.e. outside the BMP
+    OUString aExpected( &nCh, 1); // expected text: that one code point as an OUString
+    // Paragraph 1 of extb.html contains the character literally (file declares charset UTF-8)
+    uno::Reference<text::XTextRange> xTextRange1 = getRun(getParagraph(1), 1);
+    CPPUNIT_ASSERT_EQUAL(aExpected, xTextRange1->getString());
+
+    // Paragraph 2 of extb.html contains the same character as the entity &#x24b62;
+    uno::Reference<text::XTextRange> xTextRange2 = getRun(getParagraph(2), 1);
+    CPPUNIT_ASSERT_EQUAL(aExpected, xTextRange2->getString());
+}
+
 CPPUNIT_PLUGIN_IMPLEMENT();
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */