[Libreoffice-commits] core.git: 2 commits - include/svtools sc/qa svtools/source

Jan Holesovsky kendy at collabora.com
Thu Nov 3 22:16:43 UTC 2016


 include/svtools/htmlkywd.hxx         |    1 +
 include/svtools/htmltokn.h           |    1 +
 sc/qa/unit/bugfix-test.cxx           |   25 +++++++++++++++++++++++++
 sc/qa/unit/data/html/tdf88821-2.html |   19 +++++++++++++++++++
 sc/qa/unit/data/html/tdf88821.html   |   23 +++++++++++++++++++++++
 svtools/source/svhtml/htmlkywd.cxx   |    1 +
 svtools/source/svhtml/parhtml.cxx    |    5 ++++-
 svtools/source/svrtf/svparser.cxx    |    7 +++----
 8 files changed, 77 insertions(+), 5 deletions(-)

New commits:
commit 84400eae86d7ae8e66f8247f4c4f3a717d90f8c0
Author: Jan Holesovsky <kendy at collabora.com>
Date:   Thu Nov 3 22:27:12 2016 +0100

    tdf#88821: Implement support for <meta charset="..."> for HTML import.
    
    The editengine HTML import was not handling it at all, and consequently not
    setting the right encoding when importing HTML in Calc.
    
    Change-Id: I3ca3dd20f36cfb579fb7ae4cd3da63a69d97601e

diff --git a/include/svtools/htmlkywd.hxx b/include/svtools/htmlkywd.hxx
index 54309a7..4cc2494 100644
--- a/include/svtools/htmlkywd.hxx
+++ b/include/svtools/htmlkywd.hxx
@@ -422,6 +422,7 @@
 #define OOO_STRING_SVTOOLS_HTML_O_alt "alt"
 #define OOO_STRING_SVTOOLS_HTML_O_axis "axis"
 #define OOO_STRING_SVTOOLS_HTML_O_char "char"
+#define OOO_STRING_SVTOOLS_HTML_O_charset "charset"
 #define OOO_STRING_SVTOOLS_HTML_O_class "class"
 #define OOO_STRING_SVTOOLS_HTML_O_code "code"
 #define OOO_STRING_SVTOOLS_HTML_O_codetype "codetype"
diff --git a/include/svtools/htmltokn.h b/include/svtools/htmltokn.h
index eeea777..37ca30e 100644
--- a/include/svtools/htmltokn.h
+++ b/include/svtools/htmltokn.h
@@ -308,6 +308,7 @@ HTML_OPTION_STRING_START        = HTML_OPTION_BOOL_END,
     HTML_O_ALT,
     HTML_O_AXIS,
     HTML_O_CHAR, // HTML3 Table Model Draft
+    HTML_O_CHARSET,
     HTML_O_CLASS,
     HTML_O_CODE, // HotJava
     HTML_O_CODETYPE,
diff --git a/sc/qa/unit/bugfix-test.cxx b/sc/qa/unit/bugfix-test.cxx
index 3968d38..2cf5b12 100644
--- a/sc/qa/unit/bugfix-test.cxx
+++ b/sc/qa/unit/bugfix-test.cxx
@@ -88,6 +88,7 @@ public:
     // void testTdf40110();
     void testTdf98657();
     void testTdf88821();
+    void testTdf88821_2();
 
     CPPUNIT_TEST_SUITE(ScFiltersTest);
     CPPUNIT_TEST(testTdf64229);
@@ -98,6 +99,7 @@ public:
     // CPPUNIT_TEST(testTdf40110);
     CPPUNIT_TEST(testTdf98657);
     CPPUNIT_TEST(testTdf88821);
+    CPPUNIT_TEST(testTdf88821_2);
     CPPUNIT_TEST_SUITE_END();
 private:
     uno::Reference<uno::XInterface> m_xCalcComponent;
@@ -256,6 +258,17 @@ void ScFiltersTest::testTdf88821()
     xDocSh->DoClose();
 }
 
+void ScFiltersTest::testTdf88821_2()
+{
+    ScDocShellRef xDocSh = loadDoc("tdf88821-2.", FORMAT_HTML);
+    ScDocument& rDoc = xDocSh->GetDocument();
+
+    // A2 should be 'ABCabcČŠŽčšž', not 'ABCabcČŠŽÄヘšž'
+    CPPUNIT_ASSERT_EQUAL(OStringToOUString("ABCabc\xC4\x8C\xC5\xA0\xC5\xBD\xC4\x8D\xC5\xA1\xC5\xBE", RTL_TEXTENCODING_UTF8), rDoc.GetString(0, 1, 0));
+
+    xDocSh->DoClose();
+}
+
 ScFiltersTest::ScFiltersTest()
       : ScBootstrapFixture( "/sc/qa/unit/data" )
 {
diff --git a/sc/qa/unit/data/html/tdf88821-2.html b/sc/qa/unit/data/html/tdf88821-2.html
new file mode 100644
index 0000000..e71094a
--- /dev/null
+++ b/sc/qa/unit/data/html/tdf88821-2.html
@@ -0,0 +1,19 @@
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+</head>
+<body>
+    <table border="1">
+        <tr>
+            <td>Text</td>
+            <td>Decimal</td>
+            <td>Date</td>
+        </tr>
+        <tr>
+            <td>ABCabcČŠŽčšž</td>
+            <td>10,50</td>
+            <td>30.1.2015</td>
+        </tr>
+    </table>
+</body>
+</html>
diff --git a/svtools/source/svhtml/htmlkywd.cxx b/svtools/source/svhtml/htmlkywd.cxx
index 6034082..2cc0a9e 100644
--- a/svtools/source/svhtml/htmlkywd.cxx
+++ b/svtools/source/svhtml/htmlkywd.cxx
@@ -576,6 +576,7 @@ static HTML_TokenEntry aHTMLOptionTab[] = {
     {{OOO_STRING_SVTOOLS_HTML_O_alt},       HTML_O_ALT},
     {{OOO_STRING_SVTOOLS_HTML_O_axis},      HTML_O_AXIS},
     {{OOO_STRING_SVTOOLS_HTML_O_char},      HTML_O_CHAR}, // HTML 3 Table Model Draft
+    {{OOO_STRING_SVTOOLS_HTML_O_charset},   HTML_O_CHARSET},
     {{OOO_STRING_SVTOOLS_HTML_O_class},     HTML_O_CLASS},
     {{OOO_STRING_SVTOOLS_HTML_O_code},      HTML_O_CODE}, // HotJava
     {{OOO_STRING_SVTOOLS_HTML_O_codetype},  HTML_O_CODETYPE},
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
index a47b4e4..c09ecc5 100644
--- a/svtools/source/svhtml/parhtml.cxx
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -1928,6 +1928,10 @@ bool HTMLParser::ParseMetaOptionsImpl(
             case HTML_O_CONTENT:
                 aContent = aOption.GetString();
                 break;
+            case HTML_O_CHARSET:
+                OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US));
+                o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr()));
+                break;
         }
     }
 
@@ -1942,7 +1946,6 @@ bool HTMLParser::ParseMetaOptionsImpl(
         aContent = convertLineEnd(aContent, GetSystemLineEnd());
     }
 
-
     if ( bHTTPEquiv && i_pHTTPHeader )
     {
         // Netscape seems to just ignore a closing ", so we do too
commit b297f7bbfed83f87398231740e910afe6ebfbb97
Author: Jan Holesovsky <kendy at collabora.com>
Date:   Thu Nov 3 17:14:01 2016 +0100

    tdf#88821: Set the encoding correctly for HTML files with a BOM.
    
    BOM (Byte Order Mark) in the HTML file changed the underlying eSrcEnc
    encoding, but did not actually update the rtl_TextToUnicodeConverter hConv.
    
    Subsequent changes of eSrcEnc in SetSrcEncoding() (triggered by
    'content="application/xhtml+xml; charset=UTF-8"' in the HTML file) were then
    ignored (eSrcEnc was already set to UTF-8), and the parser was happily using the
    old (Windows-1250) hConv.
    
    Change-Id: If432d59891d51c6abe3517e325ed73057d0f8610

diff --git a/sc/qa/unit/bugfix-test.cxx b/sc/qa/unit/bugfix-test.cxx
index 6213593..3968d38 100644
--- a/sc/qa/unit/bugfix-test.cxx
+++ b/sc/qa/unit/bugfix-test.cxx
@@ -87,6 +87,7 @@ public:
     void testTdf91979();
     // void testTdf40110();
     void testTdf98657();
+    void testTdf88821();
 
     CPPUNIT_TEST_SUITE(ScFiltersTest);
     CPPUNIT_TEST(testTdf64229);
@@ -96,6 +97,7 @@ public:
     CPPUNIT_TEST(testTdf91979);
     // CPPUNIT_TEST(testTdf40110);
     CPPUNIT_TEST(testTdf98657);
+    CPPUNIT_TEST(testTdf88821);
     CPPUNIT_TEST_SUITE_END();
 private:
     uno::Reference<uno::XInterface> m_xCalcComponent;
@@ -243,6 +245,16 @@ void ScFiltersTest::testTdf98657()
     CPPUNIT_ASSERT_EQUAL(double(285.0), rDoc.GetValue(ScAddress(1, 1, 0)));
 }
 
+void ScFiltersTest::testTdf88821()
+{
+    ScDocShellRef xDocSh = loadDoc("tdf88821.", FORMAT_HTML);
+    ScDocument& rDoc = xDocSh->GetDocument();
+
+    // B2 should be 'Périmètre', not 'PÃ©rimÃ¨tre'
+    CPPUNIT_ASSERT_EQUAL(OStringToOUString("P\xC3\xA9rim\xC3\xA8tre", RTL_TEXTENCODING_UTF8), rDoc.GetString(1, 1, 0));
+
+    xDocSh->DoClose();
+}
 
 ScFiltersTest::ScFiltersTest()
       : ScBootstrapFixture( "/sc/qa/unit/data" )
diff --git a/sc/qa/unit/data/html/tdf88821.html b/sc/qa/unit/data/html/tdf88821.html
new file mode 100644
index 0000000..f8e22c8
--- /dev/null
+++ b/sc/qa/unit/data/html/tdf88821.html
@@ -0,0 +1,23 @@
+<meta http-equiv="Content-type" content="application/xhtml+xml; charset=UTF-8" xmlns:myObj="urn:ms-kb" xmlns:myObjConvertBool="urn:ms-bool" xmlns:myObjConvertDecimal="urn:ms-dec" xmlns:myObjConvertText="urn:ms-text" />
+<HTML xmlns:myObj="urn:ms-kb" xmlns:myObjConvertBool="urn:ms-bool" xmlns:myObjConvertDecimal="urn:ms-dec" xmlns:myObjConvertText="urn:ms-text">
+    <HEAD>
+	<STYLE>.HDR { background-color:bisque;font-weight:bold }</STYLE>
+    </HEAD>
+    <BODY>
+	<TABLE>
+	    <COLGROUP WIDTH="150" ALIGN="LEFT" />
+		<COLGROUP WIDTH="150" ALIGN="LEFT" />
+		    <TD CLASS="HDR" ALIGN="CENTER">
+			Code de la liste</TD>
+		    <TD CLASS="HDR" ALIGN="CENTER">
+			Libellé de la liste</TD>
+		    <TR>
+			<TD CLASS="TDR">
+			    ACT_PERIMETRE</TD>
+			<TD CLASS="TDR">
+			    Périmètre</TD>
+		    </TR>
+	</TABLE>
+    </BODY>
+</HTML>
+
diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx
index b749400..ae6c1eb 100644
--- a/svtools/source/svrtf/svparser.cxx
+++ b/svtools/source/svrtf/svparser.cxx
@@ -104,7 +104,6 @@ void SvParser::ClearTxtConvContext()
 
 void SvParser::SetSrcEncoding( rtl_TextEncoding eEnc )
 {
-
     if( eEnc != eSrcEnc )
     {
         if( pImplData && pImplData->hConv )
@@ -172,13 +171,13 @@ sal_uInt32 SvParser::GetNextChar()
                 {
                     if( 0xfe == c1 && 0xff == c2 )
                     {
-                        eSrcEnc = RTL_TEXTENCODING_UCS2;
+                        SetSrcEncoding(RTL_TEXTENCODING_UCS2);
                         bUCS2BSrcEnc = true;
                         bSeekBack = false;
                     }
                     else if( 0xff == c1 && 0xfe == c2 )
                     {
-                        eSrcEnc = RTL_TEXTENCODING_UCS2;
+                        SetSrcEncoding(RTL_TEXTENCODING_UCS2);
                         bUCS2BSrcEnc = false;
                         bSeekBack = false;
                     }
@@ -198,7 +197,7 @@ sal_uInt32 SvParser::GetNextChar()
                         bErr = rInput.IsEof() || rInput.GetError();
                         if( !bErr && ( 0xbf == c3 ) )
                         {
-                            eSrcEnc = RTL_TEXTENCODING_UTF8;
+                            SetSrcEncoding(RTL_TEXTENCODING_UTF8);
                             bSeekBack = false;
                         }
                     }


More information about the Libreoffice-commits mailing list