[Libreoffice-commits] core.git: 2 commits - sw/qa sw/source writerfilter/source

Michael Stahl mstahl at redhat.com
Thu Jun 12 03:45:23 PDT 2014


 sw/qa/extras/rtfexport/rtfexport.cxx           |    7 +++
 sw/source/filter/inc/msfilter.hxx              |   12 +++++-
 sw/source/filter/ww8/rtfattributeoutput.cxx    |   19 ++++++++-
 sw/source/filter/ww8/writerwordglue.cxx        |   49 +++++++++++++++++++++++++
 sw/source/filter/ww8/wrtw8sty.cxx              |    3 +
 writerfilter/source/rtftok/rtfdocumentimpl.cxx |    8 ++--
 6 files changed, 89 insertions(+), 9 deletions(-)

New commits:
commit e47a02b1524061143d8e77a54eb95c77f2e6dae2
Author: Michael Stahl <mstahl at redhat.com>
Date:   Thu Jun 12 12:16:28 2014 +0200

    fdo#77979: sw: RTF export: write non-ASCII font names encoded
    
    Currently font names like "微软雅黑" (Microsoft YaHei) are
    written as "????" in the RTF export; to avoid that, set the \fcharset
    of the font entry to something that at least is able to encode
    the font name and alternate name.
    
    This requires a new function since the existing
    rtl_TextEncodingToWinCharset was changed in
    b88fe998ce8c80d7629fe70118311096615d959d to return "default" 0x01
    (for OOXML) which is quite unhelpful for RTF.
    
    This is not entirely satisfactory, as of course there is no guarantee
    that the encoding can represent all of the actual text that has the
    font applied; hence there are some \'3f in the fall-back encoded text
    of the heading of the bugdoc, which indicates that the detected
    Shift-JIS is insufficient and GB-2312 would be required; but it's not
    obvious how to do better here without iterating over all the text
    twice, and that still leaves the possibility that all text that has a
    particular font applied cannot be represented by a single non-Unicode
    encoding.
    
    But since we always write text as the \u Unicode + legacy fall-back,
    this should not be a big problem since modern RTF readers will simply
    read the Unicode.
    
    Change-Id: Ie6a42294c501d014dd9f0df82638519412ca19bb

diff --git a/sw/qa/extras/rtfexport/rtfexport.cxx b/sw/qa/extras/rtfexport/rtfexport.cxx
index 9a38b49..ce91526 100644
--- a/sw/qa/extras/rtfexport/rtfexport.cxx
+++ b/sw/qa/extras/rtfexport/rtfexport.cxx
@@ -317,6 +317,13 @@ DECLARE_RTFEXPORT_TEST(testMathRuns, "math-runs.rtf")
     CPPUNIT_ASSERT_EQUAL(OUString("\\{ left [ right ] left ( right ) \\}"), getFormula(getRun(getParagraph(1), 1)));
 }
 
+DECLARE_RTFEXPORT_TEST(testFdo77979, "fdo77979.odt")
+{
+    // font name is encoded with \fcharset of font
+    CPPUNIT_ASSERT_EQUAL(OUString("微软雅黑", 12, RTL_TEXTENCODING_UTF8),
+            getProperty<OUString>(getRun(getParagraph(1), 1), "CharFontName"));
+}
+
 DECLARE_RTFEXPORT_TEST(testFdo53113, "fdo53113.odt")
 {
     /*
diff --git a/sw/source/filter/inc/msfilter.hxx b/sw/source/filter/inc/msfilter.hxx
index babfc7e..8ff19a0 100644
--- a/sw/source/filter/inc/msfilter.hxx
+++ b/sw/source/filter/inc/msfilter.hxx
@@ -59,8 +59,7 @@ namespace sw
     {
         /** MSOffice appears to set the charset of unicode fonts to MS 932
 
-            Arial Unicode MS for example is a unicode font, but word sets
-            exported uses of it to the MS 932 charset
+            But we do "default", whatever that means.
 
             @param eTextEncoding
                 the OOo encoding to convert from
@@ -73,6 +72,15 @@ namespace sw
         */
         sal_uInt8 rtl_TextEncodingToWinCharset(rtl_TextEncoding eTextEncoding);
 
+        /** MSOffice appears to set the charset of unicode fonts to MS 932
+
+            Arial Unicode MS for example is a unicode font, but word sets
+            exported uses of it to the MS 932 charset
+
+        */
+        sal_uInt8 rtl_TextEncodingToWinCharsetRTF(OUString const& rFontName,
+                OUString const& rAltName, rtl_TextEncoding eTextEncoding);
+
         /** Import a MSWord XE field. Suitable for .doc and .rtf
 
             @param rDoc
diff --git a/sw/source/filter/ww8/rtfattributeoutput.cxx b/sw/source/filter/ww8/rtfattributeoutput.cxx
index 41e06be..162b6cc 100644
--- a/sw/source/filter/ww8/rtfattributeoutput.cxx
+++ b/sw/source/filter/ww8/rtfattributeoutput.cxx
@@ -2073,7 +2073,12 @@ void RtfAttributeOutput::CharFont(const SvxFontItem& rFont)
     m_aStylesEnd.append(OOO_STRING_SVTOOLS_RTF_LOCH);
     m_aStylesEnd.append(OOO_STRING_SVTOOLS_RTF_F);
     m_aStylesEnd.append((sal_Int32)m_rExport.maFontHelper.GetId(rFont));
-    m_rExport.eCurrentEncoding = rtl_getTextEncodingFromWindowsCharset(rtl_getBestWindowsCharsetFromTextEncoding(rFont.GetCharSet()));
+    // FIXME: this may be a tad expensive... but the charset needs to be
+    // consistent with what wwFont::WriteRtf() does
+    FontMapExport aTmp(rFont.GetFamilyName());
+    m_rExport.eCurrentEncoding = rtl_getTextEncodingFromWindowsCharset(
+            sw::ms::rtl_TextEncodingToWinCharsetRTF(
+                aTmp.msPrimary, aTmp.msSecondary, rFont.GetCharSet()));
     if (m_rExport.eCurrentEncoding == RTL_TEXTENCODING_DONTKNOW)
         m_rExport.eCurrentEncoding = m_rExport.eDefaultEncoding;
 }
@@ -3270,20 +3275,27 @@ MSWordExportBase& RtfAttributeOutput::GetExport()
 /// Start the font.
 void RtfAttributeOutput::StartFont(const OUString& rFamilyName) const
 {
-    m_rExport.Strm().WriteCharPtr(OUStringToOString(rFamilyName, m_rExport.eCurrentEncoding).getStr());
+    // write the font name hex-encoded, but without Unicode - Word at least
+    // cannot read *both* Unicode and fallback as written by OutString
+    m_rExport.Strm().WriteCharPtr(
+        msfilter::rtfutil::OutString(rFamilyName, m_rExport.eCurrentEncoding, false).getStr());
 }
 
 /// End the font.
 void RtfAttributeOutput::EndFont() const
 {
     m_rExport.Strm().WriteCharPtr(";}");
+    m_rExport.eCurrentEncoding = m_rExport.eDefaultEncoding;
 }
 
 /// Alternate name for the font.
 void RtfAttributeOutput::FontAlternateName(const OUString& rName) const
 {
     m_rExport.Strm().WriteChar('{').WriteCharPtr(OOO_STRING_SVTOOLS_RTF_IGNORE).WriteCharPtr(OOO_STRING_SVTOOLS_RTF_FALT).WriteChar(' ');
-    m_rExport.Strm().WriteCharPtr(OUStringToOString(rName, m_rExport.eCurrentEncoding).getStr()).WriteChar('}');
+    // write the font name hex-encoded, but without Unicode - Word at least
+    // cannot read *both* Unicode and fallback as written by OutString
+    m_rExport.Strm().WriteCharPtr(
+        msfilter::rtfutil::OutString(rName, m_rExport.eCurrentEncoding, false).getStr()).WriteChar('}');
 }
 
 /// Font charset.
@@ -3292,6 +3304,7 @@ void RtfAttributeOutput::FontCharset(sal_uInt8 nCharSet) const
     m_rExport.Strm().WriteCharPtr(OOO_STRING_SVTOOLS_RTF_FCHARSET);
     m_rExport.OutULong(nCharSet);
     m_rExport.Strm().WriteChar(' ');
+    m_rExport.eCurrentEncoding =rtl_getTextEncodingFromWindowsCharset(nCharSet);
 }
 
 /// Font family.
diff --git a/sw/source/filter/ww8/writerwordglue.cxx b/sw/source/filter/ww8/writerwordglue.cxx
index 56dcc9e..fe95384 100644
--- a/sw/source/filter/ww8/writerwordglue.cxx
+++ b/sw/source/filter/ww8/writerwordglue.cxx
@@ -712,6 +712,55 @@ namespace sw
             return nRet;
         }
 
+        /// @return true iff rString converts to eEncoding without loss; this
+        /// is a Unicode-to-text conversion, so RTL_UNICODETOTEXT_* flags apply.
+        static bool CanEncode(OUString const& rString, rtl_TextEncoding const eEncoding)
+        {
+            rtl::OString tmp;
+            return rString.convertToString(&tmp, eEncoding,
+                    RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR | RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR);
+        }
+
+        sal_uInt8 rtl_TextEncodingToWinCharsetRTF(
+                OUString const& rFontName, OUString const& rAltName,
+                rtl_TextEncoding eTextEncoding)
+        {
+            sal_uInt8 nRet =
+                rtl_getBestWindowsCharsetFromTextEncoding(eTextEncoding);
+            switch (eTextEncoding)
+            {
+                case RTL_TEXTENCODING_DONTKNOW:
+                case RTL_TEXTENCODING_UCS2:
+                case RTL_TEXTENCODING_UTF7:
+                case RTL_TEXTENCODING_UTF8:
+                case RTL_TEXTENCODING_JAVA_UTF8:
+                    static struct { rtl_TextEncoding enc; sal_uInt8 charset; }
+                        const s_fallbacks [] = {
+                            { RTL_TEXTENCODING_MS_932, 0x80 }, // Shift-JIS
+                            { RTL_TEXTENCODING_MS_936, 0x86 }, // GB-2312
+                            { RTL_TEXTENCODING_MS_950, 0x88 }, // Big5
+                            { RTL_TEXTENCODING_MS_949, 0x81 }, // EUC-KR
+                        };
+                    for (size_t i = 0; i < SAL_N_ELEMENTS(s_fallbacks); ++i)
+                    {
+                        // fall back to a charset that can at least encode
+                        // the font's name
+                        if (CanEncode(rFontName, s_fallbacks[i].enc)
+                            && CanEncode(rAltName, s_fallbacks[i].enc))
+                        {
+                            return s_fallbacks[i].charset;
+                        }
+                    }
+                    SAL_INFO("sw.rtf", "no fallback charset found for font: "
+                            << rFontName << " " << rAltName);
+                    nRet = 0x01; // all hope lost: "default", whatever that is
+                    break;
+                default:
+                    break;
+            }
+            return nRet;
+        }
+
         long DateTime2DTTM( const DateTime& rDT )
         {
         /*
diff --git a/sw/source/filter/ww8/wrtw8sty.cxx b/sw/source/filter/ww8/wrtw8sty.cxx
index 588934b..5d35ae2 100644
--- a/sw/source/filter/ww8/wrtw8sty.cxx
+++ b/sw/source/filter/ww8/wrtw8sty.cxx
@@ -856,7 +856,8 @@ void wwFont::WriteRtf( const RtfAttributeOutput* rAttrOutput ) const
 {
     rAttrOutput->FontFamilyType( meFamily, *this );
     rAttrOutput->FontPitchType( mePitch );
-    rAttrOutput->FontCharset( rtl_getBestWindowsCharsetFromTextEncoding( meChrSet ) );
+    rAttrOutput->FontCharset(
+        sw::ms::rtl_TextEncodingToWinCharsetRTF(msFamilyNm, msAltNm, meChrSet));
     rAttrOutput->StartFont( msFamilyNm );
     if ( mbAlt )
         rAttrOutput->FontAlternateName( msAltNm );
commit 04d5a280beeeb6e056df68395dc9c3b3a674361b
Author: Michael Stahl <mstahl at redhat.com>
Date:   Thu Jun 12 12:04:01 2014 +0200

    related: fdo#77979: writerfilter RTF import: read encoded font name
    
    The font name is encoded in the font's charset given by \fcharset.
    
    Change-Id: Id9520649a1eb3b55f4314e140abda7399f23d925

diff --git a/writerfilter/source/rtftok/rtfdocumentimpl.cxx b/writerfilter/source/rtftok/rtfdocumentimpl.cxx
index 0608cb2..38bf6bb 100644
--- a/writerfilter/source/rtftok/rtfdocumentimpl.cxx
+++ b/writerfilter/source/rtftok/rtfdocumentimpl.cxx
@@ -980,9 +980,6 @@ int RTFDocumentImpl::resolveChars(char ch)
 
         if (RTL_TEXTENCODING_MS_932 == m_aStates.top().nCurrentEncoding)
         {
-            // fdo#79384: Word will reject Shift-JIS following \loch
-            // but apparently OOo could read and (worse) write such documents
-            SAL_INFO_IF(m_aStates.top().eRunType != RTFParserState::DBCH, "writerfilter.rtf", "invalid Shift-JIS without DBCH");
             unsigned char uch = ch;
             if ((uch >= 0x80 && uch <= 0x9F) || uch >= 0xE0)
             {
@@ -990,6 +987,9 @@ int RTFDocumentImpl::resolveChars(char ch)
                 Strm().ReadChar(ch);
                 if (m_aStates.top().nCharsToSkip == 0)
                 {
+                    // fdo#79384: Word will reject Shift-JIS following \loch
+                    // but apparently OOo could read and (worse) write such documents
+                    SAL_INFO_IF(m_aStates.top().eRunType != RTFParserState::DBCH, "writerfilter.rtf", "invalid Shift-JIS without DBCH");
                     assert(bUnicodeChecked);
                     aBuf.append(ch);
                 }
@@ -3550,6 +3550,7 @@ int RTFDocumentImpl::dispatchValue(RTFKeyword nKeyword, int nParam)
             return 0;
 
         m_nCurrentEncoding = rtl_getTextEncodingFromWindowsCodePage(aRTFEncodings[i].codepage);
+        m_aStates.top().nCurrentEncoding = m_nCurrentEncoding;
     }
     break;
     case RTF_ANSICPG:
@@ -3560,6 +3561,7 @@ int RTFDocumentImpl::dispatchValue(RTFKeyword nKeyword, int nParam)
     break;
     case RTF_CPG:
         m_nCurrentEncoding = rtl_getTextEncodingFromWindowsCodePage(nParam);
+        m_aStates.top().nCurrentEncoding = m_nCurrentEncoding;
         break;
     case RTF_CF:
     {


More information about the Libreoffice-commits mailing list