[Libreoffice-commits] core.git: 4 commits - sw/source

Caolán McNamara caolanm at redhat.com
Wed Aug 27 07:06:18 PDT 2014


 sw/source/filter/ww8/ww8par.cxx |  136 +++++++++++++++++++++++++---------------
 sw/source/filter/ww8/ww8par.hxx |    2 
 2 files changed, 88 insertions(+), 50 deletions(-)

New commits:
commit 47b84f7e5143f445a087fc9ccc4fb29bbd88ff64
Author: Caolán McNamara <caolanm at redhat.com>
Date:   Wed Aug 27 15:03:45 2014 +0100

    Resolves: fdo#82904 non-Japanese ww95 documents claiming ms932 encoding
    
    Change-Id: I62f8d5c3cac71f83f5cdde114f66e8554a780538

diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx
index 4fc41eb..54cfbcc 100644
--- a/sw/source/filter/ww8/ww8par.cxx
+++ b/sw/source/filter/ww8/ww8par.cxx
@@ -3037,8 +3037,51 @@ bool SwWW8ImplReader::ReadPlainChars(WW8_CP& rPos, sal_Int32 nEnd, sal_Int32 nCp
     // the correct FilePos has already been reached.
     const sal_Int32 nStrLen = std::min(nValidStrLen, SAL_MAX_INT32-1);
 
-    const rtl_TextEncoding eSrcCharSet = bVer67 ? GetCurrentCharSet() :
+    rtl_TextEncoding eSrcCharSet = bVer67 ? GetCurrentCharSet() :
         RTL_TEXTENCODING_MS_1252;
+    if (bVer67 && eSrcCharSet == RTL_TEXTENCODING_MS_932)
+    {
+        /*
+         fdo#82904
+
+         Older documents exported as Word 95 that use Unicode-aware fonts will
+         have the charset of those fonts set to RTL_TEXTENCODING_MS_932 on
+         export, as the result of converting RTL_TEXTENCODING_UNICODE. This is
+         a serious pain.
+
+         We will try and use a fallback encoding if the conversion from
+         RTL_TEXTENCODING_MS_932 fails, but you can get unlucky and get a document
+         which isn't really in RTL_TEXTENCODING_MS_932 but parts of it form
+         valid RTL_TEXTENCODING_MS_932 by chance :-(
+
+         We're not the only ones that struggle with this: Here's the help from
+         MSOffice 2003 on the topic:
+
+         <<
+          Earlier versions of Microsoft Word were sometimes used in conjunction with
+          third-party language-processing add-in programs designed to support Chinese or
+          Korean on English versions of Microsoft Windows. Use of these add-ins sometimes
+          results in incorrect text display in more recent versions of Word.
+
+          However, you can set options to convert these documents so that text is
+          displayed correctly. On the Tools menu, click Options, and then click the
+          General tab. In the English Word 6.0/95 documents list, select Contain Asian
+          text (to have Word interpret the text as Asian code page data, regardless of
+          its font) or Automatically detect Asian text (to have Word attempt to determine
+          which parts of the text are meant to be Asian).
+        >>
+
+        What we can try here is to ignore a RTL_TEXTENCODING_MS_932 codepage if
+        the language is not Japanese
+        */
+
+        const SfxPoolItem * pItem = GetFmtAttr(RES_CHRATR_CJK_LANGUAGE);
+        if (pItem != NULL && LANGUAGE_JAPANESE != static_cast<const SvxLanguageItem *>(pItem)->GetLanguage())
+        {
+            SAL_WARN("sw.ww8", "discarding word95 RTL_TEXTENCODING_MS_932 encoding");
+            eSrcCharSet = GetCharSetFromLanguage();
+        }
+    }
     const rtl_TextEncoding eSrcCJKCharSet = bVer67 ? GetCurrentCJKCharSet() :
         RTL_TEXTENCODING_MS_1252;
 
commit 4143d7bc7078fb367130e092a354b20da57585cc
Author: Caolán McNamara <caolanm at redhat.com>
Date:   Wed Aug 27 15:00:15 2014 +0100

    sync GetCurrentCJKCharSet with GetCurrentCharSet
    
    Change-Id: Ibcf1fa35617ee8d7fab6b66e3e8c8881ad55c3e5

diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx
index 53a2a02..4fc41eb 100644
--- a/sw/source/filter/ww8/ww8par.cxx
+++ b/sw/source/filter/ww8/ww8par.cxx
@@ -2809,6 +2809,24 @@ rtl_TextEncoding SwWW8ImplReader::GetCharSetFromLanguage()
     return msfilter::util::getBestTextEncodingFromLocale(aLocale);
 }
 
+rtl_TextEncoding SwWW8ImplReader::GetCJKCharSetFromLanguage()
+{
+    /*
+     #i22206#/#i52786#
+     Returns the best text encoding for the RES_CHRATR_CJK_LANGUAGE attribute
+     (LANGUAGE_SYSTEM if unset). The (default) character set used for a run of
+     text is the default character set for the version of Word that last saved
+     the document. This is a bit tentative; more might be required if the
+     concept is correct. When later versions of Word write older 6/95 documents
+     the charset is correctly set in the character runs involved, so it's hard
+     to reproduce documents that require this to be sure of the process involved.
+    */
+    const SvxLanguageItem *pLang = (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_CJK_LANGUAGE);
+    LanguageType eLang = pLang ? pLang->GetLanguage() : LANGUAGE_SYSTEM;
+    ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(eLang));
+    return msfilter::util::getBestTextEncodingFromLocale(aLocale);
+}
+
 rtl_TextEncoding SwWW8ImplReader::GetCurrentCharSet()
 {
     /*
@@ -2846,33 +2864,12 @@ rtl_TextEncoding SwWW8ImplReader::GetCurrentCJKCharSet()
     {
         if (!maFontSrcCJKCharSets.empty())
             eSrcCharSet = maFontSrcCJKCharSets.top();
-        if (!vColl.empty())
-        {
-            if ((eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) && nCharFmt >= 0 && (size_t)nCharFmt < vColl.size() )
-                eSrcCharSet = vColl[nCharFmt].GetCJKCharSet();
-            if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW && nAktColl < vColl.size())
-                eSrcCharSet = vColl[nAktColl].GetCJKCharSet();
-        }
+        if ((eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) && nCharFmt >= 0 && (size_t)nCharFmt < vColl.size() )
+            eSrcCharSet = vColl[nCharFmt].GetCJKCharSet();
+        if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW && StyleExists(nAktColl) && nAktColl < vColl.size())
+            eSrcCharSet = vColl[nAktColl].GetCJKCharSet();
         if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW)
-        { // patch from cmc for #i52786#
-            /*
-             #i22206#/#i52786#
-             The (default) character set used for a run of text is the default
-             character set for the version of Word that last saved the document.
-
-             This is a bit tentative, more might be required if the concept is correct.
-             When later version of word write older 6/95 documents the charset is
-             correctly set in the character runs involved, so its hard to reproduce
-             documents that require this to be sure of the process involved.
-            */
-            const SvxLanguageItem *pLang =
-                (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE);
-            if (pLang)
-            {
-                ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(pLang->GetLanguage()));
-                eSrcCharSet = msfilter::util::getBestTextEncodingFromLocale(aLocale);
-            }
-        }
+            eSrcCharSet = GetCJKCharSetFromLanguage();
     }
     return eSrcCharSet;
 }
diff --git a/sw/source/filter/ww8/ww8par.hxx b/sw/source/filter/ww8/ww8par.hxx
index 50fe2c9..96963a2 100644
--- a/sw/source/filter/ww8/ww8par.hxx
+++ b/sw/source/filter/ww8/ww8par.hxx
@@ -1922,6 +1922,7 @@ public:     // eigentlich private, geht aber leider nur public
     rtl_TextEncoding GetCurrentCharSet();
     rtl_TextEncoding GetCurrentCJKCharSet();
     rtl_TextEncoding GetCharSetFromLanguage();
+    rtl_TextEncoding GetCJKCharSetFromLanguage();
 
     void PostProcessAttrs();
     static void ReadEmbeddedData(SvMemoryStream& rStrm, SwDocShell* pDocShell, struct HyperLinksTable& hlStr);
commit 56c9850145faa9ac04c3f09633e56b6c8c22c6c4
Author: Caolán McNamara <caolanm at redhat.com>
Date:   Wed Aug 27 14:57:05 2014 +0100

    refactor into GetCharSetFromLanguage
    
    Change-Id: I54382b0dd0f6b6f21f635d75cb3ee3cefc1eb203

diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx
index 6b4ec95..53a2a02 100644
--- a/sw/source/filter/ww8/ww8par.cxx
+++ b/sw/source/filter/ww8/ww8par.cxx
@@ -2791,6 +2791,24 @@ bool SwWW8ImplReader::ProcessSpecial(bool &rbReSync, WW8_CP nStartCp)
     return bTableRowEnd;
 }
 
+rtl_TextEncoding SwWW8ImplReader::GetCharSetFromLanguage()
+{
+    /*
+     #i22206#/#i52786#
+     Returns the best text encoding for the RES_CHRATR_LANGUAGE attribute
+     (LANGUAGE_SYSTEM if unset). The (default) character set used for a run of
+     text is the default character set for the version of Word that last saved
+     the document. This is a bit tentative; more might be required if the
+     concept is correct. When later versions of Word write older 6/95 documents
+     the charset is correctly set in the character runs involved, so it's hard
+     to reproduce documents that require this to be sure of the process involved.
+    */
+    const SvxLanguageItem *pLang = (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE);
+    LanguageType eLang = pLang ? pLang->GetLanguage() : LANGUAGE_SYSTEM;
+    ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(eLang));
+    return msfilter::util::getBestTextEncodingFromLocale(aLocale);
+}
+
 rtl_TextEncoding SwWW8ImplReader::GetCurrentCharSet()
 {
     /*
@@ -2809,22 +2827,7 @@ rtl_TextEncoding SwWW8ImplReader::GetCurrentCharSet()
         if ((eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) && StyleExists(nAktColl) && nAktColl < vColl.size())
             eSrcCharSet = vColl[nAktColl].GetCharSet();
         if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW)
-        {
-            /*
-             #i22206#/#i52786#
-             The (default) character set used for a run of text is the default
-             character set for the version of Word that last saved the document.
-
-             This is a bit tentative, more might be required if the concept is correct.
-             When later version of word write older 6/95 documents the charset is
-             correctly set in the character runs involved, so its hard to reproduce
-             documents that require this to be sure of the process involved.
-            */
-            const SvxLanguageItem *pLang = (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE);
-            LanguageType eLang = pLang ? pLang->GetLanguage() : LANGUAGE_SYSTEM;
-            ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(eLang));
-            eSrcCharSet = msfilter::util::getBestTextEncodingFromLocale(aLocale);
-        }
+            eSrcCharSet = GetCharSetFromLanguage();
     }
     return eSrcCharSet;
 }
diff --git a/sw/source/filter/ww8/ww8par.hxx b/sw/source/filter/ww8/ww8par.hxx
index d44e508..50fe2c9 100644
--- a/sw/source/filter/ww8/ww8par.hxx
+++ b/sw/source/filter/ww8/ww8par.hxx
@@ -1921,6 +1921,7 @@ public:     // eigentlich private, geht aber leider nur public
     sal_uLong LoadDoc( SwPaM&,WW8Glossary *pGloss=0);
     rtl_TextEncoding GetCurrentCharSet();
     rtl_TextEncoding GetCurrentCJKCharSet();
+    rtl_TextEncoding GetCharSetFromLanguage();
 
     void PostProcessAttrs();
     static void ReadEmbeddedData(SvMemoryStream& rStrm, SwDocShell* pDocShell, struct HyperLinksTable& hlStr);
commit 804d60d2ee4c099f685a6e42438fa0de15ca29be
Author: Caolán McNamara <caolanm at redhat.com>
Date:   Wed Aug 27 14:04:53 2014 +0100

    duplicate with getBestTextEncodingFromLocale
    
    Change-Id: I73a69fdfee0b0f3af5bf6b4e52629dba7ed69630

diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx
index 3395868..6b4ec95 100644
--- a/sw/source/filter/ww8/ww8par.cxx
+++ b/sw/source/filter/ww8/ww8par.cxx
@@ -2866,15 +2866,8 @@ rtl_TextEncoding SwWW8ImplReader::GetCurrentCJKCharSet()
                 (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE);
             if (pLang)
             {
-                switch (pLang->GetLanguage())
-                {
-                    case LANGUAGE_CZECH:
-                        eSrcCharSet = RTL_TEXTENCODING_MS_1250;
-                        break;
-                    default:
-                        eSrcCharSet = RTL_TEXTENCODING_MS_1252;
-                        break;
-                }
+                ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(pLang->GetLanguage()));
+                eSrcCharSet = msfilter::util::getBestTextEncodingFromLocale(aLocale);
             }
         }
     }


More information about the Libreoffice-commits mailing list