[Libreoffice-commits] core.git: 4 commits - sw/source
Caolán McNamara
caolanm at redhat.com
Wed Aug 27 07:06:18 PDT 2014
sw/source/filter/ww8/ww8par.cxx | 136 +++++++++++++++++++++++++---------------
sw/source/filter/ww8/ww8par.hxx | 2
2 files changed, 88 insertions(+), 50 deletions(-)
New commits:
commit 47b84f7e5143f445a087fc9ccc4fb29bbd88ff64
Author: Caolán McNamara <caolanm at redhat.com>
Date: Wed Aug 27 15:03:45 2014 +0100
Resolves: fdo#82904 non-Japanese ww95 documents claiming ms932 encoding
Change-Id: I62f8d5c3cac71f83f5cdde114f66e8554a780538
diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx
index 4fc41eb..54cfbcc 100644
--- a/sw/source/filter/ww8/ww8par.cxx
+++ b/sw/source/filter/ww8/ww8par.cxx
@@ -3037,8 +3037,51 @@ bool SwWW8ImplReader::ReadPlainChars(WW8_CP& rPos, sal_Int32 nEnd, sal_Int32 nCp
// the correct FilePos has already been reached.
const sal_Int32 nStrLen = std::min(nValidStrLen, SAL_MAX_INT32-1);
- const rtl_TextEncoding eSrcCharSet = bVer67 ? GetCurrentCharSet() :
+ rtl_TextEncoding eSrcCharSet = bVer67 ? GetCurrentCharSet() :
RTL_TEXTENCODING_MS_1252;
+ if (bVer67 && eSrcCharSet == RTL_TEXTENCODING_MS_932)
+ {
+ /*
+ fdo#82904
+
+ Older documents exported as Word 95 that use Unicode-aware fonts will
+ have the charset of those fonts set to RTL_TEXTENCODING_MS_932 on
+ export, as the result of converting from RTL_TEXTENCODING_UNICODE. This
+ is a serious pain.
+
+ We will try and use a fallback encoding if the conversion from
+ RTL_TEXTENCODING_MS_932 fails, but you can get unlucky and get a document
+ which isn't really in RTL_TEXTENCODING_MS_932 but parts of it form
+ valid RTL_TEXTENCODING_MS_932 by chance :-(
+
+ We're not the only ones that struggle with this: Here's the help from
+ MSOffice 2003 on the topic:
+
+ <<
+ Earlier versions of Microsoft Word were sometimes used in conjunction with
+ third-party language-processing add-in programs designed to support Chinese or
+ Korean on English versions of Microsoft Windows. Use of these add-ins sometimes
+ results in incorrect text display in more recent versions of Word.
+
+ However, you can set options to convert these documents so that text is
+ displayed correctly. On the Tools menu, click Options, and then click the
+ General tab. In the English Word 6.0/95 documents list, select Contain Asian
+ text (to have Word interpret the text as Asian code page data, regardless of
+ its font) or Automatically detect Asian text (to have Word attempt to determine
+ which parts of the text are meant to be Asian).
+ >>
+
+ What we can try here is to ignore a RTL_TEXTENCODING_MS_932 codepage if
+ the language is not Japanese
+ */
+
+ const SfxPoolItem * pItem = GetFmtAttr(RES_CHRATR_CJK_LANGUAGE);
+ if (pItem != NULL && LANGUAGE_JAPANESE != static_cast<const SvxLanguageItem *>(pItem)->GetLanguage())
+ {
+ SAL_WARN("sw.ww8", "discarding word95 RTL_TEXTENCODING_MS_932 encoding");
+ eSrcCharSet = GetCharSetFromLanguage();
+ }
+ }
const rtl_TextEncoding eSrcCJKCharSet = bVer67 ? GetCurrentCJKCharSet() :
RTL_TEXTENCODING_MS_1252;
commit 4143d7bc7078fb367130e092a354b20da57585cc
Author: Caolán McNamara <caolanm at redhat.com>
Date: Wed Aug 27 15:00:15 2014 +0100
sync GetCurrentCJKCharSet with GetCurrentCharSet
Change-Id: Ibcf1fa35617ee8d7fab6b66e3e8c8881ad55c3e5
diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx
index 53a2a02..4fc41eb 100644
--- a/sw/source/filter/ww8/ww8par.cxx
+++ b/sw/source/filter/ww8/ww8par.cxx
@@ -2809,6 +2809,24 @@ rtl_TextEncoding SwWW8ImplReader::GetCharSetFromLanguage()
return msfilter::util::getBestTextEncodingFromLocale(aLocale);
}
+rtl_TextEncoding SwWW8ImplReader::GetCJKCharSetFromLanguage()
+{
+ /*
+ #i22206#/#i52786#
+ The (default) character set used for a run of text is the default
+ character set for the version of Word that last saved the document.
+
+ This is a bit tentative, more might be required if the concept is correct.
+ When later versions of Word write older 6/95 documents, the charset is
+ correctly set in the character runs involved, so it's hard to reproduce
+ documents that require this to be sure of the process involved.
+ */
+ const SvxLanguageItem *pLang = (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_CJK_LANGUAGE);
+ LanguageType eLang = pLang ? pLang->GetLanguage() : LANGUAGE_SYSTEM;
+ ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(eLang));
+ return msfilter::util::getBestTextEncodingFromLocale(aLocale);
+}
+
rtl_TextEncoding SwWW8ImplReader::GetCurrentCharSet()
{
/*
@@ -2846,33 +2864,12 @@ rtl_TextEncoding SwWW8ImplReader::GetCurrentCJKCharSet()
{
if (!maFontSrcCJKCharSets.empty())
eSrcCharSet = maFontSrcCJKCharSets.top();
- if (!vColl.empty())
- {
- if ((eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) && nCharFmt >= 0 && (size_t)nCharFmt < vColl.size() )
- eSrcCharSet = vColl[nCharFmt].GetCJKCharSet();
- if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW && nAktColl < vColl.size())
- eSrcCharSet = vColl[nAktColl].GetCJKCharSet();
- }
+ if ((eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) && nCharFmt >= 0 && (size_t)nCharFmt < vColl.size() )
+ eSrcCharSet = vColl[nCharFmt].GetCJKCharSet();
+ if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW && StyleExists(nAktColl) && nAktColl < vColl.size())
+ eSrcCharSet = vColl[nAktColl].GetCJKCharSet();
if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW)
- { // patch from cmc for #i52786#
- /*
- #i22206#/#i52786#
- The (default) character set used for a run of text is the default
- character set for the version of Word that last saved the document.
-
- This is a bit tentative, more might be required if the concept is correct.
- When later version of word write older 6/95 documents the charset is
- correctly set in the character runs involved, so its hard to reproduce
- documents that require this to be sure of the process involved.
- */
- const SvxLanguageItem *pLang =
- (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE);
- if (pLang)
- {
- ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(pLang->GetLanguage()));
- eSrcCharSet = msfilter::util::getBestTextEncodingFromLocale(aLocale);
- }
- }
+ eSrcCharSet = GetCJKCharSetFromLanguage();
}
return eSrcCharSet;
}
diff --git a/sw/source/filter/ww8/ww8par.hxx b/sw/source/filter/ww8/ww8par.hxx
index 50fe2c9..96963a2 100644
--- a/sw/source/filter/ww8/ww8par.hxx
+++ b/sw/source/filter/ww8/ww8par.hxx
@@ -1922,6 +1922,7 @@ public: // eigentlich private, geht aber leider nur public
rtl_TextEncoding GetCurrentCharSet();
rtl_TextEncoding GetCurrentCJKCharSet();
rtl_TextEncoding GetCharSetFromLanguage();
+ rtl_TextEncoding GetCJKCharSetFromLanguage();
void PostProcessAttrs();
static void ReadEmbeddedData(SvMemoryStream& rStrm, SwDocShell* pDocShell, struct HyperLinksTable& hlStr);
commit 56c9850145faa9ac04c3f09633e56b6c8c22c6c4
Author: Caolán McNamara <caolanm at redhat.com>
Date: Wed Aug 27 14:57:05 2014 +0100
refactor into GetCharSetFromLanguage
Change-Id: I54382b0dd0f6b6f21f635d75cb3ee3cefc1eb203
diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx
index 6b4ec95..53a2a02 100644
--- a/sw/source/filter/ww8/ww8par.cxx
+++ b/sw/source/filter/ww8/ww8par.cxx
@@ -2791,6 +2791,24 @@ bool SwWW8ImplReader::ProcessSpecial(bool &rbReSync, WW8_CP nStartCp)
return bTableRowEnd;
}
+rtl_TextEncoding SwWW8ImplReader::GetCharSetFromLanguage()
+{
+ /*
+ #i22206#/#i52786#
+ The (default) character set used for a run of text is the default
+ character set for the version of Word that last saved the document.
+
+ This is a bit tentative, more might be required if the concept is correct.
+ When later versions of Word write older 6/95 documents, the charset is
+ correctly set in the character runs involved, so it's hard to reproduce
+ documents that require this to be sure of the process involved.
+ */
+ const SvxLanguageItem *pLang = (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE);
+ LanguageType eLang = pLang ? pLang->GetLanguage() : LANGUAGE_SYSTEM;
+ ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(eLang));
+ return msfilter::util::getBestTextEncodingFromLocale(aLocale);
+}
+
rtl_TextEncoding SwWW8ImplReader::GetCurrentCharSet()
{
/*
@@ -2809,22 +2827,7 @@ rtl_TextEncoding SwWW8ImplReader::GetCurrentCharSet()
if ((eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) && StyleExists(nAktColl) && nAktColl < vColl.size())
eSrcCharSet = vColl[nAktColl].GetCharSet();
if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW)
- {
- /*
- #i22206#/#i52786#
- The (default) character set used for a run of text is the default
- character set for the version of Word that last saved the document.
-
- This is a bit tentative, more might be required if the concept is correct.
- When later version of word write older 6/95 documents the charset is
- correctly set in the character runs involved, so its hard to reproduce
- documents that require this to be sure of the process involved.
- */
- const SvxLanguageItem *pLang = (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE);
- LanguageType eLang = pLang ? pLang->GetLanguage() : LANGUAGE_SYSTEM;
- ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(eLang));
- eSrcCharSet = msfilter::util::getBestTextEncodingFromLocale(aLocale);
- }
+ eSrcCharSet = GetCharSetFromLanguage();
}
return eSrcCharSet;
}
diff --git a/sw/source/filter/ww8/ww8par.hxx b/sw/source/filter/ww8/ww8par.hxx
index d44e508..50fe2c9 100644
--- a/sw/source/filter/ww8/ww8par.hxx
+++ b/sw/source/filter/ww8/ww8par.hxx
@@ -1921,6 +1921,7 @@ public: // eigentlich private, geht aber leider nur public
sal_uLong LoadDoc( SwPaM&,WW8Glossary *pGloss=0);
rtl_TextEncoding GetCurrentCharSet();
rtl_TextEncoding GetCurrentCJKCharSet();
+ rtl_TextEncoding GetCharSetFromLanguage();
void PostProcessAttrs();
static void ReadEmbeddedData(SvMemoryStream& rStrm, SwDocShell* pDocShell, struct HyperLinksTable& hlStr);
commit 804d60d2ee4c099f685a6e42438fa0de15ca29be
Author: Caolán McNamara <caolanm at redhat.com>
Date: Wed Aug 27 14:04:53 2014 +0100
duplicate with getBestTextEncodingFromLocale
Change-Id: I73a69fdfee0b0f3af5bf6b4e52629dba7ed69630
diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx
index 3395868..6b4ec95 100644
--- a/sw/source/filter/ww8/ww8par.cxx
+++ b/sw/source/filter/ww8/ww8par.cxx
@@ -2866,15 +2866,8 @@ rtl_TextEncoding SwWW8ImplReader::GetCurrentCJKCharSet()
(const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE);
if (pLang)
{
- switch (pLang->GetLanguage())
- {
- case LANGUAGE_CZECH:
- eSrcCharSet = RTL_TEXTENCODING_MS_1250;
- break;
- default:
- eSrcCharSet = RTL_TEXTENCODING_MS_1252;
- break;
- }
+ ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(pLang->GetLanguage()));
+ eSrcCharSet = msfilter::util::getBestTextEncodingFromLocale(aLocale);
}
}
}
More information about the Libreoffice-commits mailing list