[Libreoffice-commits] .: src/lib
Libreoffice Gerrit user
logerrit at kemper.freedesktop.org
Fri Jan 18 00:00:27 PST 2013
src/lib/MSPUBCollector.cpp | 103 +++++++++++++++++++++++---
src/lib/MSPUBCollector.h | 11 ++
src/lib/MSPUBParser.cpp | 2
src/lib/MSPUBParser97.cpp | 2
src/lib/libmspub_utils.cpp | 178 ++++++++++++++-------------------------------
src/lib/libmspub_utils.h | 8 --
6 files changed, 163 insertions(+), 141 deletions(-)
New commits:
commit 17f68425119bb587ca8db474beb34884511b9a12
Author: Brennan T. Vincent <brennanv at email.arizona.edu>
Date: Fri Jan 18 00:55:25 2013 -0700
Autodetect character set for pre-Unicode MSPUB versions (still needs testing for languages other than Russian, but appears to be working)
diff --git a/src/lib/MSPUBCollector.cpp b/src/lib/MSPUBCollector.cpp
index 635c049..5653df8 100644
--- a/src/lib/MSPUBCollector.cpp
+++ b/src/lib/MSPUBCollector.cpp
@@ -13,7 +13,7 @@
* License.
*
* Major Contributor(s):
- * Copyright (C) 2012 Brennan Vincent <brennanv at email.arizona.edu>
+ * Copyright (C) 2012-2013 Brennan Vincent <brennanv at email.arizona.edu>
* Copyright (C) 2012 Fridrich Strba <fridrich.strba at bluewin.ch>
*
*
@@ -29,12 +29,16 @@
*/
#include <math.h>
+
+#include <unicode/ucsdet.h>
+
#include "MSPUBCollector.h"
#include "libmspub_utils.h"
#include "MSPUBConstants.h"
#include "MSPUBTypes.h"
#include "PolygonUtils.h"
#include "Coordinate.h"
+
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
@@ -169,8 +173,10 @@ libmspub::MSPUBCollector::MSPUBCollector(libwpg::WPGPaintInterface *painter) :
m_shapeInfosBySeqNum(), m_masterPages(),
m_shapesWithCoordinatesRotated90(),
m_masterPagesByPageSeqNum(),
- m_encoding(), m_tableCellTextEndsVector(), m_stringOffsetsByTextId(),
- m_calculationValuesSeen(), m_pageSeqNumsOrdered()
+ m_tableCellTextEndsVector(), m_stringOffsetsByTextId(),
+ m_calculationValuesSeen(), m_pageSeqNumsOrdered(),
+ m_encodingHeuristic(false), m_allText(),
+ m_calculatedEncoding()
{
}
@@ -186,9 +192,9 @@ void libmspub::MSPUBCollector::setNextTableCellTextEnds(
m_tableCellTextEndsVector.push_back(ends);
}
-void libmspub::MSPUBCollector::setEncoding(Encoding encoding)
+void libmspub::MSPUBCollector::useEncodingHeuristic()
{
- m_encoding = encoding;
+ m_encodingHeuristic = true;
}
void libmspub::MSPUBCollector::setShapeShadow(unsigned seqNum, const Shadow &shadow)
@@ -784,7 +790,7 @@ boost::function<void(void)> libmspub::MSPUBCollector::paintShape(const ShapeInfo
{
WPXString textString;
appendCharacters(textString, text[i_lines].spans[i_spans].chars,
- m_encoding.get_value_or(UTF_16));
+ getCalculatedEncoding());
WPXPropertyList charProps = getCharStyleProps(text[i_lines].spans[i_spans].style, text[i_lines].style.m_defaultCharStyleIndex);
m_painter->startTextSpan(charProps);
m_painter->insertText(textString);
@@ -801,6 +807,68 @@ boost::function<void(void)> libmspub::MSPUBCollector::paintShape(const ShapeInfo
return &no_op;
}
+const char *libmspub::MSPUBCollector::getCalculatedEncoding() const
+{
+ if (m_calculatedEncoding.is_initialized())
+ {
+ return m_calculatedEncoding.get();
+ }
+ // modern versions are somewhat sane and use Unicode
+ if (! m_encodingHeuristic)
+ {
+ m_calculatedEncoding = "UTF-16LE";
+ return m_calculatedEncoding.get();
+ }
+ // for older versions of PUB, see if we can get ICU to tell us the encoding.
+ UErrorCode status = U_ZERO_ERROR;
+ UCharsetDetector *ucd = NULL;
+ const UCharsetMatch **matches = NULL;
+ const UCharsetMatch *ucm = NULL;
+ ucd = ucsdet_open(&status);
+ int matchesFound = -1;
+ const char *name = NULL;
+ const char *windowsName = NULL;
+ if (m_allText.empty())
+ {
+ goto csd_fail;
+ }
+ if (U_FAILURE(status))
+ {
+ goto csd_fail;
+ }
+ // don't worry, the below call doesn't require a null-terminated string.
+ ucsdet_setText(ucd, (const char *)(&m_allText[0]), m_allText.size(), &status);
+ if (U_FAILURE(status))
+ {
+ goto csd_fail;
+ }
+ matches = ucsdet_detectAll(ucd, &matchesFound, &status);
+ if (U_FAILURE(status))
+ {
+ goto csd_fail;
+ }
+ //find best fit that is an actual Windows encoding
+ for (int i = 0; i < matchesFound; ++i)
+ {
+ ucm = matches[i];
+ name = ucsdet_getName(ucm, &status);
+ if (U_FAILURE(status))
+ {
+ goto csd_fail;
+ }
+ windowsName = windowsCharsetNameByOriginalCharset(name);
+ if (windowsName)
+ {
+ m_calculatedEncoding = windowsName;
+ ucsdet_close(ucd);
+ return windowsName;
+ }
+ }
+csd_fail:
+ ucsdet_close(ucd);
+ return "windows-1252"; // Pretty likely to give garbage text, but it's the best we can do.
+}
+
void libmspub::MSPUBCollector::setShapeLineBackColor(unsigned shapeSeqNum,
ColorReference backColor)
{
@@ -1142,7 +1210,7 @@ WPXPropertyList libmspub::MSPUBCollector::getCharStyleProps(const CharacterStyle
{
WPXString str;
appendCharacters(str, m_fonts[style.fontIndex.get()],
- m_encoding.get_value_or(UTF_16));
+ getCalculatedEncoding());
ret.insert("style:font-name", str);
}
else if (defaultCharStyle.fontIndex.is_initialized() &&
@@ -1150,14 +1218,14 @@ WPXPropertyList libmspub::MSPUBCollector::getCharStyleProps(const CharacterStyle
{
WPXString str;
appendCharacters(str, m_fonts[defaultCharStyle.fontIndex.get()],
- m_encoding.get_value_or(UTF_16));
+ getCalculatedEncoding());
ret.insert("style:font-name", str);
}
else if (!m_fonts.empty())
{
WPXString str;
appendCharacters(str, m_fonts[0],
- m_encoding.get_value_or(UTF_16));
+ getCalculatedEncoding());
ret.insert("style:font-name", str);
}
switch (style.superSubType)
@@ -1325,9 +1393,26 @@ bool libmspub::MSPUBCollector::addTextString(const std::vector<TextParagraph> &s
{
MSPUB_DEBUG_MSG(("addTextString, id: 0x%x\n", id));
m_textStringsById[id] = str;
+ if (m_encodingHeuristic)
+ {
+ ponderStringEncoding(str);
+ }
return true; //FIXME: Warn if the string already existed in the map.
}
+void libmspub::MSPUBCollector::ponderStringEncoding(
+ const std::vector<TextParagraph> &str)
+{
+ for (unsigned i = 0; i < str.size(); ++i)
+ {
+ for (unsigned j = 0; j < str[i].spans.size(); ++j)
+ {
+ const std::vector<unsigned char> &chars = str[i].spans[j].chars;
+ m_allText.insert(m_allText.end(), chars.begin(), chars.end());
+ }
+ }
+}
+
void libmspub::MSPUBCollector::setWidthInEmu(unsigned long widthInEmu)
{
//FIXME: Warn if this is called twice
diff --git a/src/lib/MSPUBCollector.h b/src/lib/MSPUBCollector.h
index be39b0d..03e7dc5 100644
--- a/src/lib/MSPUBCollector.h
+++ b/src/lib/MSPUBCollector.h
@@ -13,7 +13,7 @@
* License.
*
* Major Contributor(s):
- * Copyright (C) 2012 Brennan Vincent <brennanv at email.arizona.edu>
+ * Copyright (C) 2012-2013 Brennan Vincent <brennanv at email.arizona.edu>
* Copyright (C) 2012 Fridrich Strba <fridrich.strba at bluewin.ch>
*
* All Rights Reserved.
@@ -135,7 +135,8 @@ public:
void addDefaultParagraphStyle(const ParagraphStyle &style);
void addPaletteColor(Color);
bool setCurrentGroupSeqNum(unsigned seqNum);
- void setEncoding(Encoding encoding);
+
+ void useEncodingHeuristic();
void setNextTableCellTextEnds(const std::vector<unsigned> &ends);
void setTextStringOffset(unsigned textId, unsigned offset);
@@ -181,11 +182,13 @@ private:
std::set<unsigned> m_masterPages;
std::set<unsigned> m_shapesWithCoordinatesRotated90;
std::map<unsigned, unsigned> m_masterPagesByPageSeqNum;
- boost::optional<Encoding> m_encoding;
std::vector<std::vector<unsigned> > m_tableCellTextEndsVector;
std::map<unsigned, unsigned> m_stringOffsetsByTextId;
mutable std::vector<bool> m_calculationValuesSeen;
std::vector<unsigned> m_pageSeqNumsOrdered;
+ bool m_encodingHeuristic;
+ std::vector<unsigned char> m_allText;
+ mutable boost::optional<const char *> m_calculatedEncoding;
// helper functions
std::vector<int> getShapeAdjustValues(const ShapeInfo &info) const;
boost::optional<unsigned> getMasterPageSeqNum(unsigned pageSeqNum) const;
@@ -208,6 +211,8 @@ private:
WPXPropertyList getCharStyleProps(const CharacterStyle &, boost::optional<unsigned> defaultCharStyleIndex) const;
WPXPropertyList getParaStyleProps(const ParagraphStyle &, boost::optional<unsigned> defaultParaStyleIndex) const;
double getSpecialValue(const ShapeInfo &info, const CustomShape &shape, int arg, const std::vector<int> &adjustValues) const;
+ void ponderStringEncoding(const std::vector<TextParagraph> &str);
+ const char *getCalculatedEncoding() const;
public:
static WPXString getColorString(const Color &);
};
diff --git a/src/lib/MSPUBParser.cpp b/src/lib/MSPUBParser.cpp
index 6d1a04c..548dd48 100644
--- a/src/lib/MSPUBParser.cpp
+++ b/src/lib/MSPUBParser.cpp
@@ -488,7 +488,7 @@ bool libmspub::MSPUBParser::parseFontChunk(
if (subSubInfo.id == EMBEDDED_FONT_NAME)
{
name = WPXString();
- appendCharacters(name.get(), subSubInfo.stringData, UTF_16);
+ appendCharacters(name.get(), subSubInfo.stringData, "UTF-16");
}
else if (subSubInfo.id == EMBEDDED_EOT)
{
diff --git a/src/lib/MSPUBParser97.cpp b/src/lib/MSPUBParser97.cpp
index e4ce736..8b8efd9 100644
--- a/src/lib/MSPUBParser97.cpp
+++ b/src/lib/MSPUBParser97.cpp
@@ -34,7 +34,7 @@
libmspub::MSPUBParser97::MSPUBParser97(WPXInputStream *input, MSPUBCollector *collector)
: MSPUBParser2k(input, collector), m_isBanner(false)
{
- m_collector->setEncoding(WIN_1252);
+ m_collector->useEncodingHeuristic();
}
unsigned short libmspub::MSPUBParser97::getTextMarker() const
diff --git a/src/lib/libmspub_utils.cpp b/src/lib/libmspub_utils.cpp
index ce10f5d..17c3d5c 100644
--- a/src/lib/libmspub_utils.cpp
+++ b/src/lib/libmspub_utils.cpp
@@ -29,9 +29,14 @@
* instead of those above.
*/
+#include <unicode/ucnv.h>
+#include <unicode/utypes.h>
+
#include <string.h> // for memcpy
#include <math.h>
#include <zlib.h>
+#include <cstring>
+
#include "libmspub_utils.h"
#ifndef M_PI
@@ -40,6 +45,40 @@
#define ZLIB_CHUNK 16384
+using std::strcmp;
+const char *libmspub::windowsCharsetNameByOriginalCharset(const char *name)
+{
+ if (strcmp(name, "Shift_JIS") == 0)
+ {
+ return "windows-932";
+ }
+ if (strcmp(name, "GB18030") == 0)
+ {
+ return "windows-936";
+ }
+ if (strcmp(name, "Big5") == 0)
+ {
+ return "windows-950";
+ }
+ if (strcmp(name, "ISO-8859-1") == 0)
+ {
+ return "windows-1252";
+ }
+ if (strcmp(name, "ISO-8859-2") == 0)
+ {
+ return "windows-1250";
+ }
+ if (strcmp(name, "windows-1251") == 0)
+ {
+ return "windows-1251";
+ }
+ if (strcmp(name, "windows-1256") == 0)
+ {
+ return "windows-1256";
+ }
+ return NULL;
+}
+
const char *libmspub::mimeByImgType(ImgType type)
{
switch (type)
@@ -182,69 +221,6 @@ WPXBinaryData libmspub::inflateData(WPXBinaryData deflated)
namespace
{
-static uint32_t _win1252ToUCS4(unsigned char win1252Character)
-{
- switch (win1252Character)
- {
- case 0x80:
- return 0x20AC;
- case 0x82:
- return 0x201A;
- case 0x83:
- return 0x0192;
- case 0x84:
- return 0x201E;
- case 0x85:
- return 0x2026;
- case 0x86:
- return 0x2020;
- case 0x87:
- return 0x2021;
- case 0x88:
- return 0x02C6;
- case 0x89:
- return 0x2030;
- case 0x8A:
- return 0x0160;
- case 0x8B:
- return 0x2039;
- case 0x8C:
- return 0x0152;
- case 0x8E:
- return 0x017D;
- case 0x91:
- return 0x2018;
- case 0x92:
- return 0x2019;
- case 0x93:
- return 0x201C;
- case 0x94:
- return 0x201D;
- case 0x95:
- return 0x2022;
- case 0x96:
- return 0x2013;
- case 0x97:
- return 0x2014;
- case 0x98:
- return 0x02DC;
- case 0x99:
- return 0x2122;
- case 0x9A:
- return 0x0161;
- case 0x9B:
- return 0x203A;
- case 0x9C:
- return 0x0153;
- case 0x9E:
- return 0x017E;
- case 0x9F:
- return 0x0178;
- default:
- return win1252Character;
- }
-}
-
static void _appendUCS4(WPXString &text, unsigned ucs4Character)
{
unsigned char first;
@@ -388,71 +364,31 @@ void libmspub::readNBytes(WPXInputStream *input, unsigned long length, std::vect
#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
+
void libmspub::appendCharacters(WPXString &text, const std::vector<unsigned char> characters,
- Encoding encoding)
+ const char *encoding)
{
- switch (encoding)
+ UErrorCode status = U_ZERO_ERROR;
+ UConverter *conv = NULL;
+ conv = ucnv_open(encoding, &status);
+ if (U_SUCCESS(status))
{
- case UTF_16:
- for (std::vector<unsigned char>::const_iterator iter = characters.begin();
- iter != characters.end();)
+ // ICU documentation claims that character-by-character processing is faster "for small amounts of data" and "'normal' charsets"
+ // (in any case, it is more convenient :) )
+ const char *src = (const char *)&characters[0];
+ const char *srcLimit = (const char *)src + characters.size();
+ while (src < srcLimit)
{
- uint16_t high_surrogate = 0;
- bool fail = false;
- uint32_t ucs4Character = 0;
- while (true)
+ uint32_t ucs4Character = (uint32_t)ucnv_getNextUChar(conv, &src, srcLimit, &status);
+ if (U_SUCCESS(status))
{
- if (iter == characters.end())
- {
- fail = true;
- break;
- }
- uint16_t character = *iter++;
- character |= (uint16_t)(*iter++) << 8;
- if (character >= 0xdc00 && character < 0xe000) /* low surrogate */
- {
- if (high_surrogate)
- {
- ucs4Character = SURROGATE_VALUE(high_surrogate, character);
- high_surrogate = 0;
- break;
- }
- else
- {
- fail = true;
- break;
- }
- }
- else
- {
- if (high_surrogate)
- {
- fail = true;
- break;
- }
- if (character >= 0xd800 && character < 0xdc00) /* high surrogate */
- high_surrogate = character;
- else
- {
- ucs4Character = character;
- break;
- }
- }
+ _appendUCS4(text, ucs4Character);
}
- if (fail)
- throw libmspub::GenericException();
-
- _appendUCS4(text, ucs4Character);
}
- break;
- case WIN_1252:
- for (std::vector<unsigned char>::const_iterator iter = characters.begin();
- iter != characters.end(); ++iter)
- {
- uint32_t ucs4 = _win1252ToUCS4(*iter);
- _appendUCS4(text, ucs4);
- }
- break;
+ }
+ if (conv)
+ {
+ ucnv_close(conv);
}
}
diff --git a/src/lib/libmspub_utils.h b/src/lib/libmspub_utils.h
index 9b902da..c2bfb0d 100644
--- a/src/lib/libmspub_utils.h
+++ b/src/lib/libmspub_utils.h
@@ -92,12 +92,8 @@ typedef unsigned __int64 uint64_t;
namespace libmspub
{
-enum Encoding
-{
- UTF_16,
- WIN_1252
-};
const char *mimeByImgType(ImgType type);
+const char *windowsCharsetNameByOriginalCharset(const char *name);
uint16_t readU16(const unsigned char *input, unsigned offset);
uint32_t readU32(const unsigned char *input, unsigned offset);
@@ -113,7 +109,7 @@ double readFixedPoint(WPXInputStream *input);
double toFixedPoint(int fp);
void readNBytes(WPXInputStream *input, unsigned long length, std::vector<unsigned char> &out);
-void appendCharacters(WPXString &text, std::vector<unsigned char> characters, Encoding encoding);
+void appendCharacters(WPXString &text, std::vector<unsigned char> characters, const char *encoding);
bool stillReading(WPXInputStream *input, unsigned long until);
More information about the Libreoffice-commits mailing list