[Libreoffice-commits] .: src/lib

Libreoffice Gerrit user logerrit at kemper.freedesktop.org
Fri Jan 18 00:00:27 PST 2013


 src/lib/MSPUBCollector.cpp |  103 +++++++++++++++++++++++---
 src/lib/MSPUBCollector.h   |   11 ++
 src/lib/MSPUBParser.cpp    |    2 
 src/lib/MSPUBParser97.cpp  |    2 
 src/lib/libmspub_utils.cpp |  178 ++++++++++++++-------------------------------
 src/lib/libmspub_utils.h   |    8 --
 6 files changed, 163 insertions(+), 141 deletions(-)

New commits:
commit 17f68425119bb587ca8db474beb34884511b9a12
Author: Brennan T. Vincent <brennanv at email.arizona.edu>
Date:   Fri Jan 18 00:55:25 2013 -0700

    Autodetect character set for pre-unicode MSPUB versions (still need to test for languages other than Russian, but appears to be working)

diff --git a/src/lib/MSPUBCollector.cpp b/src/lib/MSPUBCollector.cpp
index 635c049..5653df8 100644
--- a/src/lib/MSPUBCollector.cpp
+++ b/src/lib/MSPUBCollector.cpp
@@ -13,7 +13,7 @@
  * License.
  *
  * Major Contributor(s):
- * Copyright (C) 2012 Brennan Vincent <brennanv at email.arizona.edu>
+ * Copyright (C) 2012-2013 Brennan Vincent <brennanv at email.arizona.edu>
  * Copyright (C) 2012 Fridrich Strba <fridrich.strba at bluewin.ch>
  *
  *
@@ -29,12 +29,16 @@
  */
 
 #include <math.h>
+
+#include <unicode/ucsdet.h>
+
 #include "MSPUBCollector.h"
 #include "libmspub_utils.h"
 #include "MSPUBConstants.h"
 #include "MSPUBTypes.h"
 #include "PolygonUtils.h"
 #include "Coordinate.h"
+
 #pragma GCC diagnostic ignored "-Wpragmas"
 #pragma GCC diagnostic ignored "-Wuninitialized"
 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
@@ -169,8 +173,10 @@ libmspub::MSPUBCollector::MSPUBCollector(libwpg::WPGPaintInterface *painter) :
   m_shapeInfosBySeqNum(), m_masterPages(),
   m_shapesWithCoordinatesRotated90(),
   m_masterPagesByPageSeqNum(),
-  m_encoding(), m_tableCellTextEndsVector(), m_stringOffsetsByTextId(),
-  m_calculationValuesSeen(), m_pageSeqNumsOrdered()
+  m_tableCellTextEndsVector(), m_stringOffsetsByTextId(),
+  m_calculationValuesSeen(), m_pageSeqNumsOrdered(),
+  m_encodingHeuristic(false), m_allText(),
+  m_calculatedEncoding()
 {
 }
 
@@ -186,9 +192,9 @@ void libmspub::MSPUBCollector::setNextTableCellTextEnds(
   m_tableCellTextEndsVector.push_back(ends);
 }
 
-void libmspub::MSPUBCollector::setEncoding(Encoding encoding)
+void libmspub::MSPUBCollector::useEncodingHeuristic() // Switches the collector to detecting the text encoding from the collected text instead of assuming Unicode.
 {
-  m_encoding = encoding;
+  m_encodingHeuristic = true; // read by addTextString() (to gather sample bytes) and by getCalculatedEncoding()
 }
 
 void libmspub::MSPUBCollector::setShapeShadow(unsigned seqNum, const Shadow &shadow)
@@ -784,7 +790,7 @@ boost::function<void(void)> libmspub::MSPUBCollector::paintShape(const ShapeInfo
       {
         WPXString textString;
         appendCharacters(textString, text[i_lines].spans[i_spans].chars,
-                         m_encoding.get_value_or(UTF_16));
+                         getCalculatedEncoding());
         WPXPropertyList charProps = getCharStyleProps(text[i_lines].spans[i_spans].style, text[i_lines].style.m_defaultCharStyleIndex);
         m_painter->startTextSpan(charProps);
         m_painter->insertText(textString);
@@ -801,6 +807,68 @@ boost::function<void(void)> libmspub::MSPUBCollector::paintShape(const ShapeInfo
   return &no_op;
 }
 
+const char *libmspub::MSPUBCollector::getCalculatedEncoding() const // Returns the encoding name used to decode stored text: the cached value if present, "UTF-16LE" when the heuristic is off, else the first ICU match that maps to a Windows charset, else "windows-1252".
+{
+  if (m_calculatedEncoding.is_initialized()) // an earlier call already stored an answer
+  {
+    return m_calculatedEncoding.get();
+  }
+  // modern versions are somewhat sane and use Unicode
+  if (! m_encodingHeuristic) // m_encodingHeuristic is only set by useEncodingHeuristic()
+  {
+    m_calculatedEncoding = "UTF-16LE"; // cached so later calls take the early return above
+    return m_calculatedEncoding.get();
+  }
+  // for older versions of PUB, see if we can get ICU to tell us the encoding.
+  UErrorCode status = U_ZERO_ERROR;
+  UCharsetDetector *ucd = NULL;
+  const UCharsetMatch **matches = NULL;
+  const UCharsetMatch *ucm = NULL;
+  ucd = ucsdet_open(&status); // opened before any goto so csd_fail can always call ucsdet_close(ucd)
+  int matchesFound = -1;
+  const char *name = NULL;
+  const char *windowsName = NULL; // all locals are declared ahead of the first goto; C++ forbids jumping over an initialised declaration
+  if (m_allText.empty()) // no sample text was collected; this also keeps &m_allText[0] below off an empty vector
+  {
+    goto csd_fail;
+  }
+  if (U_FAILURE(status)) // ucsdet_open reported an error
+  {
+    goto csd_fail;
+  }
+  // don't worry, the below call doesn't require a null-terminated string.
+  ucsdet_setText(ucd, (const char *)(&m_allText[0]), m_allText.size(), &status); // hands the detector every byte gathered by ponderStringEncoding()
+  if (U_FAILURE(status))
+  {
+    goto csd_fail;
+  }
+  matches = ucsdet_detectAll(ucd, &matchesFound, &status); // matchesFound is used below as the number of entries in matches
+  if (U_FAILURE(status))
+  {
+    goto csd_fail;
+  }
+  //find best fit that is an actual Windows encoding
+  for (int i = 0; i < matchesFound; ++i) // presumably matches is ordered best-first (see ICU ucsdet_detectAll) -- confirm
+  {
+    ucm = matches[i];
+    name = ucsdet_getName(ucm, &status);
+    if (U_FAILURE(status))
+    {
+      goto csd_fail;
+    }
+    windowsName = windowsCharsetNameByOriginalCharset(name); // NULL when name has no entry in that mapping
+    if (windowsName)
+    {
+      m_calculatedEncoding = windowsName; // cache the first match that maps to a Windows charset
+      ucsdet_close(ucd);
+      return windowsName;
+    }
+  }
+csd_fail: // shared exit for every failure above, and for the case where no match mapped to a Windows name
+  ucsdet_close(ucd);
+  return "windows-1252"; // Pretty likely to give garbage text, but it's the best we can do. NOTE(review): this fallback is not stored in m_calculatedEncoding, so the next call runs the detection again.
+}
+
 void libmspub::MSPUBCollector::setShapeLineBackColor(unsigned shapeSeqNum,
     ColorReference backColor)
 {
@@ -1142,7 +1210,7 @@ WPXPropertyList libmspub::MSPUBCollector::getCharStyleProps(const CharacterStyle
   {
     WPXString str;
     appendCharacters(str, m_fonts[style.fontIndex.get()],
-                     m_encoding.get_value_or(UTF_16));
+                     getCalculatedEncoding());
     ret.insert("style:font-name", str);
   }
   else if (defaultCharStyle.fontIndex.is_initialized() &&
@@ -1150,14 +1218,14 @@ WPXPropertyList libmspub::MSPUBCollector::getCharStyleProps(const CharacterStyle
   {
     WPXString str;
     appendCharacters(str, m_fonts[defaultCharStyle.fontIndex.get()],
-                     m_encoding.get_value_or(UTF_16));
+                     getCalculatedEncoding());
     ret.insert("style:font-name", str);
   }
   else if (!m_fonts.empty())
   {
     WPXString str;
     appendCharacters(str, m_fonts[0],
-                     m_encoding.get_value_or(UTF_16));
+                     getCalculatedEncoding());
     ret.insert("style:font-name", str);
   }
   switch (style.superSubType)
@@ -1325,9 +1393,26 @@ bool libmspub::MSPUBCollector::addTextString(const std::vector<TextParagraph> &s
 {
   MSPUB_DEBUG_MSG(("addTextString, id: 0x%x\n", id));
   m_textStringsById[id] = str;
+  if (m_encodingHeuristic)
+  {
+    ponderStringEncoding(str);
+  }
   return true; //FIXME: Warn if the string already existed in the map.
 }
 
+void libmspub::MSPUBCollector::ponderStringEncoding( // Appends the raw bytes of every span of every paragraph in str to m_allText.
+  const std::vector<TextParagraph> &str)
+{
+  for (unsigned i = 0; i < str.size(); ++i) // each paragraph
+  {
+    for (unsigned j = 0; j < str[i].spans.size(); ++j) // each span of that paragraph
+    {
+      const std::vector<unsigned char> &chars = str[i].spans[j].chars;
+      m_allText.insert(m_allText.end(), chars.begin(), chars.end()); // m_allText is the sample buffer getCalculatedEncoding() passes to the ICU detector
+    }
+  }
+}
+
 void libmspub::MSPUBCollector::setWidthInEmu(unsigned long widthInEmu)
 {
   //FIXME: Warn if this is called twice
diff --git a/src/lib/MSPUBCollector.h b/src/lib/MSPUBCollector.h
index be39b0d..03e7dc5 100644
--- a/src/lib/MSPUBCollector.h
+++ b/src/lib/MSPUBCollector.h
@@ -13,7 +13,7 @@
  * License.
  *
  * Major Contributor(s):
- * Copyright (C) 2012 Brennan Vincent <brennanv at email.arizona.edu>
+ * Copyright (C) 2012-2013 Brennan Vincent <brennanv at email.arizona.edu>
  * Copyright (C) 2012 Fridrich Strba <fridrich.strba at bluewin.ch>
  *
  * All Rights Reserved.
@@ -135,7 +135,8 @@ public:
   void addDefaultParagraphStyle(const ParagraphStyle &style);
   void addPaletteColor(Color);
   bool setCurrentGroupSeqNum(unsigned seqNum);
-  void setEncoding(Encoding encoding);
+
+  void useEncodingHeuristic();
 
   void setNextTableCellTextEnds(const std::vector<unsigned> &ends);
   void setTextStringOffset(unsigned textId, unsigned offset);
@@ -181,11 +182,13 @@ private:
   std::set<unsigned> m_masterPages;
   std::set<unsigned> m_shapesWithCoordinatesRotated90;
   std::map<unsigned, unsigned> m_masterPagesByPageSeqNum;
-  boost::optional<Encoding> m_encoding;
   std::vector<std::vector<unsigned> > m_tableCellTextEndsVector;
   std::map<unsigned, unsigned> m_stringOffsetsByTextId;
   mutable std::vector<bool> m_calculationValuesSeen;
   std::vector<unsigned> m_pageSeqNumsOrdered;
+  bool m_encodingHeuristic;
+  std::vector<unsigned char> m_allText;
+  mutable boost::optional<const char *> m_calculatedEncoding;
   // helper functions
   std::vector<int> getShapeAdjustValues(const ShapeInfo &info) const;
   boost::optional<unsigned> getMasterPageSeqNum(unsigned pageSeqNum) const;
@@ -208,6 +211,8 @@ private:
   WPXPropertyList getCharStyleProps(const CharacterStyle &, boost::optional<unsigned> defaultCharStyleIndex) const;
   WPXPropertyList getParaStyleProps(const ParagraphStyle &, boost::optional<unsigned> defaultParaStyleIndex) const;
   double getSpecialValue(const ShapeInfo &info, const CustomShape &shape, int arg, const std::vector<int> &adjustValues) const;
+  void ponderStringEncoding(const std::vector<TextParagraph> &str);
+  const char *getCalculatedEncoding() const;
 public:
   static WPXString getColorString(const Color &);
 };
diff --git a/src/lib/MSPUBParser.cpp b/src/lib/MSPUBParser.cpp
index 6d1a04c..548dd48 100644
--- a/src/lib/MSPUBParser.cpp
+++ b/src/lib/MSPUBParser.cpp
@@ -488,7 +488,7 @@ bool libmspub::MSPUBParser::parseFontChunk(
             if (subSubInfo.id == EMBEDDED_FONT_NAME)
             {
               name = WPXString();
-              appendCharacters(name.get(), subSubInfo.stringData, UTF_16);
+              appendCharacters(name.get(), subSubInfo.stringData, "UTF-16");
             }
             else if (subSubInfo.id == EMBEDDED_EOT)
             {
diff --git a/src/lib/MSPUBParser97.cpp b/src/lib/MSPUBParser97.cpp
index e4ce736..8b8efd9 100644
--- a/src/lib/MSPUBParser97.cpp
+++ b/src/lib/MSPUBParser97.cpp
@@ -34,7 +34,7 @@
 libmspub::MSPUBParser97::MSPUBParser97(WPXInputStream *input, MSPUBCollector *collector)
   : MSPUBParser2k(input, collector), m_isBanner(false)
 {
-  m_collector->setEncoding(WIN_1252);
+  m_collector->useEncodingHeuristic(); // ask the collector to detect the text encoding rather than use a fixed one
 }
 
 unsigned short libmspub::MSPUBParser97::getTextMarker() const
diff --git a/src/lib/libmspub_utils.cpp b/src/lib/libmspub_utils.cpp
index ce10f5d..17c3d5c 100644
--- a/src/lib/libmspub_utils.cpp
+++ b/src/lib/libmspub_utils.cpp
@@ -29,9 +29,14 @@
  * instead of those above.
  */
 
+#include <unicode/ucnv.h>
+#include <unicode/utypes.h>
+
 #include <string.h> // for memcpy
 #include <math.h>
 #include <zlib.h>
+#include <cstring>
+
 #include "libmspub_utils.h"
 
 #ifndef M_PI
@@ -40,6 +45,40 @@
 
 #define ZLIB_CHUNK 16384
 
+using std::strcmp;
+const char *libmspub::windowsCharsetNameByOriginalCharset(const char *name) // Maps a charset name to a Windows charset name; returns NULL when name is not one of the seven names compared below. name is passed straight to strcmp, so it must not be NULL.
+{
+  if (strcmp(name, "Shift_JIS") == 0)
+  {
+    return "windows-932";
+  }
+  if (strcmp(name, "GB18030") == 0)
+  {
+    return "windows-936";
+  }
+  if (strcmp(name, "Big5") == 0)
+  {
+    return "windows-950";
+  }
+  if (strcmp(name, "ISO-8859-1") == 0) // an ISO-8859-1 result is decoded as windows-1252
+  {
+    return "windows-1252";
+  }
+  if (strcmp(name, "ISO-8859-2") == 0) // an ISO-8859-2 result is decoded as windows-1250
+  {
+    return "windows-1250";
+  }
+  if (strcmp(name, "windows-1251") == 0) // already a Windows name; returned unchanged
+  {
+    return "windows-1251";
+  }
+  if (strcmp(name, "windows-1256") == 0) // already a Windows name; returned unchanged
+  {
+    return "windows-1256";
+  }
+  return NULL; // no mapping; getCalculatedEncoding() then moves on to the next detector match
+}
+
 const char *libmspub::mimeByImgType(ImgType type)
 {
   switch (type)
@@ -182,69 +221,6 @@ WPXBinaryData libmspub::inflateData(WPXBinaryData deflated)
 namespace
 {
 
-static uint32_t _win1252ToUCS4(unsigned char win1252Character)
-{
-  switch (win1252Character)
-  {
-  case 0x80:
-    return 0x20AC;
-  case 0x82:
-    return 0x201A;
-  case 0x83:
-    return 0x0192;
-  case 0x84:
-    return 0x201E;
-  case 0x85:
-    return 0x2026;
-  case 0x86:
-    return 0x2020;
-  case 0x87:
-    return 0x2021;
-  case 0x88:
-    return 0x02C6;
-  case 0x89:
-    return 0x2030;
-  case 0x8A:
-    return 0x0160;
-  case 0x8B:
-    return 0x2039;
-  case 0x8C:
-    return 0x0152;
-  case 0x8E:
-    return 0x017D;
-  case 0x91:
-    return 0x2018;
-  case 0x92:
-    return 0x2019;
-  case 0x93:
-    return 0x201C;
-  case 0x94:
-    return 0x201D;
-  case 0x95:
-    return 0x2022;
-  case 0x96:
-    return 0x2013;
-  case 0x97:
-    return 0x2014;
-  case 0x98:
-    return 0x02DC;
-  case 0x99:
-    return 0x2122;
-  case 0x9A:
-    return 0x0161;
-  case 0x9B:
-    return 0x203A;
-  case 0x9C:
-    return 0x0153;
-  case 0x9E:
-    return 0x017E;
-  case 0x9F:
-    return 0x0178;
-  default:
-    return win1252Character;
-  }
-}
-
 static void _appendUCS4(WPXString &text, unsigned ucs4Character)
 {
   unsigned char first;
@@ -388,71 +364,31 @@ void libmspub::readNBytes(WPXInputStream *input, unsigned long length, std::vect
 
 #define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
 
+
 void libmspub::appendCharacters(WPXString &text, const std::vector<unsigned char> characters,
-                                Encoding encoding)
+                                const char *encoding) // encoding: converter name handed to ucnv_open(); the bytes in characters are decoded with it and appended to text
 {
-  switch (encoding)
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter *conv = NULL;
+  conv = ucnv_open(encoding, &status);
+  if (U_SUCCESS(status)) // when the converter cannot be opened, nothing is appended to text
   {
-  case UTF_16:
-    for (std::vector<unsigned char>::const_iterator iter = characters.begin();
-         iter != characters.end();)
+    // ICU documentation claims that character-by-character processing is faster "for small amounts of data" and "'normal' charsets"
+    // (in any case, it is more convenient :) )
+    const char *src = (const char *)&characters[0]; // NOTE(review): characters[0] is evaluated even when characters is empty -- confirm callers never pass an empty vector
+    const char *srcLimit = (const char *)src + characters.size();
+    while (src < srcLimit)
     {
-      uint16_t high_surrogate = 0;
-      bool fail = false;
-      uint32_t ucs4Character = 0;
-      while (true)
+      uint32_t ucs4Character = (uint32_t)ucnv_getNextUChar(conv, &src, srcLimit, &status); // src is passed by address; the loop relies on this call advancing it toward srcLimit
+      if (U_SUCCESS(status)) // a code point is appended only when this decode succeeded. NOTE(review): status is never reset and a failure does not leave the loop -- confirm it still terminates if src has not reached srcLimit
       {
-        if (iter == characters.end())
-        {
-          fail = true;
-          break;
-        }
-        uint16_t character = *iter++;
-        character |= (uint16_t)(*iter++) << 8;
-        if (character >= 0xdc00 && character < 0xe000) /* low surrogate */
-        {
-          if (high_surrogate)
-          {
-            ucs4Character = SURROGATE_VALUE(high_surrogate, character);
-            high_surrogate = 0;
-            break;
-          }
-          else
-          {
-            fail = true;
-            break;
-          }
-        }
-        else
-        {
-          if (high_surrogate)
-          {
-            fail = true;
-            break;
-          }
-          if (character >= 0xd800 && character < 0xdc00) /* high surrogate */
-            high_surrogate = character;
-          else
-          {
-            ucs4Character = character;
-            break;
-          }
-        }
+        _appendUCS4(text, ucs4Character);
       }
-      if (fail)
-        throw libmspub::GenericException();
-
-      _appendUCS4(text, ucs4Character);
     }
-    break;
-  case WIN_1252:
-    for (std::vector<unsigned char>::const_iterator iter = characters.begin();
-         iter != characters.end(); ++iter)
-    {
-      uint32_t ucs4 = _win1252ToUCS4(*iter);
-      _appendUCS4(text, ucs4);
-    }
-    break;
+  }
+  if (conv) // conv stays NULL unless ucnv_open() returned a converter
+  {
+    ucnv_close(conv);
   }
 }
 
diff --git a/src/lib/libmspub_utils.h b/src/lib/libmspub_utils.h
index 9b902da..c2bfb0d 100644
--- a/src/lib/libmspub_utils.h
+++ b/src/lib/libmspub_utils.h
@@ -92,12 +92,8 @@ typedef unsigned __int64 uint64_t;
 
 namespace libmspub
 {
-enum Encoding
-{
-  UTF_16,
-  WIN_1252
-};
 const char *mimeByImgType(ImgType type);
+const char *windowsCharsetNameByOriginalCharset(const char *name);
 
 uint16_t readU16(const unsigned char *input, unsigned offset);
 uint32_t readU32(const unsigned char *input, unsigned offset);
@@ -113,7 +109,7 @@ double readFixedPoint(WPXInputStream *input);
 double toFixedPoint(int fp);
 void readNBytes(WPXInputStream *input, unsigned long length, std::vector<unsigned char> &out);
 
-void appendCharacters(WPXString &text, std::vector<unsigned char> characters, Encoding encoding);
+void appendCharacters(WPXString &text, std::vector<unsigned char> characters, const char *encoding);
 
 bool stillReading(WPXInputStream *input, unsigned long until);
 


More information about the Libreoffice-commits mailing list