[Libreoffice-commits] .: detect-charset.patch

Libreoffice Gerrit user logerrit at kemper.freedesktop.org
Mon Jan 21 06:03:10 PST 2013


 detect-charset.patch |  237 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 171 insertions(+), 66 deletions(-)

New commits:
commit 6ccfa769ce735728cae0c6ad32b195b62f6ac00f
Author: Fridrich Štrba <fridrich.strba at bluewin.ch>
Date:   Mon Jan 21 15:00:43 2013 +0100

    Adapt the charset detection patch to ICU

diff --git a/detect-charset.patch b/detect-charset.patch
index 2e7e9c5..ada0767 100644
--- a/detect-charset.patch
+++ b/detect-charset.patch
@@ -1,79 +1,184 @@
-diff --git a/src/lib/CDRParser.cpp b/src/lib/CDRParser.cpp
-index a4e7b17..80a07f6 100644
---- a/src/lib/CDRParser.cpp
-+++ b/src/lib/CDRParser.cpp
-@@ -43,10 +43,18 @@
- #define DUMP_PREVIEW_IMAGE 0
- #endif
+From 44d988e5df8a782705ebe6a477b5ae1b173418bf Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Fridrich=20=C5=A0trba?= <fridrich.strba at bluewin.ch>
+Date: Mon, 21 Jan 2013 14:58:31 +0100
+Subject: [PATCH] Use ICU to guess encoding
+
+---
+ configure.ac             | 16 +++++++++
+ src/lib/Makefile.am      |  4 +--
+ src/lib/libcdr_utils.cpp | 86 ++++++++++++++++++++++++++++++++++++++++++++++++
+ src/lib/libcdr_utils.h   |  4 +++
+ 4 files changed, 108 insertions(+), 2 deletions(-)
+
+diff --git a/configure.ac b/configure.ac
+index 1e32311..e5619cf 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -62,6 +62,22 @@ PKG_CHECK_MODULES([ZLIB],[zlib],[],[
+ AC_SUBST(ZLIB_CFLAGS)
+ AC_SUBST(ZLIB_LIBS)
  
-+#ifndef GUESS_CHARACTER_ENCODING
-+#define GUESS_CHARACTER_ENCODING 1
-+#endif
++# ========
++# Find icu
++# ========
++AC_PATH_PROG([ICU_CONFIG],[icu-config])
++AC_MSG_CHECKING([ICU installation])
++if ${ICU_CONFIG} --cflags >/dev/null 2>&1; then
++	ICU_CFLAGS=`${ICU_CONFIG} --cppflags-searchpath`
++	ICU_LIBS=`${ICU_CONFIG} --ldflags`
++	AC_MSG_RESULT([found])
++else
++	AC_MSG_ERROR([libicu config program icu-config not found])
++fi
++AC_SUBST(ICU_CFLAGS)
++AC_SUBST(ICU_LIBS)
 +
- #ifndef M_PI
- #define M_PI 3.14159265358979323846
- #endif
- 
-+#if GUESS_CHARACTER_ENCODING
-+#include <libcharguess/universal.h>
-+#endif
 +
+ # =================================
+ # Libtool/Version Makefile settings
+ # =================================
+diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
+index 7255d40..bff4ce3 100644
+--- a/src/lib/Makefile.am
++++ b/src/lib/Makefile.am
+@@ -12,9 +12,9 @@ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_include_HEADERS = \
+ 	CDRStringVector.h \
+ 	CMXDocument.h
+ 
+-AM_CXXFLAGS = $(LIBCDR_CXXFLAGS) $(ZLIB_CFLAGS) $(DEBUG_CXXFLAGS)
++AM_CXXFLAGS = $(LIBCDR_CXXFLAGS) $(ZLIB_CFLAGS) $(ICU_CFLAGS) $(DEBUG_CXXFLAGS)
+ 
+-libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD  = $(LIBCDR_LIBS) $(ZLIB_LIBS) @LIBCDR_WIN32_RESOURCE@
++libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD  = $(LIBCDR_LIBS) $(ZLIB_LIBS) $(ICU_LIBS) @LIBCDR_WIN32_RESOURCE@
+ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_DEPENDENCIES = @LIBCDR_WIN32_RESOURCE@
+ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LDFLAGS = $(version_info) -export-dynamic -no-undefined
+ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_SOURCES = \
+diff --git a/src/lib/libcdr_utils.cpp b/src/lib/libcdr_utils.cpp
+index ef94582..28162e3 100644
+--- a/src/lib/libcdr_utils.cpp
++++ b/src/lib/libcdr_utils.cpp
+@@ -27,6 +27,8 @@
+  * instead of those above.
+  */
+ 
++#include <string.h>
++#include <unicode/ucsdet.h>
+ #include "libcdr_utils.h"
+ 
+ #define CDR_NUM_ELEMENTS(array) sizeof(array)/sizeof(array[0])
+@@ -36,6 +38,86 @@
  namespace
  {
  
-@@ -2288,6 +2296,16 @@ void libcdr::CDRParser::readStlt(WPXInputStream *input, unsigned length)
-   }
- }
- 
-+#if GUESS_CHARACTER_ENCODING
-+static const char *getEncoding(const unsigned char *buffer, unsigned bufferLength)
++static unsigned short getEncodingFromICUName(const char *name)
 +{
-+  UniversalDetector detector;
-+  detector.HandleData((const char *)buffer, bufferLength);
-+  detector.DataEnd();
-+  return detector.GetCharset();
++  // ANSI
++  if (strcmp(name, "ISO-8859-1") == 0)
++    return 0;
++  if (strcmp(name, "windows-1252") == 0)
++    return 0;
++  // CENTRAL EUROPE
++  if (strcmp(name, "ISO-8859-2") == 0)
++    return 0xee;
++  if (strcmp(name, "windows-1250") == 0)
++    return 0xee;
++  // RUSSIAN
++  if (strcmp(name, "ISO-8859-5") == 0)
++    return 0xcc;
++  if (strcmp(name, "windows-1251") == 0)
++    return 0xcc;
++  if (strcmp(name, "KOI8-R") == 0)
++    return 0xcc;
++  // ARABIC
++  if (strcmp(name, "ISO-8859-6") == 0)
++    return 0xb2;
++  if (strcmp(name, "windows-1256") == 0)
++    return 0xb2;
++  // TURKISH
++  if (strcmp(name, "ISO-8859-9") == 0)
++    return 0xa2;
++  if (strcmp(name, "windows-1254") == 0)
++    return 0xa2;
++  // GREEK
++  if (strcmp(name, "ISO-8859-7") == 0)
++    return 0xa1;
++  if (strcmp(name, "windows-1253") == 0)
++    return 0xa1;
++  // HEBREW
++  if (strcmp(name, "ISO-8859-8") == 0)
++    return 0xb1;
++  if (strcmp(name, "windows-1255") == 0)
++    return 0xb1;
++
++  return 0;
 +}
-+#endif
 +
- void libcdr::CDRParser::readTxsm(WPXInputStream *input, unsigned length)
- {
-   if (m_version < 700)
-@@ -2410,7 +2428,12 @@ void libcdr::CDRParser::readTxsm(WPXInputStream *input, unsigned length)
-         if (tmpCharDescription & 0x01)
-           appendCharacters(text, tmpTextData);
-         else
-+        {
-+#if GUESS_CHARACTER_ENCODING
-+          CDR_DEBUG_MSG(("CDRParser::readTxsm - Detected chunk encoding %s\n", getEncoding(&tmpTextData[0], (unsigned)tmpTextData.size())));
-+#endif
-           appendCharacters(text, tmpTextData, charStyles[(tmpCharDescription >> 16) & 0xff].m_charSet);
-+        }
-       }
-       tmpTextData.clear();
-       tmpCharDescription = (uint32_t)(charDescriptions[i] & 0xffffff);
-@@ -2424,7 +2447,12 @@ void libcdr::CDRParser::readTxsm(WPXInputStream *input, unsigned length)
-     if (tmpCharDescription & 0x01)
-       appendCharacters(text, tmpTextData);
-     else
++
++static unsigned short getEncoding(const unsigned char *buffer, unsigned bufferLength)
++{
++  UErrorCode status = U_ZERO_ERROR;
++  UCharsetDetector *csd = 0;
++  const UCharsetMatch *csm = 0;
++  try
++  {
++    csd = ucsdet_open(&status);
++    if (U_FAILURE(status))
++      throw libcdr::EncodingException();
++    ucsdet_setText(csd, (const char *)buffer, bufferLength, &status);
++    if (U_FAILURE(status))
++      throw libcdr::EncodingException();
++    ucsdet_enableInputFilter(csd, TRUE);
++    csm = ucsdet_detect(csd, &status);
++    if (U_FAILURE(status))
++      throw libcdr::EncodingException();
++    const char *name = ucsdet_getName(csm, &status);
++    if (U_FAILURE(status))
++      throw libcdr::EncodingException();
++    if (name)
 +    {
-+#if GUESS_CHARACTER_ENCODING
-+      CDR_DEBUG_MSG(("CDRParser::readTxsm - Detected chunk encoding %s\n", getEncoding(&tmpTextData[0], (unsigned)tmpTextData.size())));
-+#endif
-       appendCharacters(text, tmpTextData, charStyles[(tmpCharDescription >> 16) & 0xff].m_charSet);
++      unsigned short encoding = getEncodingFromICUName(name);
++      ucsdet_close(csd);
++      return encoding;
 +    }
-   }
-   tmpTextData.clear();
-   CDR_DEBUG_MSG(("CDRParser::readTxsm - Text: %s\n", text.cstr()));
-diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
-index 7ca2ecb..c27eda5 100644
---- a/src/lib/Makefile.am
-+++ b/src/lib/Makefile.am
-@@ -14,7 +14,7 @@ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_include_HEADERS = \
++    ucsdet_close(csd);
++    return 0;
++  }
++  catch (const libcdr::EncodingException &)
++  {
++    ucsdet_close(csd);
++    return 0;
++  }
++}
++
+ static void _appendUCS4(WPXString &text, unsigned ucs4Character)
+ {
+   unsigned char first;
+@@ -450,6 +532,10 @@ void libcdr::appendCharacters(WPXString &text, std::vector<unsigned char> charac
+     0x0111, 0x00F1, 0x0323, 0x00F3, 0x00F4, 0x01A1, 0x00F6, 0x00F7,
+     0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x01B0, 0x20AB, 0x00FF
+   };
++
++  if (!charset && characters.size())
++    charset = getEncoding(&characters[0], characters.size());
++
+   for (std::vector<unsigned char>::const_iterator iter = characters.begin();
+        iter != characters.end(); ++iter)
+   {
+diff --git a/src/lib/libcdr_utils.h b/src/lib/libcdr_utils.h
+index 320891a..5958b75 100644
+--- a/src/lib/libcdr_utils.h
++++ b/src/lib/libcdr_utils.h
+@@ -133,6 +133,10 @@ class UnknownPrecisionException
+ {
+ };
  
- AM_CXXFLAGS = $(LIBCDR_CXXFLAGS) $(ZLIB_CFLAGS) $(DEBUG_CXXFLAGS)
++class EncodingException
++{
++};
++
+ } // namespace libcdr
  
--libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD  = $(LIBCDR_LIBS) $(ZLIB_LIBS) @LIBCDR_WIN32_RESOURCE@
-+libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD  = $(LIBCDR_LIBS) $(ZLIB_LIBS) @LIBCDR_WIN32_RESOURCE@ -lcharguess
- libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_DEPENDENCIES = @LIBCDR_WIN32_RESOURCE@
- libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LDFLAGS = $(version_info) -export-dynamic -no-undefined
- libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_SOURCES = \
+ #endif // __LIBCDR_UTILS_H__
+-- 
+1.8.1.1
+


More information about the Libreoffice-commits mailing list