[Libreoffice-commits] .: detect-charset.patch
Libreoffice Gerrit user
logerrit at kemper.freedesktop.org
Mon Jan 21 06:03:10 PST 2013
detect-charset.patch | 237 ++++++++++++++++++++++++++++++++++++---------------
1 file changed, 171 insertions(+), 66 deletions(-)
New commits:
commit 6ccfa769ce735728cae0c6ad32b195b62f6ac00f
Author: Fridrich Štrba <fridrich.strba at bluewin.ch>
Date: Mon Jan 21 15:00:43 2013 +0100
Adapt the charset detection patch to ICU
diff --git a/detect-charset.patch b/detect-charset.patch
index 2e7e9c5..ada0767 100644
--- a/detect-charset.patch
+++ b/detect-charset.patch
@@ -1,79 +1,184 @@
-diff --git a/src/lib/CDRParser.cpp b/src/lib/CDRParser.cpp
-index a4e7b17..80a07f6 100644
---- a/src/lib/CDRParser.cpp
-+++ b/src/lib/CDRParser.cpp
-@@ -43,10 +43,18 @@
- #define DUMP_PREVIEW_IMAGE 0
- #endif
+From 44d988e5df8a782705ebe6a477b5ae1b173418bf Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Fridrich=20=C5=A0trba?= <fridrich.strba at bluewin.ch>
+Date: Mon, 21 Jan 2013 14:58:31 +0100
+Subject: [PATCH] Use ICU to guess encoding
+
+---
+ configure.ac | 16 +++++++++
+ src/lib/Makefile.am | 4 +--
+ src/lib/libcdr_utils.cpp | 86 ++++++++++++++++++++++++++++++++++++++++++++++++
+ src/lib/libcdr_utils.h | 4 +++
+ 4 files changed, 108 insertions(+), 2 deletions(-)
+
+diff --git a/configure.ac b/configure.ac
+index 1e32311..e5619cf 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -62,6 +62,22 @@ PKG_CHECK_MODULES([ZLIB],[zlib],[],[
+ AC_SUBST(ZLIB_CFLAGS)
+ AC_SUBST(ZLIB_LIBS)
-+#ifndef GUESS_CHARACTER_ENCODING
-+#define GUESS_CHARACTER_ENCODING 1
-+#endif
++# ========
++# Find icu
++# ========
++AC_PATH_PROG([ICU_CONFIG],[icu-config])
++AC_MSG_CHECKING([ICU installation])
++if ${ICU_CONFIG} --cflags >/dev/null 2>&1; then
++ ICU_CFLAGS=`${ICU_CONFIG} --cppflags-searchpath`
++ ICU_LIBS=`${ICU_CONFIG} --ldflags`
++ AC_MSG_RESULT([found])
++else
++ AC_MSG_ERROR([libicu config program icu-config not found])
++fi
++AC_SUBST(ICU_CFLAGS)
++AC_SUBST(ICU_LIBS)
+
- #ifndef M_PI
- #define M_PI 3.14159265358979323846
- #endif
-
-+#if GUESS_CHARACTER_ENCODING
-+#include <libcharguess/universal.h>
-+#endif
+
+ # =================================
+ # Libtool/Version Makefile settings
+ # =================================
+diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
+index 7255d40..bff4ce3 100644
+--- a/src/lib/Makefile.am
++++ b/src/lib/Makefile.am
+@@ -12,9 +12,9 @@ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_include_HEADERS = \
+ CDRStringVector.h \
+ CMXDocument.h
+
+-AM_CXXFLAGS = $(LIBCDR_CXXFLAGS) $(ZLIB_CFLAGS) $(DEBUG_CXXFLAGS)
++AM_CXXFLAGS = $(LIBCDR_CXXFLAGS) $(ZLIB_CFLAGS) $(ICU_CFLAGS) $(DEBUG_CXXFLAGS)
+
+-libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD = $(LIBCDR_LIBS) $(ZLIB_LIBS) @LIBCDR_WIN32_RESOURCE@
++libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD = $(LIBCDR_LIBS) $(ZLIB_LIBS) $(ICU_LIBS) @LIBCDR_WIN32_RESOURCE@
+ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_DEPENDENCIES = @LIBCDR_WIN32_RESOURCE@
+ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LDFLAGS = $(version_info) -export-dynamic -no-undefined
+ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_SOURCES = \
+diff --git a/src/lib/libcdr_utils.cpp b/src/lib/libcdr_utils.cpp
+index ef94582..28162e3 100644
+--- a/src/lib/libcdr_utils.cpp
++++ b/src/lib/libcdr_utils.cpp
+@@ -27,6 +27,8 @@
+ * instead of those above.
+ */
+
++#include <string.h>
++#include <unicode/ucsdet.h>
+ #include "libcdr_utils.h"
+
+ #define CDR_NUM_ELEMENTS(array) sizeof(array)/sizeof(array[0])
+@@ -36,6 +38,86 @@
namespace
{
-@@ -2288,6 +2296,16 @@ void libcdr::CDRParser::readStlt(WPXInputStream *input, unsigned length)
- }
- }
-
-+#if GUESS_CHARACTER_ENCODING
-+static const char *getEncoding(const unsigned char *buffer, unsigned bufferLength)
++static unsigned short getEncodingFromICUName(const char *name)
+{
-+ UniversalDetector detector;
-+ detector.HandleData((const char *)buffer, bufferLength);
-+ detector.DataEnd();
-+ return detector.GetCharset();
++ // ANSI
++ if (strcmp(name, "ISO-8859-1") == 0)
++ return 0;
++ if (strcmp(name, "windows-1252") == 0)
++ return 0;
++ // CENTRAL EUROPE
++ if (strcmp(name, "ISO-8859-2") == 0)
++ return 0xee;
++ if (strcmp(name, "windows-1250") == 0)
++ return 0xee;
++ // RUSSIAN
++ if (strcmp(name, "ISO-8859-5") == 0)
++ return 0xcc;
++ if (strcmp(name, "windows-1251") == 0)
++ return 0xcc;
++ if (strcmp(name, "KOI8-R") == 0)
++ return 0xcc;
++ // ARABIC
++ if (strcmp(name, "ISO-8859-6") == 0)
++ return 0xb2;
++ if (strcmp(name, "windows-1256") == 0)
++ return 0xb2;
++ // TURKISH
++ if (strcmp(name, "ISO-8859-9") == 0)
++ return 0xa2;
++ if (strcmp(name, "windows-1254") == 0)
++ return 0xa2;
++ // GREEK
++ if (strcmp(name, "ISO-8859-7") == 0)
++ return 0xa1;
++ if (strcmp(name, "windows-1253") == 0)
++ return 0xa1;
++ // HEBREW
++ if (strcmp(name, "ISO-8859-8") == 0)
++ return 0xb1;
++ if (strcmp(name, "windows-1255") == 0)
++ return 0xb1;
++
++ return 0;
+}
-+#endif
+
- void libcdr::CDRParser::readTxsm(WPXInputStream *input, unsigned length)
- {
- if (m_version < 700)
-@@ -2410,7 +2428,12 @@ void libcdr::CDRParser::readTxsm(WPXInputStream *input, unsigned length)
- if (tmpCharDescription & 0x01)
- appendCharacters(text, tmpTextData);
- else
-+ {
-+#if GUESS_CHARACTER_ENCODING
-+ CDR_DEBUG_MSG(("CDRParser::readTxsm - Detected chunk encoding %s\n", getEncoding(&tmpTextData[0], (unsigned)tmpTextData.size())));
-+#endif
- appendCharacters(text, tmpTextData, charStyles[(tmpCharDescription >> 16) & 0xff].m_charSet);
-+ }
- }
- tmpTextData.clear();
- tmpCharDescription = (uint32_t)(charDescriptions[i] & 0xffffff);
-@@ -2424,7 +2447,12 @@ void libcdr::CDRParser::readTxsm(WPXInputStream *input, unsigned length)
- if (tmpCharDescription & 0x01)
- appendCharacters(text, tmpTextData);
- else
++
++static unsigned short getEncoding(const unsigned char *buffer, unsigned bufferLength)
++{
++ UErrorCode status = U_ZERO_ERROR;
++ UCharsetDetector *csd = 0;
++ const UCharsetMatch *csm = 0;
++ try
++ {
++ csd = ucsdet_open(&status);
++ if (U_FAILURE(status))
++ throw libcdr::EncodingException();
++ ucsdet_setText(csd, (const char *)buffer, bufferLength, &status);
++ if (U_FAILURE(status))
++ throw libcdr::EncodingException();
++ ucsdet_enableInputFilter(csd, TRUE);
++ csm = ucsdet_detect(csd, &status);
++ if (U_FAILURE(status))
++ throw libcdr::EncodingException();
++ const char *name = ucsdet_getName(csm, &status);
++ if (U_FAILURE(status))
++ throw libcdr::EncodingException();
++ if (name)
+ {
-+#if GUESS_CHARACTER_ENCODING
-+ CDR_DEBUG_MSG(("CDRParser::readTxsm - Detected chunk encoding %s\n", getEncoding(&tmpTextData[0], (unsigned)tmpTextData.size())));
-+#endif
- appendCharacters(text, tmpTextData, charStyles[(tmpCharDescription >> 16) & 0xff].m_charSet);
++ unsigned short encoding = getEncodingFromICUName(name);
++ ucsdet_close(csd);
++ return encoding;
+ }
- }
- tmpTextData.clear();
- CDR_DEBUG_MSG(("CDRParser::readTxsm - Text: %s\n", text.cstr()));
-diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
-index 7ca2ecb..c27eda5 100644
---- a/src/lib/Makefile.am
-+++ b/src/lib/Makefile.am
-@@ -14,7 +14,7 @@ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_include_HEADERS = \
++ ucsdet_close(csd);
++ return 0;
++ }
++ catch (const libcdr::EncodingException &)
++ {
++ ucsdet_close(csd);
++ return 0;
++ }
++}
++
+ static void _appendUCS4(WPXString &text, unsigned ucs4Character)
+ {
+ unsigned char first;
+@@ -450,6 +532,10 @@ void libcdr::appendCharacters(WPXString &text, std::vector<unsigned char> charac
+ 0x0111, 0x00F1, 0x0323, 0x00F3, 0x00F4, 0x01A1, 0x00F6, 0x00F7,
+ 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x01B0, 0x20AB, 0x00FF
+ };
++
++ if (!charset && characters.size())
++ charset = getEncoding(&characters[0], characters.size());
++
+ for (std::vector<unsigned char>::const_iterator iter = characters.begin();
+ iter != characters.end(); ++iter)
+ {
+diff --git a/src/lib/libcdr_utils.h b/src/lib/libcdr_utils.h
+index 320891a..5958b75 100644
+--- a/src/lib/libcdr_utils.h
++++ b/src/lib/libcdr_utils.h
+@@ -133,6 +133,10 @@ class UnknownPrecisionException
+ {
+ };
- AM_CXXFLAGS = $(LIBCDR_CXXFLAGS) $(ZLIB_CFLAGS) $(DEBUG_CXXFLAGS)
++class EncodingException
++{
++};
++
+ } // namespace libcdr
--libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD = $(LIBCDR_LIBS) $(ZLIB_LIBS) @LIBCDR_WIN32_RESOURCE@
-+libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD = $(LIBCDR_LIBS) $(ZLIB_LIBS) @LIBCDR_WIN32_RESOURCE@ -lcharguess
- libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_DEPENDENCIES = @LIBCDR_WIN32_RESOURCE@
- libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LDFLAGS = $(version_info) -export-dynamic -no-undefined
- libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_SOURCES = \
+ #endif // __LIBCDR_UTILS_H__
+--
+1.8.1.1
+
More information about the Libreoffice-commits
mailing list