[Libreoffice-commits] .: 2 commits - detect-charset.patch src/lib

Libreoffice Gerrit user logerrit at kemper.freedesktop.org
Mon Jan 21 07:03:09 PST 2013


 detect-charset.patch     |  184 -----------------------------------------------
 src/lib/libcdr_utils.cpp |   86 +++++++++++++++++++++
 src/lib/libcdr_utils.h   |    4 +
 3 files changed, 90 insertions(+), 184 deletions(-)

New commits:
commit ec852bd198fa1aaeb578374f6a8a049c2b333b09
Author: Fridrich Štrba <fridrich.strba at bluewin.ch>
Date:   Mon Jan 21 16:02:46 2013 +0100

    This is integrated

diff --git a/detect-charset.patch b/detect-charset.patch
deleted file mode 100644
index ada0767..0000000
--- a/detect-charset.patch
+++ /dev/null
@@ -1,184 +0,0 @@
-From 44d988e5df8a782705ebe6a477b5ae1b173418bf Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Fridrich=20=C5=A0trba?= <fridrich.strba at bluewin.ch>
-Date: Mon, 21 Jan 2013 14:58:31 +0100
-Subject: [PATCH] Use ICU to guess encoding
-
----
- configure.ac             | 16 +++++++++
- src/lib/Makefile.am      |  4 +--
- src/lib/libcdr_utils.cpp | 86 ++++++++++++++++++++++++++++++++++++++++++++++++
- src/lib/libcdr_utils.h   |  4 +++
- 4 files changed, 108 insertions(+), 2 deletions(-)
-
-diff --git a/configure.ac b/configure.ac
-index 1e32311..e5619cf 100644
---- a/configure.ac
-+++ b/configure.ac
-@@ -62,6 +62,22 @@ PKG_CHECK_MODULES([ZLIB],[zlib],[],[
- AC_SUBST(ZLIB_CFLAGS)
- AC_SUBST(ZLIB_LIBS)
- 
-+# ========
-+# Find icu
-+# ========
-+AC_PATH_PROG([ICU_CONFIG],[icu-config])
-+AC_MSG_CHECKING([ICU installation])
-+if ${ICU_CONFIG} --cflags >/dev/null 2>&1; then
-+	ICU_CFLAGS=`${ICU_CONFIG} --cppflags-searchpath`
-+	ICU_LIBS=`${ICU_CONFIG} --ldflags`
-+	AC_MSG_RESULT([found])
-+else
-+	AC_MSG_ERROR([libicu config program icu-config not found])
-+fi
-+AC_SUBST(ICU_CFLAGS)
-+AC_SUBST(ICU_LIBS)
-+
-+
- # =================================
- # Libtool/Version Makefile settings
- # =================================
-diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
-index 7255d40..bff4ce3 100644
---- a/src/lib/Makefile.am
-+++ b/src/lib/Makefile.am
-@@ -12,9 +12,9 @@ libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_include_HEADERS = \
- 	CDRStringVector.h \
- 	CMXDocument.h
- 
--AM_CXXFLAGS = $(LIBCDR_CXXFLAGS) $(ZLIB_CFLAGS) $(DEBUG_CXXFLAGS)
-+AM_CXXFLAGS = $(LIBCDR_CXXFLAGS) $(ZLIB_CFLAGS) $(ICU_CFLAGS) $(DEBUG_CXXFLAGS)
- 
--libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD  = $(LIBCDR_LIBS) $(ZLIB_LIBS) @LIBCDR_WIN32_RESOURCE@
-+libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LIBADD  = $(LIBCDR_LIBS) $(ZLIB_LIBS) $(ICU_LIBS) @LIBCDR_WIN32_RESOURCE@
- libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_DEPENDENCIES = @LIBCDR_WIN32_RESOURCE@
- libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_LDFLAGS = $(version_info) -export-dynamic -no-undefined
- libcdr_@CDR_MAJOR_VERSION@_@CDR_MINOR_VERSION@_la_SOURCES = \
-diff --git a/src/lib/libcdr_utils.cpp b/src/lib/libcdr_utils.cpp
-index ef94582..28162e3 100644
---- a/src/lib/libcdr_utils.cpp
-+++ b/src/lib/libcdr_utils.cpp
-@@ -27,6 +27,8 @@
-  * instead of those above.
-  */
- 
-+#include <string.h>
-+#include <unicode/ucsdet.h>
- #include "libcdr_utils.h"
- 
- #define CDR_NUM_ELEMENTS(array) sizeof(array)/sizeof(array[0])
-@@ -36,6 +38,86 @@
- namespace
- {
- 
-+static unsigned short getEncodingFromICUName(const char *name)
-+{
-+  // ANSI
-+  if (strcmp(name, "ISO-8859-1") == 0)
-+    return 0;
-+  if (strcmp(name, "windows-1252") == 0)
-+    return 0;
-+  // CENTRAL EUROPE
-+  if (strcmp(name, "ISO-8859-2") == 0)
-+    return 0xee;
-+  if (strcmp(name, "windows-1250") == 0)
-+    return 0xee;
-+  // RUSSIAN
-+  if (strcmp(name, "ISO-8859-5") == 0)
-+    return 0xcc;
-+  if (strcmp(name, "windows-1251") == 0)
-+    return 0xcc;
-+  if (strcmp(name, "KOI8-R") == 0)
-+    return 0xcc;
-+  // ARABIC
-+  if (strcmp(name, "ISO-8859-6") == 0)
-+    return 0xb2;
-+  if (strcmp(name, "windows-1256") == 0)
-+    return 0xb2;
-+  // TURKISH
-+  if (strcmp(name, "ISO-8859-9") == 0)
-+    return 0xa2;
-+  if (strcmp(name, "windows-1254") == 0)
-+    return 0xa2;
-+  // GREEK
-+  if (strcmp(name, "ISO-8859-7") == 0)
-+    return 0xa1;
-+  if (strcmp(name, "windows-1253") == 0)
-+    return 0xa1;
-+  // HEBREW
-+  if (strcmp(name, "ISO-8859-8") == 0)
-+    return 0xb1;
-+  if (strcmp(name, "windows-1255") == 0)
-+    return 0xb1;
-+
-+  return 0;
-+}
-+
-+
-+static unsigned short getEncoding(const unsigned char *buffer, unsigned bufferLength)
-+{
-+  UErrorCode status = U_ZERO_ERROR;
-+  UCharsetDetector *csd = 0;
-+  const UCharsetMatch *csm = 0;
-+  try
-+  {
-+    csd = ucsdet_open(&status);
-+    if (U_FAILURE(status))
-+      throw libcdr::EncodingException();
-+    ucsdet_setText(csd, (const char *)buffer, bufferLength, &status);
-+    if (U_FAILURE(status))
-+      throw libcdr::EncodingException();
-+    ucsdet_enableInputFilter(csd, TRUE);
-+    csm = ucsdet_detect(csd, &status);
-+    if (U_FAILURE(status))
-+      throw libcdr::EncodingException();
-+    const char *name = ucsdet_getName(csm, &status);
-+    if (U_FAILURE(status))
-+      throw libcdr::EncodingException();
-+    if (name)
-+    {
-+      unsigned short encoding = getEncodingFromICUName(name);
-+      ucsdet_close(csd);
-+      return encoding;
-+    }
-+    ucsdet_close(csd);
-+    return 0;
-+  }
-+  catch (const libcdr::EncodingException &)
-+  {
-+    ucsdet_close(csd);
-+    return 0;
-+  }
-+}
-+
- static void _appendUCS4(WPXString &text, unsigned ucs4Character)
- {
-   unsigned char first;
-@@ -450,6 +532,10 @@ void libcdr::appendCharacters(WPXString &text, std::vector<unsigned char> charac
-     0x0111, 0x00F1, 0x0323, 0x00F3, 0x00F4, 0x01A1, 0x00F6, 0x00F7,
-     0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x01B0, 0x20AB, 0x00FF
-   };
-+
-+  if (!charset && characters.size())
-+    charset = getEncoding(&characters[0], characters.size());
-+
-   for (std::vector<unsigned char>::const_iterator iter = characters.begin();
-        iter != characters.end(); ++iter)
-   {
-diff --git a/src/lib/libcdr_utils.h b/src/lib/libcdr_utils.h
-index 320891a..5958b75 100644
---- a/src/lib/libcdr_utils.h
-+++ b/src/lib/libcdr_utils.h
-@@ -133,6 +133,10 @@ class UnknownPrecisionException
- {
- };
- 
-+class EncodingException
-+{
-+};
-+
- } // namespace libcdr
- 
- #endif // __LIBCDR_UTILS_H__
--- 
-1.8.1.1
-
commit 7bf6130b715080f15b0cdc2f1149f581ae1483de
Author: Fridrich Štrba <fridrich.strba at bluewin.ch>
Date:   Mon Jan 21 14:58:31 2013 +0100

    Use ICU to guess encoding

diff --git a/src/lib/libcdr_utils.cpp b/src/lib/libcdr_utils.cpp
index ef94582..28162e3 100644
--- a/src/lib/libcdr_utils.cpp
+++ b/src/lib/libcdr_utils.cpp
@@ -27,6 +27,8 @@
  * instead of those above.
  */
 
+#include <string.h>
+#include <unicode/ucsdet.h>
 #include "libcdr_utils.h"
 
 #define CDR_NUM_ELEMENTS(array) sizeof(array)/sizeof(array[0])
@@ -36,6 +38,86 @@
 namespace
 {
 
+static unsigned short getEncodingFromICUName(const char *name)
+{
+  // ANSI
+  if (strcmp(name, "ISO-8859-1") == 0)
+    return 0;
+  if (strcmp(name, "windows-1252") == 0)
+    return 0;
+  // CENTRAL EUROPE
+  if (strcmp(name, "ISO-8859-2") == 0)
+    return 0xee;
+  if (strcmp(name, "windows-1250") == 0)
+    return 0xee;
+  // RUSSIAN
+  if (strcmp(name, "ISO-8859-5") == 0)
+    return 0xcc;
+  if (strcmp(name, "windows-1251") == 0)
+    return 0xcc;
+  if (strcmp(name, "KOI8-R") == 0)
+    return 0xcc;
+  // ARABIC
+  if (strcmp(name, "ISO-8859-6") == 0)
+    return 0xb2;
+  if (strcmp(name, "windows-1256") == 0)
+    return 0xb2;
+  // TURKISH
+  if (strcmp(name, "ISO-8859-9") == 0)
+    return 0xa2;
+  if (strcmp(name, "windows-1254") == 0)
+    return 0xa2;
+  // GREEK
+  if (strcmp(name, "ISO-8859-7") == 0)
+    return 0xa1;
+  if (strcmp(name, "windows-1253") == 0)
+    return 0xa1;
+  // HEBREW
+  if (strcmp(name, "ISO-8859-8") == 0)
+    return 0xb1;
+  if (strcmp(name, "windows-1255") == 0)
+    return 0xb1;
+
+  return 0;
+}
+
+
+static unsigned short getEncoding(const unsigned char *buffer, unsigned bufferLength)
+{
+  UErrorCode status = U_ZERO_ERROR;
+  UCharsetDetector *csd = 0;
+  const UCharsetMatch *csm = 0;
+  try
+  {
+    csd = ucsdet_open(&status);
+    if (U_FAILURE(status))
+      throw libcdr::EncodingException();
+    ucsdet_setText(csd, (const char *)buffer, bufferLength, &status);
+    if (U_FAILURE(status))
+      throw libcdr::EncodingException();
+    ucsdet_enableInputFilter(csd, TRUE);
+    csm = ucsdet_detect(csd, &status);
+    if (U_FAILURE(status))
+      throw libcdr::EncodingException();
+    const char *name = ucsdet_getName(csm, &status);
+    if (U_FAILURE(status))
+      throw libcdr::EncodingException();
+    if (name)
+    {
+      unsigned short encoding = getEncodingFromICUName(name);
+      ucsdet_close(csd);
+      return encoding;
+    }
+    ucsdet_close(csd);
+    return 0;
+  }
+  catch (const libcdr::EncodingException &)
+  {
+    ucsdet_close(csd);
+    return 0;
+  }
+}
+
 static void _appendUCS4(WPXString &text, unsigned ucs4Character)
 {
   unsigned char first;
@@ -450,6 +532,10 @@ void libcdr::appendCharacters(WPXString &text, std::vector<unsigned char> charac
     0x0111, 0x00F1, 0x0323, 0x00F3, 0x00F4, 0x01A1, 0x00F6, 0x00F7,
     0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x01B0, 0x20AB, 0x00FF
   };
+
+  if (!charset && characters.size())
+    charset = getEncoding(&characters[0], characters.size());
+
   for (std::vector<unsigned char>::const_iterator iter = characters.begin();
        iter != characters.end(); ++iter)
   {
diff --git a/src/lib/libcdr_utils.h b/src/lib/libcdr_utils.h
index 320891a..5958b75 100644
--- a/src/lib/libcdr_utils.h
+++ b/src/lib/libcdr_utils.h
@@ -133,6 +133,10 @@ class UnknownPrecisionException
 {
 };
 
+class EncodingException
+{
+};
+
 } // namespace libcdr
 
 #endif // __LIBCDR_UTILS_H__


More information about the Libreoffice-commits mailing list