[Libreoffice-commits] core.git: sc/Library_scui.mk sc/source

Tomofumi Yagi (via logerrit) logerrit at kemper.freedesktop.org
Sun Sep 13 11:22:16 UTC 2020


 sc/Library_scui.mk                  |    3 +
 sc/source/ui/dbgui/scuiasciiopt.cxx |   71 +++++++++++++++++++-----------------
 2 files changed, 41 insertions(+), 33 deletions(-)

New commits:
commit 85f12e47f4a086a3923dd3a6b097776d60c6dc82
Author:     Tomofumi Yagi <yagitmknada at gmail.com>
AuthorDate: Sat Sep 12 11:47:10 2020 +0900
Commit:     Noel Grandin <noel.grandin at collabora.co.uk>
CommitDate: Sun Sep 13 13:21:43 2020 +0200

    Calc: ScImportAsciiDlg can now detect Unicode encodings (UTF-8, UTF-16LE, UTF-16BE) in files that have no BOM
    
    Change-Id: I8a3aa7458ce97f659c0caf2386a96f605b740fbc
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/102543
    Tested-by: Jenkins
    Reviewed-by: Noel Grandin <noel.grandin at collabora.co.uk>

diff --git a/sc/Library_scui.mk b/sc/Library_scui.mk
index a8c2097485b0..86605ab63a0d 100644
--- a/sc/Library_scui.mk
+++ b/sc/Library_scui.mk
@@ -39,6 +39,9 @@ $(eval $(call gb_Library_use_externals,scui,\
 	$(call gb_Helper_optional,OPENCL, \
 		clew) \
 	mdds_headers \
+	icui18n \
+	icuuc \
+	icu_headers \
 ))
 
 $(eval $(call gb_Library_use_libraries,scui,\
diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx
index a0e645e551e0..5e5f08bf87a7 100644
--- a/sc/source/ui/dbgui/scuiasciiopt.cxx
+++ b/sc/source/ui/dbgui/scuiasciiopt.cxx
@@ -37,6 +37,9 @@
 #include <miscuno.hxx>
 #include <osl/diagnose.h>
 
+#include <unicode/uclean.h>
+#include <unicode/ucsdet.h>
+
 //! TODO make dynamic
 const SCSIZE ASCIIDLG_MAXROWS                = MAXROWCOUNT;
 
@@ -380,41 +383,43 @@ ScImportAsciiDlg::ScImportAsciiDlg(weld::Window* pParent, const OUString& aDatNa
     // Sniff for Unicode / not
     if( ePreselectUnicode == RTL_TEXTENCODING_DONTKNOW && mpDatStream )
     {
-        Seek( 0 );
-        mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
-        sal_uLong nUniPos = mpDatStream->Tell();
-        switch (nUniPos)
+        mpDatStream->Seek( 0 );
+        constexpr size_t buffsize = 4096;
+        sal_Int8 bytes[buffsize] = { 0 };
+        sal_Int32 nRead = mpDatStream->ReadBytes( bytes, buffsize );
+        mpDatStream->Seek( 0 );
+
+        if ( nRead > 0 )
         {
-            case 2:
-                ePreselectUnicode = RTL_TEXTENCODING_UNICODE;   // UTF-16
-                break;
-            case 3:
-                ePreselectUnicode = RTL_TEXTENCODING_UTF8;      // UTF-8
-                break;
-            case 0:
-                {
-                    sal_uInt16 n;
-                    mpDatStream->ReadUInt16( n );
-                    // Assume that normal ASCII/ANSI/ISO/etc. text doesn't start with
-                    // control characters except CR,LF,TAB
-                    if ( (n & 0xff00) < 0x2000 )
-                    {
-                        switch ( n & 0xff00 )
-                        {
-                            case 0x0900 :
-                            case 0x0a00 :
-                            case 0x0d00 :
-                                break;
-                            default:
-                                ePreselectUnicode = RTL_TEXTENCODING_UNICODE;   // UTF-16
-                        }
-                    }
-                    mpDatStream->Seek(0);
-                }
-                break;
-            default:
-                ;   // nothing
+            UErrorCode uerr = U_ZERO_ERROR;
+            UCharsetDetector* ucd = ucsdet_open( &uerr );
+            ucsdet_setText( ucd, reinterpret_cast<const char*>(bytes), nRead, &uerr );
+            const UCharsetMatch* match = ucsdet_detect( ucd, &uerr );
+            const char* pEncodingName = ucsdet_getName( match, &uerr );
+
+            if ( U_SUCCESS(uerr) && !strcmp("UTF-8", pEncodingName) )
+            {
+                ePreselectUnicode = RTL_TEXTENCODING_UTF8; // UTF-8
+                mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_UTF8 );
+            }
+            else if ( U_SUCCESS(uerr) && !strcmp("UTF-16LE", pEncodingName) )
+            {
+                ePreselectUnicode = RTL_TEXTENCODING_UNICODE; // UTF-16LE
+                mpDatStream->SetEndian( SvStreamEndian::LITTLE );
+                mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_UNICODE );
+            }
+            else if ( U_SUCCESS(uerr) && !strcmp("UTF-16BE", pEncodingName) )
+            {
+                ePreselectUnicode = RTL_TEXTENCODING_UNICODE; // UTF-16BE
+                mpDatStream->SetEndian(SvStreamEndian::BIG);
+                mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_UNICODE );
+            }
+            else // other
+                mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
+
+            ucsdet_close( ucd );
         }
+
         mnStreamPos = mpDatStream->Tell();
     }
 


More information about the Libreoffice-commits mailing list