[Libreoffice-commits] core.git: sw/inc sw/qa sw/source

tobias (via logerrit) logerrit at kemper.freedesktop.org
Sun Jun 6 16:53:27 UTC 2021


 sw/inc/iodetect.hxx                            |    2 -
 sw/qa/extras/txtexport/data/UTF16LEBOMCRLF.txt |binary
 sw/qa/extras/txtexport/data/UTF16LECRLF.txt    |binary
 sw/qa/extras/txtexport/data/UTF8CRLF.txt       |    2 +
 sw/qa/extras/txtexport/txtexport.cxx           |   49 ++++++++++++++++---------
 sw/source/filter/ascii/parasc.cxx              |    5 ++
 sw/source/filter/basflt/iodetect.cxx           |    8 +++-
 7 files changed, 46 insertions(+), 20 deletions(-)

New commits:
commit 162f5a20095c6937030d23ee03fb8f72c51eefa1
Author:     tobias <tobias.schulz at hotmail.com>
AuthorDate: Sun Jun 6 15:47:06 2021 +0200
Commit:     Noel Grandin <noel.grandin at collabora.co.uk>
CommitDate: Sun Jun 6 18:52:52 2021 +0200

    tdf#142669 Consider BOM on text encoding detection
    
    Return a flag if the auto-detected text has a BOM.
    Save the flag in SwAsciiOptions so that the BOM gets set correctly when
    the file is written.
    
    Change-Id: I358c3ba243bc326a552c2dc24773c94f8319c700
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/116759
    Tested-by: Jenkins
    Reviewed-by: Noel Grandin <noel.grandin at collabora.co.uk>

diff --git a/sw/inc/iodetect.hxx b/sw/inc/iodetect.hxx
index 534b3c1f2bb5..1d5713aaccab 100644
--- a/sw/inc/iodetect.hxx
+++ b/sw/inc/iodetect.hxx
@@ -104,7 +104,7 @@ public:
     static bool IsValidStgFilter( const css::uno::Reference < css::embed::XStorage >& rStg, const SfxFilter& rFilter);
 
     static bool IsDetectableText( const char* pBuf, sal_uLong &rLen,
-            rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd );
+            rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd, bool *pBom);
 
     static OUString GetSubStorageName( const SfxFilter& rFltr );
 };
diff --git a/sw/qa/extras/txtexport/data/UTF16LEBOMCRLF.txt b/sw/qa/extras/txtexport/data/UTF16LEBOMCRLF.txt
new file mode 100644
index 000000000000..be232521eafc
Binary files /dev/null and b/sw/qa/extras/txtexport/data/UTF16LEBOMCRLF.txt differ
diff --git a/sw/qa/extras/txtexport/data/UTF16LECRLF.txt b/sw/qa/extras/txtexport/data/UTF16LECRLF.txt
index be232521eafc..b74e964113de 100644
Binary files a/sw/qa/extras/txtexport/data/UTF16LECRLF.txt and b/sw/qa/extras/txtexport/data/UTF16LECRLF.txt differ
diff --git a/sw/qa/extras/txtexport/data/UTF8CRLF.txt b/sw/qa/extras/txtexport/data/UTF8CRLF.txt
new file mode 100644
index 000000000000..62d4d44677b6
--- /dev/null
+++ b/sw/qa/extras/txtexport/data/UTF8CRLF.txt
@@ -0,0 +1,2 @@
+フー
+バー
diff --git a/sw/qa/extras/txtexport/txtexport.cxx b/sw/qa/extras/txtexport/txtexport.cxx
index a5f989cb6689..0e52f51a4e34 100644
--- a/sw/qa/extras/txtexport/txtexport.cxx
+++ b/sw/qa/extras/txtexport/txtexport.cxx
@@ -20,19 +20,25 @@ public:
     }
 
 protected:
-    OString readExportedFile()
+    template <class T> std::vector<T> readMemoryStream()
     {
         SvMemoryStream aMemoryStream;
         SvFileStream aStream(maTempFile.GetURL(), StreamMode::READ);
         aStream.ReadStream(aMemoryStream);
-        const char* pData = static_cast<const char*>(aMemoryStream.GetData());
+        const T* pData = static_cast<const T*>(aMemoryStream.GetData());
+        return std::vector<T>(pData, pData + aMemoryStream.GetSize());
+    }
+
+    OString readExportedFile()
+    {
+        std::vector<char> aMemStream = readMemoryStream<char>();
 
         int offset = 0;
-        if (aMemoryStream.GetSize() > 2 && pData[0] == '\xEF' && pData[1] == '\xBB'
-            && pData[2] == '\xBF')
+        if (aMemStream.size() > 2 && aMemStream[0] == '\xEF' && aMemStream[1] == '\xBB'
+            && aMemStream[2] == '\xBF')
             offset = 3;
 
-        return OString(pData + offset, aMemoryStream.GetSize() - offset);
+        return OString(aMemStream.data() + offset, aMemStream.size() - offset);
     }
 };
 
@@ -64,25 +70,34 @@ DECLARE_TXTEXPORT_TEST(testBullets, "bullets.odt")
     CPPUNIT_ASSERT_EQUAL(aExpected, aData);
 }
 
-DECLARE_TXTEXPORT_TEST(testTdf120574_utf8, "UTF8BOMCRLF.txt")
+DECLARE_TXTEXPORT_TEST(testTdf120574_utf8bom, "UTF8BOMCRLF.txt")
 {
-    SvMemoryStream aMemoryStream;
-    SvFileStream aStream(maTempFile.GetURL(), StreamMode::READ);
-    aStream.ReadStream(aMemoryStream);
-    const char* pData = static_cast<const char*>(aMemoryStream.GetData());
-    OString aData(std::string_view(pData, aMemoryStream.GetSize()));
+    std::vector<char> aMemStream = readMemoryStream<char>();
+    OString aData(std::string_view(aMemStream.data(), aMemStream.size()));
     CPPUNIT_ASSERT_EQUAL(OString(u8"\uFEFFフー\r\nバー\r\n"), aData);
 }
 
-DECLARE_TXTEXPORT_TEST(testTdf120574_utf16le, "UTF16LECRLF.txt")
+DECLARE_TXTEXPORT_TEST(testTdf120574_utf16lebom, "UTF16LEBOMCRLF.txt")
 {
-    SvMemoryStream aMemoryStream;
-    SvFileStream aStream(maTempFile.GetURL(), StreamMode::READ);
-    aStream.ReadStream(aMemoryStream);
-    const sal_Unicode* pData = static_cast<const sal_Unicode*>(aMemoryStream.GetData());
-    OUString aData(pData, aMemoryStream.GetSize() / sizeof(sal_Unicode));
+    std::vector<sal_Unicode> aMemStream = readMemoryStream<sal_Unicode>();
+    OUString aData(aMemStream.data(), aMemStream.size() / sizeof(sal_Unicode));
     CPPUNIT_ASSERT_EQUAL(OUString(u"\uFEFFフー\r\nバー\r\n"), aData);
 }
+
+DECLARE_TXTEXPORT_TEST(testTdf142669_utf8, "UTF8CRLF.txt")
+{
+    std::vector<char> aMemStream = readMemoryStream<char>();
+    OString aData(std::string_view(aMemStream.data(), aMemStream.size()));
+    CPPUNIT_ASSERT_EQUAL(OString(u8"フー\r\nバー\r\n"), aData);
+}
+
+DECLARE_TXTEXPORT_TEST(testTdf142669_utf16le, "UTF16LECRLF.txt")
+{
+    std::vector<sal_Unicode> aMemStream = readMemoryStream<sal_Unicode>();
+    OUString aData(aMemStream.data(), aMemStream.size() / sizeof(sal_Unicode));
+    CPPUNIT_ASSERT_EQUAL(OUString(u"フー\r\nバー\r\n"), aData);
+}
+
 CPPUNIT_PLUGIN_IMPLEMENT();
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sw/source/filter/ascii/parasc.cxx b/sw/source/filter/ascii/parasc.cxx
index 8cdfa91ba6b3..871c8315a575 100644
--- a/sw/source/filter/ascii/parasc.cxx
+++ b/sw/source/filter/ascii/parasc.cxx
@@ -275,8 +275,10 @@ ErrCode SwASCIIParser::ReadChars()
         nOrig = nLen = m_rInput.ReadBytes(m_pArr.get(), ASC_BUFFLEN);
         rtl_TextEncoding eCharSet;
         LineEnd eLineEnd;
+        bool bHasBom;
         const bool bRet
-            = SwIoSystem::IsDetectableText(m_pArr.get(), nLen, &eCharSet, &bSwapUnicode, &eLineEnd);
+            = SwIoSystem::IsDetectableText(m_pArr.get(), nLen, &eCharSet,
+                                            &bSwapUnicode, &eLineEnd, &bHasBom);
         if (!bRet)
             return ERRCODE_IO_BROKENPACKAGE;
 
@@ -285,6 +287,7 @@ ErrCode SwASCIIParser::ReadChars()
         {
             aEmpty.SetCharSet(eCharSet);
             aEmpty.SetParaFlags(eLineEnd);
+            aEmpty.SetIncludeBOM(bHasBom);
             m_rInput.SeekRel(-(tools::Long(nLen)));
         }
         else
diff --git a/sw/source/filter/basflt/iodetect.cxx b/sw/source/filter/basflt/iodetect.cxx
index 2f49b2b199d1..e4d214391f2c 100644
--- a/sw/source/filter/basflt/iodetect.cxx
+++ b/sw/source/filter/basflt/iodetect.cxx
@@ -239,11 +239,12 @@ std::shared_ptr<const SfxFilter> SwIoSystem::GetFileFilter(const OUString& rFile
 }
 
 bool SwIoSystem::IsDetectableText(const char* pBuf, sal_uLong &rLen,
-    rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd)
+    rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd, bool *pBom)
 {
     bool bSwap = false;
     rtl_TextEncoding eCharSet = RTL_TEXTENCODING_DONTKNOW;
     bool bLE = true;
+    bool bBom = false;
     /*See if it's a known unicode type*/
     if (rLen >= 2)
     {
@@ -253,17 +254,20 @@ bool SwIoSystem::IsDetectableText(const char* pBuf, sal_uLong &rLen,
         {
             eCharSet = RTL_TEXTENCODING_UTF8;
             nHead = 3;
+            bBom = true;
         }
         else if (sal_uInt8(pBuf[0]) == 0xFE && sal_uInt8(pBuf[1]) == 0xFF)
         {
             eCharSet = RTL_TEXTENCODING_UCS2;
             bLE = false;
             nHead = 2;
+            bBom = true;
         }
         else if (sal_uInt8(pBuf[1]) == 0xFE && sal_uInt8(pBuf[0]) == 0xFF)
         {
             eCharSet = RTL_TEXTENCODING_UCS2;
             nHead = 2;
+            bBom = true;
         }
         pBuf+=nHead;
         rLen-=nHead;
@@ -400,6 +404,8 @@ bool SwIoSystem::IsDetectableText(const char* pBuf, sal_uLong &rLen,
         *pSwap = bSwap;
     if (pLineEnd)
         *pLineEnd = eLineEnd;
+    if (pBom)
+        *pBom = bBom;
 
     return !bIsBareUnicode;
 }


More information about the Libreoffice-commits mailing list