[Libreoffice-commits] core.git: sc/source

Eike Rathke erack at redhat.com
Mon Jul 2 14:23:45 UTC 2018


 sc/source/ui/dbgui/scuiasciiopt.cxx |   30 ++++++++++++++++----
 sc/source/ui/docshell/impex.cxx     |   54 +++++++++++++++++++++++++++++-------
 sc/source/ui/inc/impex.hxx          |   13 ++++++++
 sc/source/ui/inc/scuiasciiopt.hxx   |    3 +-
 4 files changed, 83 insertions(+), 17 deletions(-)

New commits:
commit c807e7ea7a0725a4d8375eda07d6f70870e0d50a
Author: Eike Rathke <erack at redhat.com>
Date:   Mon Jul 2 14:41:59 2018 +0200

    Resolves: tdf#56910 detect a Space (blank) separator if not selected
    
    When populating the CSV import dialog for the first time, attempt
    to detect a possible space (blank) separator if the field
    separators don't already include it. This can be necessary because
    of the "accept broken misquoted CSV fields" feature, which tries to
    ignore trailing blanks after a quoted field and, if no separator
    follows, continues to add content to the field, assuming the single
    double quote was in error. If a blank separator is detected, it is
    added to the field separators and the line and subsequent lines are
    reread with the new separators.
    
    Change-Id: I3c6d74ce8883f1d279a810e800e54b349d85ac71
    Reviewed-on: https://gerrit.libreoffice.org/56810
    Reviewed-by: Eike Rathke <erack at redhat.com>
    Tested-by: Jenkins

diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx
index b885e9b9c7ec..aeb718be4d08 100644
--- a/sc/source/ui/dbgui/scuiasciiopt.cxx
+++ b/sc/source/ui/dbgui/scuiasciiopt.cxx
@@ -288,7 +288,8 @@ ScImportAsciiDlg::ScImportAsciiDlg( vcl::Window* pParent, const OUString& aDatNa
         aColumnUser ( ScResId( SCSTR_COLUMN_USER ) ),
         aTextSepList(SCSTR_TEXTSEP),
         mcTextSep   ( ScAsciiOptions::cDefaultTextSep ),
-        meCall(eCall)
+        meCall(eCall),
+        mbDetectSpaceSep(eCall != SC_TEXTTOCOLUMNS)
 {
     get(pFtCharSet, "textcharset");
     get(pLbCharSet, "charset");
@@ -558,7 +559,7 @@ void ScImportAsciiDlg::dispose()
     ModalDialog::dispose();
 }
 
-bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText )
+bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& rcDetectSep )
 {
     if (nLine >= ASCIIDLG_MAXROWS || !mpDatStream)
         return false;
@@ -591,7 +592,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText )
                 break;
             }
             rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators,
-                    mcTextSep);
+                    mcTextSep, rcDetectSep);
             mnStreamPos = mpDatStream->Tell();
             mpRowPosArray[++mnRowPosCount] = mnStreamPos;
         } while (nLine >= mnRowPosCount && mpDatStream->good());
@@ -606,7 +607,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText )
     else
     {
         Seek( mpRowPosArray[nLine]);
-        rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep);
+        rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep);
         mnStreamPos = mpDatStream->Tell();
     }
 
@@ -805,6 +806,12 @@ IMPL_LINK( ScImportAsciiDlg, LbColTypeHdl, ListBox&, rListBox, void )
 
 IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
 {
+    // Checking the separator can only be done once for the very first time
+    // when the dialog wasn't already presented to the user.
+    // As a side effect this has the benefit that the check is only done on the
+    // first set of visible lines.
+    sal_Unicode cDetectSep = (mbDetectSpaceSep && !pRbFixed->IsChecked() && !pCkbSpace->IsChecked() ? 0 : 0xffff);
+
     sal_Int32 nBaseLine = mpTableBox->GetFirstVisLine();
     sal_Int32 nRead = mpTableBox->GetVisLineCount();
     // If mnRowPosCount==0, this is an initializing call, read ahead for row
@@ -817,12 +824,25 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
     sal_Int32 i;
     for (i = 0; i < nRead; i++)
     {
-        if (!GetLine( nBaseLine + i, maPreviewLine[i]))
+        if (!GetLine( nBaseLine + i, maPreviewLine[i], cDetectSep))
             break;
     }
     for (; i < CSV_PREVIEW_LINES; i++)
         maPreviewLine[i].clear();
 
+    if (mbDetectSpaceSep)
+    {
+        mbDetectSpaceSep = false;
+        if (cDetectSep == ' ')
+        {
+            // Expect space to be appended by now so all subsequent
+            // GetLine()/ReadCsvLine() actually used it.
+            assert(maFieldSeparators.endsWith(" "));
+            // Preselect Space in UI.
+            pCkbSpace->Check();
+        }
+    }
+
     mpTableBox->Execute( CSVCMD_SETLINECOUNT, mnRowPosCount);
     bool bMergeSep = pCkbAsOnce->IsChecked();
     bool bRemoveSpace = pCkbRemoveSpace->IsChecked();
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx
index 3b3068764f24..854bc92b9635 100644
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -564,7 +564,7 @@ enum QuoteType
         FIELDEND_QUOTE if end of field quote
         DONTKNOW_QUOTE anything else
  */
-static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps )
+static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps, sal_Unicode& rcDetectSep )
 {
     // Due to broken CSV generators that don't double embedded quotes check if
     // a field separator immediately or with trailing spaces follows the quote,
@@ -572,6 +572,10 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
     const sal_Unicode cBlank = ' ';
     if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank))
         return FIELDEND_QUOTE;
+    // Detect a possible blank separator if it's not already in the list (which
+    // was checked right above for p[1]==cBlank).
+    if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank)
+        rcDetectSep = cBlank;
     while (p[1] == cBlank)
         ++p;
     if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
@@ -601,7 +605,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
                             do not increment nQuotes in caller then!
  */
 static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
-        const sal_Unicode* pSeps, sal_Unicode cStr )
+        const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep )
 {
     if ((nQuotes % 2) == 0)
     {
@@ -615,7 +619,7 @@ static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unic
     }
     if (p[1] == cStr)
         return FIRST_QUOTE;
-    return lcl_isFieldEndQuote( p, pSeps);
+    return lcl_isFieldEndQuote( p, pSeps, rcDetectSep);
 }
 
 /** Append characters of [p1,p2) to rField.
@@ -664,7 +668,8 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, OUString& rStrin
                     // break or continue for loop
                     if (eMode == DoubledQuoteMode::ESCAPE)
                     {
-                        if (lcl_isFieldEndQuote( p-1, pSeps) == FIELDEND_QUOTE)
+                        sal_Unicode cDetectSep = 0xffff;    // No separator detection here.
+                        if (lcl_isFieldEndQuote( p-1, pSeps, cDetectSep) == FIELDEND_QUOTE)
                             break;
                         else
                             continue;
@@ -1299,8 +1304,8 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
     SCTAB nTab = aRange.aStart.Tab();
 
     bool    bFixed              = pExtOptions->IsFixedLen();
-    const OUString& rSeps       = pExtOptions->GetFieldSeps();
-    const sal_Unicode* pSeps    = rSeps.getStr();
+    OUString aSeps              = pExtOptions->GetFieldSeps();  // Need non-const for ReadCsvLine(),
+    const sal_Unicode* pSeps    = aSeps.getStr();               // but it will be const anyway (asserted below).
     bool    bMerge              = pExtOptions->IsMergeSeps();
     bool    bRemoveSpace        = pExtOptions->IsRemoveSpace();
     sal_uInt16  nInfoCount      = pExtOptions->GetInfoCount();
@@ -1336,10 +1341,11 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
     OUString aCell;
     sal_uInt16 i;
     SCROW nRow = nStartRow;
+    sal_Unicode cDetectSep = 0xffff;    // No separator detection here.
 
     while(--nSkipLines>0)
     {
-        aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr); // content is ignored
+        aLine = ReadCsvLine(rStrm, !bFixed, aSeps, cStr, cDetectSep); // content is ignored
         if ( rStrm.eof() )
             break;
     }
@@ -1362,10 +1368,12 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
     {
         for( ;; )
         {
-            aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr);
+            aLine = ReadCsvLine(rStrm, !bFixed, aSeps, cStr, cDetectSep);
             if ( rStrm.eof() && aLine.isEmpty() )
                 break;
 
+            assert(pSeps == aSeps.getStr());
+
             if ( nRow > MAXROW )
             {
                 bOverflowRow = true;    // display warning on import
@@ -2380,8 +2388,26 @@ ScImportStringStream::ScImportStringStream( const OUString& rStr )
 }
 
 OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
-        const OUString& rFieldSeparators, sal_Unicode cFieldQuote )
+        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep )
 {
+    enum RetryState
+    {
+        FORBID,
+        ALLOW,
+        RETRY,
+        RETRIED
+    } eRetryState = (bEmbeddedLineBreak && rcDetectSep == 0 ? RetryState::ALLOW : RetryState::FORBID);
+
+    sal_uInt64 nStreamPos = (eRetryState == RetryState::ALLOW ? rStream.Tell() : 0);
+
+Label_RetryWithNewSep:
+
+    if (eRetryState == RetryState::RETRY)
+    {
+        eRetryState = RetryState::RETRIED;
+        rStream.Seek( nStreamPos);
+    }
+
     OUString aStr;
     rStream.ReadUniOrByteStringLine(aStr, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit);
 
@@ -2416,7 +2442,15 @@ OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
                         // we are in FIELDEND_QUOTE state.
                         else if (eQuoteState != FIELDEND_QUOTE)
                         {
-                            eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote);
+                            eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote, rcDetectSep);
+
+                            if (eRetryState == RetryState::ALLOW && rcDetectSep == ' ')
+                            {
+                                eRetryState = RetryState::RETRY;
+                                rFieldSeparators += OUString(' ');
+                                goto Label_RetryWithNewSep;
+                            }
+
                             // DONTKNOW_QUOTE is an embedded unescaped quote we
                             // don't count for pairing.
                             if (eQuoteState != DONTKNOW_QUOTE)
diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx
index 152ae2da98ca..e297c1b7498a 100644
--- a/sc/source/ui/inc/impex.hxx
+++ b/sc/source/ui/inc/impex.hxx
@@ -175,10 +175,21 @@ public:
 
     @param rFieldSeparators
     A list of characters that each may act as a field separator.
+    If rcDetectSep was 0 and a separator is detected then it is appended to
+    rFieldSeparators.
 
     @param cFieldQuote
     The quote character used.
 
+    @param rcDetectSep
+    If 0 then attempt to detect a possible space (blank) separator if
+    rFieldSeparators doesn't include it already. This can be necessary because
+    of the "accept broken misquoted CSV fields" feature that tries to ignore
+    trailing blanks after a quoted field and if no separator follows continues
+    to add content to the field assuming the single double quote was in error.
+    If this blank separator is detected it is added to rFieldSeparators and the
+    line is reread with the new separators.
+
     check Stream::good() to detect IO problems during read
 
     @ATTENTION
@@ -199,7 +210,7 @@ public:
 
   */
 SC_DLLPUBLIC OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
-        const OUString& rFieldSeparators, sal_Unicode cFieldQuote );
+        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep );
 
 #endif
 
diff --git a/sc/source/ui/inc/scuiasciiopt.hxx b/sc/source/ui/inc/scuiasciiopt.hxx
index 5a6ea8a1a0c7..c65fb81c3cec 100644
--- a/sc/source/ui/inc/scuiasciiopt.hxx
+++ b/sc/source/ui/inc/scuiasciiopt.hxx
@@ -89,6 +89,7 @@ class ScImportAsciiDlg : public ModalDialog
     rtl_TextEncoding            meCharSet;          /// Selected char set.
     bool                        mbCharSetSystem;    /// Is System char set selected?
     ScImportAsciiCall           meCall;             /// How the dialog is called (see asciiopt.hxx)
+    bool                        mbDetectSpaceSep;   /// Whether to detect a possible space separator.
 
 public:
                                 ScImportAsciiDlg(
@@ -111,7 +112,7 @@ private:
     /** Enables or disables all separator checkboxes and edit fields. */
     void                        SetupSeparatorCtrls();
 
-    bool                        GetLine( sal_uLong nLine, OUString &rText );
+    bool                        GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& rcDetectSep );
     void                        UpdateVertical();
     inline bool                 Seek( sal_uLong nPos ); // synced to and from mnStreamPos
 


More information about the Libreoffice-commits mailing list