[PATCH libreoffice-4-0] various regex fixes squashed into one

Eike Rathke (via Code Review) gerrit at gerrit.libreoffice.org
Tue Mar 12 03:52:59 PDT 2013


Hi,

I have submitted a patch for review:

    https://gerrit.libreoffice.org/2678

To pull it, you can do:

    git pull ssh://gerrit.libreoffice.org:29418/core refs/changes/78/2678/1

various regex fixes squashed into one

i#118925# enhance textsearch's match-group references

to work for look-ahead/look-behind
(cherry picked from commit 3b83c404c56e5db5bab29ffee41f02822410d625)

Conflicts:
	sw/source/core/crsr/findtxt.cxx

(cherry picked from commit 9a93475d6eba53b2e1fba1585dbd11c94ea4b4a3)

Conflicts:
	sw/source/core/crsr/findtxt.cxx

i#120598 better emulation of regexp word-start and word-end operators

The emulation of the word-start and word-end operators provided by
the previous regexp engine can be approximated much better
by using the ICU regexp engine's powerful look-around feature.

Patch-by: Herbert Duerr
Found-by: ldgolds33 at yahoo.com
(cherry picked from commit ec7ef30693f10315ce80a8f5d7325a0e40855e66)

(cherry picked from commit 8c26876fea085a1bc847abba63dffa97a9499c1d)

i#121482# fix attributed text search for regular expression patterns

(cherry picked from commit e7fc662799e7e936753e24db8d6d3849c12b3ff4)
(cherry picked from commit e6288a5d889da7db5bf23174f85c29ccfcaa44d5)

i#121482# fix backwards regexp search for matches overlapping search start

(cherry picked from commit 854f4ad6c57be62bd922df08f603d8bfb7b918a5)

Additionally fixed the unit test: searching backward shall not produce a
different result from searching forward. (erAck)

(cherry picked from commit b514f0ce86e85d9be269ddf2e797befbbf3423f1)

i#121633# fix search for upper-case character classes

in ignore-case regular expressions

(cherry picked from commit b7ee1803453d3a766ce3a833892e1c208aacf8ff)

(cherry picked from commit 531538892795bec909bb8baff6bbf9e19baf809e)

i#121633# fix ignore-case problem caused by i18nsearch API mess

(cherry picked from commit 7644ec176049937b588fe171a553c9a07b375792)

(cherry picked from commit f932a3f1195290f9aa37b593190bd4c6ac5fe2f6)

Change-Id: I44d6216b12f17d0560c4e8cf355937797ddeee2a
---
M i18npool/qa/cppunit/test_textsearch.cxx
M i18npool/source/search/textsearch.cxx
M sw/source/core/crsr/findtxt.cxx
3 files changed, 34 insertions(+), 35 deletions(-)



diff --git a/i18npool/qa/cppunit/test_textsearch.cxx b/i18npool/qa/cppunit/test_textsearch.cxx
index c26550b..d7a6c33 100644
--- a/i18npool/qa/cppunit/test_textsearch.cxx
+++ b/i18npool/qa/cppunit/test_textsearch.cxx
@@ -101,7 +101,7 @@
     sal_Int32 startPos = 2, endPos = 20 ;
     OUString searchStr( "(ab)*a(c|d)+" );
     sal_Int32 fStartRes = 10, fEndRes = 18 ;
-    sal_Int32 bStartRes = 18, bEndRes = 14 ;
+    sal_Int32 bStartRes = 18, bEndRes = 10 ;
 
     // set options
     util::SearchOptions aOptions;
diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx
index 314dd5b..dceb4d7 100644
--- a/i18npool/source/search/textsearch.cxx
+++ b/i18npool/source/search/textsearch.cxx
@@ -60,7 +60,7 @@
     TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
     TransliterationModules_ignoreProlongedSoundMark_ja_JP;
 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
-static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
+static const sal_Int32 SIMPLE_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE);
     // Above 2 transliteration is simple but need to take effect in
     // complex transliteration
 
@@ -675,21 +675,30 @@
     // REG_NOSUB is not used anywhere => not implemented
     // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
     // LEV_RELAXED is only used for SearchAlgorithm==Approximate
-    // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
-    if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
+    // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
+    // probably because the transliteration flag IGNORE_CASE handles it as well.
+    if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0
+    ||  (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0)
         nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
     UErrorCode nIcuErr = U_ZERO_ERROR;
     // assumption: transliteration didn't mangle regexp control chars
     IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
 #ifndef DISABLE_WORDBOUND_EMULATION
     // for conveniance specific syntax elements of the old regex engine are emulated
-    // by using regular word boundary matching \b to replace \< and \>
-    static const IcuUniString aChevronPattern( "\\\\<|\\\\>", -1, IcuUniString::kInvariant);
-    static const IcuUniString aChevronReplace( "\\\\b", -1, IcuUniString::kInvariant);
-    static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr);
-    aChevronMatcher.reset( aIcuSearchPatStr);
-    aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr);
-    aChevronMatcher.reset();
+    // - by replacing \< with "word-break followed by a look-ahead word-char"
+    static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
+    static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
+    static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
+    aChevronMatcherB.reset( aIcuSearchPatStr);
+    aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
+    aChevronMatcherB.reset();
+    // - by replacing \> with "look-behind word-char followed by a word-break"
+    static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
+    static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
+    static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
+    aChevronMatcherE.reset( aIcuSearchPatStr);
+    aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
+    aChevronMatcherE.reset();
 #endif
     pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
     if( nIcuErr)
@@ -769,9 +778,15 @@
 
     // find the last match
     int nLastPos = 0;
+    int nFoundEnd = 0;
     do {
         nLastPos = pRegexMatcher->start( nIcuErr);
-    } while( pRegexMatcher->find( nLastPos + 1, nIcuErr));
+        nFoundEnd = pRegexMatcher->end( nIcuErr);
+        if( nFoundEnd >= startPos)
+            break;
+        if( nFoundEnd == nLastPos)
+            ++nFoundEnd;
+    } while( pRegexMatcher->find( nFoundEnd, nIcuErr));
 
     // find last match again to get its details
     pRegexMatcher->find( nLastPos, nIcuErr);
diff --git a/sw/source/core/crsr/findtxt.cxx b/sw/source/core/crsr/findtxt.cxx
index c1231f9..101b002 100644
--- a/sw/source/core/crsr/findtxt.cxx
+++ b/sw/source/core/crsr/findtxt.cxx
@@ -258,7 +258,7 @@
     {
         if( pNode->IsTxtNode() )
         {
-            nTxtLen = ((SwTxtNode*)pNode)->GetTxt().Len();
+            nTxtLen = static_cast<SwTxtNode*>(pNode)->GetTxt().Len();
             if( rNdIdx == pPam->GetMark()->nNode )
                 nEnd = pPam->GetMark()->nContent.GetIndex();
             else
@@ -655,30 +655,14 @@
         if( pTxtNode && pTxtNode->IsTxtNode() && pTxtNode == pPam->GetCntntNode( sal_False ) )
         {
             utl::TextSearch aSTxt( rSearchOpt );
-            String aStr( pPam->GetTxt() );
-            String aReplaceStr( rSearchOpt.replaceString );
-            aStr = comphelper::string::remove(aStr, CH_TXTATR_BREAKWORD);
-            aStr = comphelper::string::remove(aStr, CH_TXTATR_INWORD);
-            xub_StrLen nStart = 0;
-            rtl::OUString sX( 'x' );
-            if( pPam->Start()->nContent > 0 )
-            {
-                aStr.Insert( sX, 0 );
-                ++nStart;
-            }
-            xub_StrLen nEnd = aStr.Len();
-            bool bDeleteLastX = false;
-            if( pPam->End()->nContent < (static_cast<const SwTxtNode*>(pTxtNode))->GetTxt().Len() )
-            {
-                aStr.Insert( sX );
-                bDeleteLastX = true;
-            }
+            const String& rStr = static_cast<const SwTxtNode*>(pTxtNode)->GetTxt();
+            xub_StrLen nStart = pPam->Start()->nContent.GetIndex();
+            xub_StrLen nEnd = pPam->End()->nContent.GetIndex();
             SearchResult aResult;
-            if( aSTxt.SearchFrwrd( aStr, &nStart, &nEnd, &aResult ) )
+            if( aSTxt.SearchFrwrd( rStr, &nStart, &nEnd, &aResult ) )
             {
-                if( bDeleteLastX )
-                    aStr.Erase( aStr.Len() - 1 );
-                aSTxt.ReplaceBackReferences( aReplaceStr, aStr, aResult );
+                String aReplaceStr( rSearchOpt.replaceString );
+                aSTxt.ReplaceBackReferences( aReplaceStr, rStr, aResult );
                 pRet = new String( aReplaceStr );
             }
         }

-- 
To view, visit https://gerrit.libreoffice.org/2678
To unsubscribe, visit https://gerrit.libreoffice.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I44d6216b12f17d0560c4e8cf355937797ddeee2a
Gerrit-PatchSet: 1
Gerrit-Project: core
Gerrit-Branch: libreoffice-4-0
Gerrit-Owner: Eike Rathke <erack at redhat.com>



More information about the LibreOffice mailing list