[PATCH libreoffice-4-0] various regex fixes squashed into one
Eike Rathke (via Code Review)
gerrit at gerrit.libreoffice.org
Tue Mar 12 03:52:59 PDT 2013
Hi,
I have submitted a patch for review:
https://gerrit.libreoffice.org/2678
To pull it, you can do:
git pull ssh://gerrit.libreoffice.org:29418/core refs/changes/78/2678/1
various regex fixes squashed into one
i#118925# enhance textsearch's match-group references
to work for look-ahead/look-behind
(cherry picked from commit 3b83c404c56e5db5bab29ffee41f02822410d625)
Conflicts:
sw/source/core/crsr/findtxt.cxx
(cherry picked from commit 9a93475d6eba53b2e1fba1585dbd11c94ea4b4a3)
Conflicts:
sw/source/core/crsr/findtxt.cxx
i#120598 better emulation of regexp word-start and word-end operators
The emulation of the word-start and word-end operators provided by
the previous regexp engine can be approximated much better
by using the ICU regexp engine's powerful look-around feature.
Patch-by: Herbert Duerr
Found-by: ldgolds33 at yahoo.com
(cherry picked from commit ec7ef30693f10315ce80a8f5d7325a0e40855e66)
(cherry picked from commit 8c26876fea085a1bc847abba63dffa97a9499c1d)
i#121482# fix attributed text search for regular expression patterns
(cherry picked from commit e7fc662799e7e936753e24db8d6d3849c12b3ff4)
(cherry picked from commit e6288a5d889da7db5bf23174f85c29ccfcaa44d5)
i#121482# fix backwards regexp search for matches overlapping search start
(cherry picked from commit 854f4ad6c57be62bd922df08f603d8bfb7b918a5)
Additionally fixed unit test, searching backward shall not produce a
different result from searching forward. (erAck)
(cherry picked from commit b514f0ce86e85d9be269ddf2e797befbbf3423f1)
i#121633# fix search for upper-case character classes
in ignore-case regular expressions
(cherry picked from commit b7ee1803453d3a766ce3a833892e1c208aacf8ff)
(cherry picked from commit 531538892795bec909bb8baff6bbf9e19baf809e)
i#121633# fix ignore-case problem caused by i18nsearch API mess
(cherry picked from commit 7644ec176049937b588fe171a553c9a07b375792)
(cherry picked from commit f932a3f1195290f9aa37b593190bd4c6ac5fe2f6)
Change-Id: I44d6216b12f17d0560c4e8cf355937797ddeee2a
---
M i18npool/qa/cppunit/test_textsearch.cxx
M i18npool/source/search/textsearch.cxx
M sw/source/core/crsr/findtxt.cxx
3 files changed, 34 insertions(+), 35 deletions(-)
diff --git a/i18npool/qa/cppunit/test_textsearch.cxx b/i18npool/qa/cppunit/test_textsearch.cxx
index c26550b..d7a6c33 100644
--- a/i18npool/qa/cppunit/test_textsearch.cxx
+++ b/i18npool/qa/cppunit/test_textsearch.cxx
@@ -101,7 +101,7 @@
sal_Int32 startPos = 2, endPos = 20 ;
OUString searchStr( "(ab)*a(c|d)+" );
sal_Int32 fStartRes = 10, fEndRes = 18 ;
- sal_Int32 bStartRes = 18, bEndRes = 14 ;
+ sal_Int32 bStartRes = 18, bEndRes = 10 ;
// set options
util::SearchOptions aOptions;
diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx
index 314dd5b..dceb4d7 100644
--- a/i18npool/source/search/textsearch.cxx
+++ b/i18npool/source/search/textsearch.cxx
@@ -60,7 +60,7 @@
TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
TransliterationModules_ignoreProlongedSoundMark_ja_JP;
static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
-static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
+static const sal_Int32 SIMPLE_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE);
// Above 2 transliteration is simple but need to take effect in
// complex transliteration
@@ -675,21 +675,30 @@
// REG_NOSUB is not used anywhere => not implemented
// NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
// LEV_RELAXED is only used for SearchAlgorithm==Approximate
- // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
- if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
+ // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
+ // probably because the transliteration flag IGNORE_CASE handles it as well.
+ if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0
+ || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0)
nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
UErrorCode nIcuErr = U_ZERO_ERROR;
// assumption: transliteration didn't mangle regexp control chars
IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
#ifndef DISABLE_WORDBOUND_EMULATION
// for conveniance specific syntax elements of the old regex engine are emulated
- // by using regular word boundary matching \b to replace \< and \>
- static const IcuUniString aChevronPattern( "\\\\<|\\\\>", -1, IcuUniString::kInvariant);
- static const IcuUniString aChevronReplace( "\\\\b", -1, IcuUniString::kInvariant);
- static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr);
- aChevronMatcher.reset( aIcuSearchPatStr);
- aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr);
- aChevronMatcher.reset();
+ // - by replacing \< with "word-break followed by a look-ahead word-char"
+ static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
+ static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
+ static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
+ aChevronMatcherB.reset( aIcuSearchPatStr);
+ aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
+ aChevronMatcherB.reset();
+ // - by replacing \> with "look-behind word-char followed by a word-break"
+ static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
+ static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
+ static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
+ aChevronMatcherE.reset( aIcuSearchPatStr);
+ aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
+ aChevronMatcherE.reset();
#endif
pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
if( nIcuErr)
@@ -769,9 +778,15 @@
// find the last match
int nLastPos = 0;
+ int nFoundEnd = 0;
do {
nLastPos = pRegexMatcher->start( nIcuErr);
- } while( pRegexMatcher->find( nLastPos + 1, nIcuErr));
+ nFoundEnd = pRegexMatcher->end( nIcuErr);
+ if( nFoundEnd >= startPos)
+ break;
+ if( nFoundEnd == nLastPos)
+ ++nFoundEnd;
+ } while( pRegexMatcher->find( nFoundEnd, nIcuErr));
// find last match again to get its details
pRegexMatcher->find( nLastPos, nIcuErr);
diff --git a/sw/source/core/crsr/findtxt.cxx b/sw/source/core/crsr/findtxt.cxx
index c1231f9..101b002 100644
--- a/sw/source/core/crsr/findtxt.cxx
+++ b/sw/source/core/crsr/findtxt.cxx
@@ -258,7 +258,7 @@
{
if( pNode->IsTxtNode() )
{
- nTxtLen = ((SwTxtNode*)pNode)->GetTxt().Len();
+ nTxtLen = static_cast<SwTxtNode*>(pNode)->GetTxt().Len();
if( rNdIdx == pPam->GetMark()->nNode )
nEnd = pPam->GetMark()->nContent.GetIndex();
else
@@ -655,30 +655,14 @@
if( pTxtNode && pTxtNode->IsTxtNode() && pTxtNode == pPam->GetCntntNode( sal_False ) )
{
utl::TextSearch aSTxt( rSearchOpt );
- String aStr( pPam->GetTxt() );
- String aReplaceStr( rSearchOpt.replaceString );
- aStr = comphelper::string::remove(aStr, CH_TXTATR_BREAKWORD);
- aStr = comphelper::string::remove(aStr, CH_TXTATR_INWORD);
- xub_StrLen nStart = 0;
- rtl::OUString sX( 'x' );
- if( pPam->Start()->nContent > 0 )
- {
- aStr.Insert( sX, 0 );
- ++nStart;
- }
- xub_StrLen nEnd = aStr.Len();
- bool bDeleteLastX = false;
- if( pPam->End()->nContent < (static_cast<const SwTxtNode*>(pTxtNode))->GetTxt().Len() )
- {
- aStr.Insert( sX );
- bDeleteLastX = true;
- }
+ const String& rStr = static_cast<const SwTxtNode*>(pTxtNode)->GetTxt();
+ xub_StrLen nStart = pPam->Start()->nContent.GetIndex();
+ xub_StrLen nEnd = pPam->End()->nContent.GetIndex();
SearchResult aResult;
- if( aSTxt.SearchFrwrd( aStr, &nStart, &nEnd, &aResult ) )
+ if( aSTxt.SearchFrwrd( rStr, &nStart, &nEnd, &aResult ) )
{
- if( bDeleteLastX )
- aStr.Erase( aStr.Len() - 1 );
- aSTxt.ReplaceBackReferences( aReplaceStr, aStr, aResult );
+ String aReplaceStr( rSearchOpt.replaceString );
+ aSTxt.ReplaceBackReferences( aReplaceStr, rStr, aResult );
pRet = new String( aReplaceStr );
}
}
--
To view, visit https://gerrit.libreoffice.org/2678
To unsubscribe, visit https://gerrit.libreoffice.org/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I44d6216b12f17d0560c4e8cf355937797ddeee2a
Gerrit-PatchSet: 1
Gerrit-Project: core
Gerrit-Branch: libreoffice-4-0
Gerrit-Owner: Eike Rathke <erack at redhat.com>
More information about the LibreOffice
mailing list