[Libreoffice-commits] core.git: Branch 'distro/mimo/mimo-7-0' - 2 commits - configure.ac external/liborcus
Andras Timar (via logerrit)
logerrit at kemper.freedesktop.org
Mon May 10 21:28:15 UTC 2021
configure.ac | 2
external/liborcus/UnpackedTarball_liborcus.mk | 4
external/liborcus/allow-utf-8-in-xml-names.patch | 301 +++++++++++++++++++++++
3 files changed, 306 insertions(+), 1 deletion(-)
New commits:
commit 8d458fce078cf5904c9af10b655a245972f56d3f
Author: Andras Timar <andras.timar at collabora.com>
AuthorDate: Mon May 10 23:27:34 2021 +0200
Commit: Andras Timar <andras.timar at collabora.com>
CommitDate: Mon May 10 23:27:34 2021 +0200
Bump version to 7.0.7.0.M1
Change-Id: I5198132ce2b586d95c9f5fe023ac9482acddbebe
diff --git a/configure.ac b/configure.ac
index 15a5137f40fe..739c862272ab 100644
--- a/configure.ac
+++ b/configure.ac
@@ -9,7 +9,7 @@ dnl in order to create a configure script.
# several non-alphanumeric characters, those are split off and used only for the
# ABOUTBOXPRODUCTVERSIONSUFFIX in openoffice.lst. Why that is necessary, no idea.
-AC_INIT([LibreOffice],[7.0.7.0.0+],[],[],[http://documentfoundation.org/])
+AC_INIT([LibreOffice],[7.0.7.0.M1],[],[],[http://documentfoundation.org/])
dnl libnumbertext needs autoconf 2.68, but that can pick up autoconf268 just fine if it is installed
dnl whereas aclocal (as run by autogen.sh) insists on using autoconf and fails hard
commit 0e12435559da8d3c10d82941749d804ceddcaeae
Author: Luboš Luňák <l.lunak at collabora.com>
AuthorDate: Thu Apr 29 20:10:34 2021 +0200
Commit: Andras Timar <andras.timar at collabora.com>
CommitDate: Mon May 10 23:22:01 2021 +0200
allow utf-8 in xml names (liborcus) (tdf#141672)
Change-Id: Ib150d55b588a572e4352396f18de2331983b2aae
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/114892
Tested-by: Jenkins
Reviewed-by: Luboš Luňák <l.lunak at collabora.com>
(cherry picked from commit 6b7c2fa65eb68be520ed4135cc245e33fa22e8bf)
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/114915
Tested-by: Jenkins CollaboraOffice <jenkinscollaboraoffice at gmail.com>
Reviewed-by: Andras Timar <andras.timar at collabora.com>
diff --git a/external/liborcus/UnpackedTarball_liborcus.mk b/external/liborcus/UnpackedTarball_liborcus.mk
index 113e8e25818d..3b54db0e49ce 100644
--- a/external/liborcus/UnpackedTarball_liborcus.mk
+++ b/external/liborcus/UnpackedTarball_liborcus.mk
@@ -23,6 +23,10 @@ $(eval $(call gb_UnpackedTarball_add_patches,liborcus,\
external/liborcus/include.patch.0 \
))
+$(eval $(call gb_UnpackedTarball_add_patches,liborcus,\
+ external/liborcus/allow-utf-8-in-xml-names.patch \
+))
+
ifeq ($(OS),WNT)
$(eval $(call gb_UnpackedTarball_add_patches,liborcus,\
external/liborcus/windows-constants-hack.patch \
diff --git a/external/liborcus/allow-utf-8-in-xml-names.patch b/external/liborcus/allow-utf-8-in-xml-names.patch
new file mode 100644
index 000000000000..e3430881053d
--- /dev/null
+++ b/external/liborcus/allow-utf-8-in-xml-names.patch
@@ -0,0 +1,301 @@
+From fa9b6845ed583f5486372c6ffbc59e02a140d303 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak at centrum.cz>
+Date: Thu, 29 Apr 2021 19:12:20 +0200
+Subject: [PATCH] allow utf-8 in xml names (#137)
+
+https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
+has a list of all allowed characters.
+---
+ include/orcus/sax_parser_base.hpp | 3 +
+ src/orcus_test_xml.cpp | 1 +
+ src/parser/sax_parser_base.cpp | 201 ++++++++++++++++++++++++++++--
+ test/xml/non-ascii/check.txt | 4 +
+ test/xml/non-ascii/input.xml | 4 +
+ 5 files changed, 201 insertions(+), 12 deletions(-)
+ create mode 100644 test/xml/non-ascii/check.txt
+ create mode 100644 test/xml/non-ascii/input.xml
+
+diff --git a/include/orcus/sax_parser_base.hpp b/include/orcus/sax_parser_base.hpp
+index 9939e133..8394c07b 100644
+--- a/include/orcus/sax_parser_base.hpp
++++ b/include/orcus/sax_parser_base.hpp
+@@ -218,6 +218,9 @@ protected:
+ void element_name(parser_element& elem, std::ptrdiff_t begin_pos);
+ void attribute_name(pstring& attr_ns, pstring& attr_name);
+ void characters_with_encoded_char(cell_buffer& buf);
++
++ int is_name_char();
++ int is_name_start_char();
+ };
+
+ }}
+diff --git a/src/orcus_test_xml.cpp b/src/orcus_test_xml.cpp
+index 8a864d68..35f3dea7 100644
+--- a/src/orcus_test_xml.cpp
++++ b/src/orcus_test_xml.cpp
+@@ -77,6 +77,7 @@ const char* sax_parser_test_dirs[] = {
+ SRCDIR"/test/xml/no-decl-1/",
+ SRCDIR"/test/xml/underscore-identifier/",
+ SRCDIR"/test/xml/self-closing-root/",
++ SRCDIR"/test/xml/non-ascii/",
+ };
+
+ const char* sax_parser_parse_only_test_dirs[] = {
+diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp
+index 97aa34ec..db51ff94 100644
+--- a/src/parser/sax_parser_base.cpp
++++ b/src/parser/sax_parser_base.cpp
+@@ -328,20 +328,182 @@ bool parser_base::value(pstring& str, bool decode)
+ return transient_stream();
+ }
+
++// https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
++// Return length of the character in bytes, otherwise 0.
++template< bool only_start_name >
++static
++int is_name_char_helper(const char* mp_char, const char* mp_end)
++{
++ const unsigned char first = mp_char[0];
++ // Note that ':' technically is an allowed name character, but it is handled separately
++ // e.g. in element_name(), so here pretend it isn't.
++ if (/*first == ':' ||*/ first == '_' || (first >= 'A' && first <= 'Z') || (first >= 'a' && first <= 'z'))
++ return 1;
++ if (!only_start_name && (first == '-' || first == '.' || (first >= '0' && first <= '9')))
++ return 1;
++
++ if (first < 0x7f) // other ascii characters are not allowed
++ return 0;
++ if (mp_end < mp_char + 1)
++ return 0;
++ const unsigned char second = mp_char[1];
++
++ // 0xb7 = 0xc2 0xb7 utf-8
++ if (!only_start_name && first == 0xc2 && second == 0xb7)
++ return 2;
++
++ // [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
++ // 0xc0 = 0xc3 0x80 utf-8
++ if (first < 0xc3)
++ return 0;
++ // xd7 = 0xc3 0x97 utf-8, 0xf7 = 0xc3 0xb7 utf-8
++ if (first == 0xc3)
++ return second >= 0x80 && second <= 0xff && second != 0x97 && second != 0xb7 ? 2 : 0;
++ // 0x2ff = 0xcb 0xbf utf-8, 0x300 = 0xcc 0x80 utf-8
++ if (first >= 0xc4 && first <= 0xcb)
++ return 2;
++
++ // [#x0300-#x036F]
++ // 0x0300 = 0xcc 0x80 utf-8, 0x36f = 0xcd 0xaf utf-8
++ if (!only_start_name && first == 0xcc)
++ return 2;
++ if (!only_start_name && first == 0xcd && second <= 0xaf)
++ return 2;
++
++ // [#x370-#x37D] | [#x37F-#x1FFF]
++ // 0x370 = 0xcd 0xb0 utf-8, 0x37e = 0xcd 0xbe
++ if (first < 0xcd)
++ return 0;
++ if (first == 0xcd)
++ return second >= 0xb0 && second != 0xbe ? 2 : 0;
++ // 0x07ff = 0xdf 0xbf utf-8 (the last 2-byte utf-8)
++ if (first <= 0xdf)
++ return 2;
++
++ if (first < 0xe0)
++ return 0;
++ if (mp_end < mp_char + 2)
++ return 0;
++ const unsigned char third = mp_char[2];
++
++ // 0x0800 = 0xe0 0xa0 0x80 utf-8, 0x1fff = 0xe1 0xbf 0xbf utf-8, 0x2000 = 0xe2 0x80 0x80
++ if (first == 0xe0 || first == 0xe1)
++ return 3;
++
++ // [#x200C-#x200D]
++ // 0x200c = 0xe2 0x80 0x8c utf-8, 0x200d = 0xe2 0x80 0x8d utf-8
++ if (first < 0xe2)
++ return 0;
++ if (first == 0xe2 && second == 0x80 && (third == 0x8c || third == 0x8d))
++ return 3;
++
++ // [#x203F-#x2040]
++ // 0x203f = 0xe2 0x80 0xbf utf-8, 0x2040 = 0xe2 0x81 0x80 utf-8
++ if (!only_start_name && first == 0xe2 && second == 0x80 && third == 0xbf)
++ return 3;
++ if (!only_start_name && first == 0xe2 && second == 0x81 && third == 0x80)
++ return 3;
++
++ // [#x2070-#x218F]
++ // 0x2070 = 0xe2 0x81 0xb0 utf-8, 0x218f = 0xe2 0x86 0x8f utf-8
++ if (first == 0xe2)
++ {
++ if (second < 0x81)
++ return 0;
++ if (second >= 0x81 && second < 0x86)
++ return 3;
++ if (second == 0x86 && third <= 0x8f)
++ return 3;
++ }
++
++ // [#x2C00-#x2FEF]
++ // 0x2c00 = 0xe2 0xb0 0x80 utf-8, 0x2fef = 0xe2 0xbf 0xaf utf-8
++ if (first == 0xe2)
++ {
++ if (second < 0xb0)
++ return 0;
++ if (second < 0xbf)
++ return 3;
++ if (second == 0xbf && third <= 0xaf)
++ return 3;
++ }
++
++ // [#x3001-#xD7FF]
++ // 0x3001 = 0xe3 0x80 0x81 utf-8, 0xd7ff = 0xed 0x9f 0xbf utf-8, 0xd800 = 0xed 0xa0 0x80 utf-8
++ if (first < 0xe3)
++ return 0;
++ if (first < 0xed)
++ return 3;
++ if (first == 0xed && second <= 0x9f)
++ return 3;
++
++ // [#xF900-#xFDCF]
++ // 0xf900 = 0xef 0xa4 0x80 utf-8, 0xfdcf = 0xef 0xb7 0x8f utf-8
++ if (first == 0xef)
++ {
++ if (second < 0xa4)
++ return 0;
++ if (second < 0xb7)
++ return 3;
++ if (second == 0xb7 && third <= 0x8f)
++ return 3;
++ }
++
++ // [#xFDF0-#xFFFD]
++ // 0xfdf0 = 0xef 0xb7 0xb0 utf-8, 0xfffd = 0xef 0xbf 0xbd utf-8
++ if (first == 0xef)
++ {
++ assert(second >= 0xb7);
++ if (second == 0xb7 && third < 0xb0)
++ return 0;
++ if (second < 0xbe)
++ return 3;
++ if (second == 0xbf && third <= 0xbd)
++ return 3;
++ }
++
++ if (first < 0xf0)
++ return 0;
++ if (mp_end < mp_char + 3)
++ return 0;
++ // const unsigned char fourth = mp_char[3];
++
++ // [#x10000-#xEFFFF]
++ // 0x10000 = 0xf0 0x90 0x80 0x80 utf-8, 0xeffff = 0xf3 0xaf 0xbf 0xbf utf-8,
++ // 0xf0000 = 0xf3 0xb0 0x80 0x80 utf-8
++ if (first >= 0xf0 && first < 0xf2)
++ return 4;
++ if (first == 0xf3 && second < 0xb0)
++ return 4;
++
++ return 0;
++}
++
++int parser_base::is_name_char()
++{
++ return is_name_char_helper<false>(mp_char, mp_end);
++}
++
++int parser_base::is_name_start_char()
++{
++ return is_name_char_helper<true>(mp_char, mp_end);
++}
++
+ void parser_base::name(pstring& str)
+ {
+ const char* p0 = mp_char;
+- char c = cur_char();
+- if (!is_alpha(c) && c != '_')
++ int skip = is_name_start_char();
++ if (skip == 0)
+ {
+ ::std::ostringstream os;
+- os << "name must begin with an alphabet, but got this instead '" << c << "'";
++ os << "name must begin with an alphabet, but got this instead '" << cur_char() << "'";
+ throw malformed_xml_error(os.str(), offset());
+ }
++ next(skip);
+
+ #if defined(__ORCUS_CPU_FEATURES) && defined(__SSE4_2__)
+
+- const __m128i match = _mm_loadu_si128((const __m128i*)"azAZ09--__");
++ const __m128i match = _mm_loadu_si128((const __m128i*)"azAZ09--__..");
+ const int mode = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY;
+
+ size_t n_total = available_size();
+@@ -351,20 +513,35 @@ void parser_base::name(pstring& str)
+ __m128i char_block = _mm_loadu_si128((const __m128i*)mp_char);
+
+ int n = std::min<size_t>(16u, n_total);
+- int r = _mm_cmpestri(match, 10, char_block, n, mode);
++ int r = _mm_cmpestri(match, 12, char_block, n, mode);
+ mp_char += r; // Move the current char position.
++ n_total -= r;
+
+- if (r < 16)
+- // No need to move to the next segment. Stop here.
+- break;
++ if (r < 16 && n_total)
++ {
++ // There is a character that does not match the SSE-based ASCII-only check.
++ // It may either by an ascii character that is not allowed, in which case stop,
++ // or it may possibly be an allowed utf-8 character, in which case move over it
++ // using the slow function.
++ skip = is_name_char();
++ if(skip == 0)
++ break;
++ next(skip);
++ n_total -= skip;
++ }
+
+- // Skip 16 chars to the next segment.
+- n_total -= 16;
+ }
++ cur_char_checked(); // check end of xml stream
+
+ #else
+- while (is_alpha(c) || is_numeric(c) || is_name_char(c))
+- c = next_char_checked();
++ for(;;)
++ {
++ cur_char_checked(); // check end of xml stream
++ skip = is_name_char();
++ if(skip == 0)
++ break;
++ next(skip);
++ }
+ #endif
+
+ str = pstring(p0, mp_char-p0);
+diff --git a/test/xml/non-ascii/check.txt b/test/xml/non-ascii/check.txt
+new file mode 100644
+index 00000000..77b7c003
+--- /dev/null
++++ b/test/xml/non-ascii/check.txt
+@@ -0,0 +1,4 @@
++/Myšička
++/Myšička at jméno="Žužla"
++/Myšička/Nožičky
++/Myšička/Nožičky"4"
+diff --git a/test/xml/non-ascii/input.xml b/test/xml/non-ascii/input.xml
+new file mode 100644
+index 00000000..c516744b
+--- /dev/null
++++ b/test/xml/non-ascii/input.xml
+@@ -0,0 +1,4 @@
++<?xml version="1.0" encoding="UTF-8"?>
++<Myšička jméno="Žužla">
++ <Nožičky>4</Nožičky>
++</Myšička>
+--
+2.26.2
+
More information about the Libreoffice-commits
mailing list