[Libreoffice-commits] .: sc/inc sc/source
Kohei Yoshida
kohei at kemper.freedesktop.org
Wed Aug 3 18:34:57 PDT 2011
sc/inc/orcus/README | 5
sc/inc/orcus/css_parser.hpp | 513 +++++++++++++++++++++++++++++++++++++
sc/source/filter/html/htmlimp.cxx | 5
sc/source/filter/html/htmlpars.cxx | 328 +++++++++++++++++++++++
sc/source/filter/inc/htmlimp.hxx | 2
sc/source/filter/inc/htmlpars.hxx | 45 +++
sc/source/filter/rtf/eeimpars.cxx | 24 +
7 files changed, 907 insertions(+), 15 deletions(-)
New commits:
commit d6a0565436c75706189a1dd7e79e93da0f96132b
Author: Kohei Yoshida <kohei.yoshida at suse.com>
Date: Thu Jul 28 00:46:55 2011 -0400
Parse CSS in the <style> content and set number formats to cells.
Pick up number formats specified in the CSS content of Excel
generated HTML documents. This makes use of a template-based CSS
parser from the orcus project.
diff --git a/sc/inc/orcus/README b/sc/inc/orcus/README
new file mode 100644
index 0000000..3ada1c3
--- /dev/null
+++ b/sc/inc/orcus/README
@@ -0,0 +1,5 @@
+The headers in this directory are directly copied from the orcus project[1].
+When modifying any of these files, please ping me so that the changes can be
+upstreamed.
+
+[1] http://gitorious.org/orcus
diff --git a/sc/inc/orcus/css_parser.hpp b/sc/inc/orcus/css_parser.hpp
new file mode 100644
index 0000000..7a1b3e5
--- /dev/null
+++ b/sc/inc/orcus/css_parser.hpp
@@ -0,0 +1,513 @@
+/*************************************************************************
+ *
+ * Copyright (c) 2011 Kohei Yoshida
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ ************************************************************************/
+
+#ifndef __ORCUS_CSS_PARSER_HPP__
+#define __ORCUS_CSS_PARSER_HPP__
+
+#define ORCUS_DEBUG_CSS 0
+
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <string>
+#include <cassert>
+#include <sstream>
+
+#if ORCUS_DEBUG_CSS
+#include <iostream>
+#endif
+
+namespace orcus {
+
+/**
+ * Exception thrown by css_parser when the CSS stream cannot be parsed.
+ * The message given at construction time is returned verbatim by what().
+ */
+class css_parse_error : public std::exception
+{
+public:
+    css_parse_error(const std::string& msg) : m_msg(msg) {}
+
+    virtual ~css_parse_error() throw() {}
+
+    /** @return the message passed to the constructor. */
+    virtual const char* what() const throw()
+    {
+        return m_msg.c_str();
+    }
+
+private:
+    std::string m_msg;
+};
+
+/**
+ * Minimal event-driven CSS parser working on a caller-owned character
+ * buffer.  The buffer is not copied; only a pointer and a length are kept.
+ *
+ * The handler type must provide the member functions invoked by the
+ * parser: begin_parse(), end_parse(), at_rule_name(p, n),
+ * selector_name(p_elem, n_elem, p_class, n_class), property_name(p, n),
+ * value(p, n), begin_property(), end_property(), begin_block() and
+ * end_block().
+ *
+ * parse() throws css_parse_error when the stream is malformed.
+ */
+template<typename _Handler>
+class css_parser
+{
+public:
+    typedef _Handler handler_type;
+
+    /**
+     * @param p   first character of the CSS stream.
+     * @param n   number of characters in the stream.
+     * @param hdl handler receiving the parse events.
+     */
+    css_parser(const char* p, size_t n, handler_type& hdl);
+    void parse();
+
+private:
+    // Handlers - at the time a handler is called the current position is
+    // expected to point to the first unprocessed non-blank character, and
+    // each handler must set the current position to the next unprocessed
+    // non-blank character when it finishes.
+    void rule();
+    void at_rule_name();
+    void selector_name();
+    void property_name();
+    void property();
+    void quoted_value();
+    void value();
+    void name_sep();
+    void property_sep();
+    void block();
+
+    void identifier(const char*& p, size_t& len);
+
+    void skip_blanks();
+    void skip_blanks_reverse();
+    void shrink_stream();
+    void next();
+    char cur_char() const;
+
+    /**
+     * Offset from the current character to the last character of the
+     * stream, i.e. the number of characters left minus one.  Returns 0
+     * when the stream is exhausted; the unguarded subtraction would wrap
+     * around on size_t for an empty or fully consumed stream.
+     */
+    size_t remaining_size() const { return has_char() ? m_length - m_pos - 1 : 0; }
+
+    /** True while the current position is inside the stream. */
+    bool has_char() const { return m_pos < m_length; }
+
+    /**
+     * CSS white space: space, tab, line feed, carriage return and form
+     * feed.  Carriage return matters for stylesheets with CRLF line ends.
+     */
+    static bool is_blank(char c)
+    {
+        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
+    }
+
+    static bool is_alpha(char c)
+    {
+        if ('a' <= c && c <= 'z')
+            return true;
+        if ('A' <= c && c <= 'Z')
+            return true;
+        return false;
+    }
+
+    /** Non-alphanumeric characters that may appear inside a name. */
+    static bool is_name_char(char c)
+    {
+        switch (c)
+        {
+            case '-':
+                return true;
+        }
+
+        return false;
+    }
+
+    static bool is_numeric(char c)
+    {
+        if ('0' <= c && c <= '9')
+            return true;
+        return false;
+    }
+
+    handler_type& m_handler;
+    const char* mp_char;   // current character.
+    size_t m_pos;          // index of the current character in the stream.
+    size_t m_length;       // total stream length, counted from index 0.
+};
+
+/**
+ * Remembers the handler and the caller-owned stream [p, p + n).  The
+ * constructor itself does no parsing.
+ */
+template<typename _Handler>
+css_parser<_Handler>::css_parser(const char* p, size_t n, handler_type& hdl)
+    : m_handler(hdl)
+    , mp_char(p)
+    , m_pos(0)
+    , m_length(n)
+{
+}
+
+/**
+ * Entry point: trims the stream, then reports every rule found in it to
+ * the handler, bracketed by begin_parse() / end_parse().
+ */
+template<typename _Handler>
+void css_parser<_Handler>::parse()
+{
+    shrink_stream();
+
+#if ORCUS_DEBUG_CSS
+    std::cout << "compressed: '";
+    const char* p_dump = mp_char;
+    for (size_t idx = m_pos; idx < m_length; ++idx, ++p_dump)
+        std::cout << *p_dump;
+    std::cout << "'" << std::endl;
+#endif
+    m_handler.begin_parse();
+    while (has_char())
+    {
+        rule();
+        next();
+    }
+    m_handler.end_parse();
+}
+
+/**
+ * Parses rules of the form
+ *
+ *   <name> , ... , <name> { <properties> }
+ *
+ * until the stream is exhausted, dispatching on the current character.
+ * Throws css_parse_error on any character that cannot start a selector,
+ * a name separator or a block.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::rule()
+{
+    while (has_char())
+    {
+        const char c = cur_char();
+
+        if (c == '{')
+        {
+            block();
+            continue;
+        }
+
+        if (c == ',')
+        {
+            name_sep();
+            continue;
+        }
+
+        if (c == '.' || c == '@' || is_alpha(c))
+        {
+            selector_name();
+            continue;
+        }
+
+        std::ostringstream os;
+        os << "failed to parse '" << c << "'";
+        throw css_parse_error(os.str());
+    }
+}
+
+/**
+ * Parses the name following '@' and reports it through
+ * handler.at_rule_name().  The current character must be '@' on entry.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::at_rule_name()
+{
+    assert(has_char());
+    assert(cur_char() == '@');
+
+    // Step over '@'; the name proper must start with a letter.
+    next();
+    if (!is_alpha(cur_char()))
+        throw css_parse_error("first character of an at-rule name must be an alphabet.");
+
+    const char* p_name = NULL;
+    size_t n_name = 0;
+    identifier(p_name, n_name);
+    skip_blanks();
+
+    m_handler.at_rule_name(p_name, n_name);
+#if ORCUS_DEBUG_CSS
+    std::string name(p_name, n_name);
+    std::cout << "at-rule name: " << name.c_str() << std::endl;
+#endif
+}
+
+/**
+ * Parses one selector of the form
+ *
+ *   <element name> '.' <class name>
+ *
+ * where either part may be missing, and reports it through
+ * handler.selector_name().  A missing part is passed as a NULL pointer
+ * with length 0.  A leading '@' is delegated to at_rule_name().
+ */
+template<typename _Handler>
+void css_parser<_Handler>::selector_name()
+{
+    assert(has_char());
+    const char first = cur_char();
+    if (first == '@')
+    {
+        // This is the name of an at-rule.
+        at_rule_name();
+        return;
+    }
+
+    const bool class_only = (first == '.');
+    if (!class_only && !is_alpha(first))
+        throw css_parse_error("first character of a name must be an alphabet or a dot.");
+
+    const char* p_elem = NULL;
+    size_t len_elem = 0;
+    if (!class_only)
+        identifier(p_elem, len_elem);
+
+    const char* p_class = NULL;
+    size_t len_class = 0;
+    if (cur_char() == '.')
+    {
+        // Step over the dot; the class name follows.
+        next();
+        identifier(p_class, len_class);
+    }
+    skip_blanks();
+
+    m_handler.selector_name(p_elem, len_elem, p_class, len_class);
+#if ORCUS_DEBUG_CSS
+    std::string elem_name(p_elem, len_elem), class_name(p_class, len_class);
+    std::cout << "selector name: (element)'" << elem_name.c_str() << "' (class)'" << class_name.c_str() << "'" << std::endl;
+#endif
+}
+
+/**
+ * Parses the name part of a property and reports it through
+ * handler.property_name().  Trailing blanks are consumed.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::property_name()
+{
+    assert(has_char());
+
+    const char first = cur_char();
+    const bool valid_start = is_alpha(first) || first == '.';
+    if (!valid_start)
+        throw css_parse_error("first character of a name must be an alphabet or a dot.");
+
+    const char* p_name = NULL;
+    size_t n_name = 0;
+    identifier(p_name, n_name);
+    skip_blanks();
+
+    m_handler.property_name(p_name, n_name);
+#if ORCUS_DEBUG_CSS
+    std::string name(p_name, n_name);
+    std::cout << "property name: " << name.c_str() << std::endl;
+#endif
+}
+
+/**
+ * Parses one property of the form
+ *
+ *   <name> : <value> , ... , <value>
+ *
+ * bracketed by handler.begin_property() / end_property().  On return the
+ * current character is the terminating ';' or '}' (or the stream is
+ * exhausted); the terminator itself is left for block() to consume.
+ *
+ * Throws css_parse_error when ':' is missing or a value is malformed.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::property()
+{
+    m_handler.begin_property();
+    property_name();
+    if (!has_char() || cur_char() != ':')
+        throw css_parse_error("':' expected.");
+    next();
+    skip_blanks();
+    while (has_char())
+    {
+        value();
+        if (!has_char())
+            // Stream ended right after the value; nothing more to inspect.
+            break;
+
+        const char c = cur_char();
+        if (c == ',')
+        {
+            // separated by commas.
+            next();
+            skip_blanks();
+        }
+        else if (c == ';' || c == '}')
+            // ';' ends this property; '}' ends the block when the last
+            // property has no trailing ';'.  Without the '}' check the
+            // brace would be handed to value() and rejected.
+            break;
+    }
+    skip_blanks();
+    m_handler.end_property();
+}
+
+/**
+ * Parses a double-quoted value and reports the text between the quotes
+ * (quotes excluded, possibly empty) through handler.value().  The current
+ * character must be the opening '"' on entry.
+ *
+ * Throws css_parse_error when the closing quote is missing.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::quoted_value()
+{
+    assert(cur_char() == '"');
+
+    // Step over the opening quote.
+    next();
+
+    // Start with a length of zero so that an empty string ("") stops at
+    // its own closing quote instead of skipping over it.
+    const char* p = mp_char;
+    size_t len = 0;
+    for (; has_char() && cur_char() != '"'; next())
+        ++len;
+
+    if (!has_char())
+        // Ran off the end of the stream without seeing the end quote.
+        throw css_parse_error("end quote has never been reached.");
+
+    // Step over the closing quote.
+    next();
+    skip_blanks();
+
+    m_handler.value(p, len);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p, len);
+    std::cout << "quoted value: " << foo.c_str() << std::endl;
+#endif
+}
+
+/**
+ * Parses a single property value and reports it through handler.value().
+ * A value starting with '"' is delegated to quoted_value(); otherwise the
+ * value is the longest run of letters, digits, name characters and dots.
+ *
+ * Throws css_parse_error when the first character cannot start a value.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::value()
+{
+    assert(has_char());
+    const char first = cur_char();
+    if (first == '"')
+    {
+        quoted_value();
+        return;
+    }
+
+    const bool legal_first =
+        is_alpha(first) || is_numeric(first) || first == '-' || first == '+' || first == '.';
+    if (!legal_first)
+    {
+        std::ostringstream os;
+        os << "illegal first character of a value '" << first << "'";
+        throw css_parse_error(os.str());
+    }
+
+    const char* p_val = mp_char;
+    size_t n_val = 1;
+    next();
+    while (has_char())
+    {
+        const char ch = cur_char();
+        const bool part_of_value =
+            is_alpha(ch) || is_name_char(ch) || is_numeric(ch) || ch == '.';
+        if (!part_of_value)
+            break;
+        ++n_val;
+        next();
+    }
+    skip_blanks();
+
+    m_handler.value(p_val, n_val);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p_val, n_val);
+    std::cout << "value: " << foo.c_str() << std::endl;
+#endif
+}
+
+/**
+ * Consumes the ',' between two selector names together with any blanks
+ * that follow it.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::name_sep()
+{
+    assert(cur_char() == ',');
+
+#if ORCUS_DEBUG_CSS
+    std::cout << "," << std::endl;
+#endif
+
+    next();         // the comma itself
+    skip_blanks();  // blanks before the next name
+}
+
+/**
+ * Consumes the current character (block() calls this only on ';') and
+ * any blanks that follow it.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::property_sep()
+{
+
+#if ORCUS_DEBUG_CSS
+    std::cout << ";" << std::endl;
+#endif
+
+    next();         // the separator itself
+    skip_blanks();  // blanks before the next property
+}
+
+/**
+ * Parses a declaration block of the form
+ *
+ *   '{' <property> ';' ... ';' <property> '}'
+ *
+ * bracketed by handler.begin_block() / end_block().  The block may be
+ * empty, and a ';' after the last property is optional.  The current
+ * character must be '{' on entry.
+ *
+ * Throws css_parse_error when the closing '}' is missing.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::block()
+{
+    assert(cur_char() == '{');
+#if ORCUS_DEBUG_CSS
+    std::cout << "{" << std::endl;
+#endif
+    m_handler.begin_block();
+
+    next();
+    skip_blanks();
+
+    // parse properties.  Testing for '}' up front covers both an empty
+    // block and a ';' after the last property.
+    while (has_char() && cur_char() != '}')
+    {
+        property();
+        if (!has_char() || cur_char() != ';')
+            break;
+        property_sep();
+    }
+
+    if (!has_char() || cur_char() != '}')
+        throw css_parse_error("} expected.");
+
+    m_handler.end_block();
+
+    next();
+    skip_blanks();
+
+#if ORCUS_DEBUG_CSS
+    std::cout << "}" << std::endl;
+#endif
+}
+
+/**
+ * Reads an identifier starting at the current character.
+ *
+ * The first character is accepted unconditionally (callers validate it);
+ * the identifier then extends over letters, digits and name characters.
+ *
+ * @param p   receives the pointer to the first character.
+ * @param len receives the identifier length.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::identifier(const char*& p, size_t& len)
+{
+    p = mp_char;
+    len = 1;
+
+    next();
+    while (has_char())
+    {
+        const char ch = cur_char();
+        const bool belongs = is_alpha(ch) || is_name_char(ch) || is_numeric(ch);
+        if (!belongs)
+            break;
+        ++len;
+        next();
+    }
+}
+
+/**
+ * Advances the current position to the next non-blank character, or to
+ * the end of the stream when only blanks are left.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::skip_blanks()
+{
+    while (has_char() && is_blank(*mp_char))
+        next();
+}
+
+/**
+ * Trims trailing blanks by shortening the stream length.  Scanning starts
+ * at the last character and stops at the first non-blank one, or when the
+ * current position is reached.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::skip_blanks_reverse()
+{
+    const char* p_last = mp_char + remaining_size();
+    while (p_last != mp_char && is_blank(*p_last))
+    {
+        --p_last;
+        --m_length;
+    }
+}
+
+/**
+ * Narrows the stream to its significant content before parsing: leading
+ * and trailing blanks are dropped, and so is an enclosing HTML comment
+ * (a leading "<!--" and, if that was found, a trailing "-->").
+ *
+ * The stream is left untouched beyond blank trimming when it does not
+ * start with a complete "<!--".
+ */
+template<typename _Handler>
+void css_parser<_Handler>::shrink_stream()
+{
+    // Skip any leading blanks.
+    skip_blanks();
+
+    // Nothing left (empty or all-blank stream), or a single character:
+    // bail out before doing any arithmetic relative to the stream end.
+    if (!has_char() || !remaining_size())
+        return;
+
+    // Skip any trailing blanks.
+    skip_blanks_reverse();
+
+    // Skip leading <!-- if present.
+
+    const char* com_open = "<!--";
+    const size_t com_open_len = std::strlen(com_open);
+    if (remaining_size() < com_open_len)
+        // Not enough stream left.  Bail out.
+        return;
+
+    // Compare first and advance only on a full match, so that a partial
+    // match (e.g. "<!-x") does not eat characters from the stream.
+    if (std::strncmp(mp_char, com_open, com_open_len) != 0)
+        return;
+
+    mp_char += com_open_len;
+    m_pos += com_open_len;
+
+    // Skip leading blanks once again.
+    skip_blanks();
+    if (!has_char())
+        return;
+
+    // Skip trailing --> if present.
+    const char* com_close = "-->";
+    const size_t com_close_len = std::strlen(com_close);
+    const size_t n = remaining_size();
+    if (n < com_close_len)
+        // Not enough stream left.  Bail out.
+        return;
+
+    // mp_char + n is the last character; step back to where "-->" would
+    // have to start.
+    const char* p_tail = mp_char + n + 1 - com_close_len;
+    if (std::strncmp(p_tail, com_close, com_close_len) != 0)
+        return;
+
+    m_length -= com_close_len;
+
+    skip_blanks_reverse();
+}
+
+/**
+ * Moves the current position one character forward.  No bounds check is
+ * done here; callers test has_char().
+ */
+template<typename _Handler>
+void css_parser<_Handler>::next()
+{
+    ++mp_char;
+    ++m_pos;
+}
+
+/**
+ * Returns the character at the current position.  No bounds check is
+ * done here; callers test has_char().
+ */
+template<typename _Handler>
+char css_parser<_Handler>::cur_char() const
+{
+    return mp_char[0];
+}
+
+}
+
+#endif
diff --git a/sc/source/filter/html/htmlimp.cxx b/sc/source/filter/html/htmlimp.cxx
index c0f052a..c174b57 100644
--- a/sc/source/filter/html/htmlimp.cxx
+++ b/sc/source/filter/html/htmlimp.cxx
@@ -77,7 +77,7 @@ ScEEAbsImport *ScFormatFilterPluginImpl::CreateHTMLImport( ScDocument* pDocP, co
return new ScHTMLImport( pDocP, rBaseURL, rRange, bCalcWidthHeight );
}
-ScHTMLImport::ScHTMLImport( ScDocument* pDocP, const String& rBaseURL, const ScRange& rRange, sal_Bool bCalcWidthHeight ) :
+ScHTMLImport::ScHTMLImport( ScDocument* pDocP, const String& rBaseURL, const ScRange& rRange, bool bCalcWidthHeight ) :
ScEEImport( pDocP, rRange )
{
Size aPageSize;
@@ -150,8 +150,7 @@ void ScHTMLImport::WriteToDocument(
pGlobTable->ApplyCellBorders( mpDoc, maRange.aStart );
// correct cell borders for merged cells
- size_t ListSize = pParser->ListSize();
- for ( size_t i = 0; i < ListSize; ++i )
+ for ( size_t i = 0, n = pParser->ListSize(); i < n; ++i )
{
const ScEEParseEntry* pEntry = pParser->ListEntry( i );
if( (pEntry->nColOverlap > 1) || (pEntry->nRowOverlap > 1) )
diff --git a/sc/source/filter/html/htmlpars.cxx b/sc/source/filter/html/htmlpars.cxx
index 4105782..18bf94a 100644
--- a/sc/source/filter/html/htmlpars.cxx
+++ b/sc/source/filter/html/htmlpars.cxx
@@ -49,6 +49,7 @@
#include <editeng/justifyitem.hxx>
#include <sfx2/objsh.hxx>
#include <svl/eitem.hxx>
+#include <svl/intitem.hxx>
#include <svtools/filter.hxx>
#include <svtools/parhtml.hxx>
#include <svtools/htmlkywd.hxx>
@@ -64,12 +65,125 @@
#include "document.hxx"
#include "rangelst.hxx"
+#include <orcus/css_parser.hpp>
+
#include <com/sun/star/document/XDocumentProperties.hpp>
#include <com/sun/star/document/XDocumentPropertiesSupplier.hpp>
using ::editeng::SvxBorderLine;
using namespace ::com::sun::star;
+/**
+ * Stores one property for a selector.  Element and class names are
+ * lower-cased before being used as keys.
+ *
+ * A NULL pElemName / pClassName means that part of the selector is
+ * absent:
+ *  - class only       -> class-wide store (maGlobalProps)
+ *  - element only     -> element-wide store (maElemGlobalProps)
+ *  - element + class  -> element/class store (maElemProps)
+ *  - neither          -> nothing is stored
+ */
+void ScHTMLStyles::add(const char* pElemName, size_t nElemName, const char* pClassName, size_t nClassName,
+                       const rtl::OUString& aProp, const rtl::OUString& aValue)
+{
+    if (!pElemName)
+    {
+        if (!pClassName)
+            return;
+
+        // Class name only. Add it to the global.
+        rtl::OUString aClassOnly(pClassName, nClassName, RTL_TEXTENCODING_UTF8);
+        aClassOnly = aClassOnly.toAsciiLowerCase();
+        insertProp(maGlobalProps, aClassOnly, aProp, aValue);
+        return;
+    }
+
+    rtl::OUString aElem(pElemName, nElemName, RTL_TEXTENCODING_UTF8);
+    aElem = aElem.toAsciiLowerCase();
+
+    if (!pClassName)
+    {
+        // Element name only. Add it to the element global.
+        insertProp(maElemGlobalProps, aElem, aProp, aValue);
+        return;
+    }
+
+    // Both element and class names given.
+    ElemsType::iterator itElem = maElemProps.find(aElem);
+    if (itElem == maElemProps.end())
+    {
+        // First time this element is seen: create its class map.
+        std::auto_ptr<NamePropsType> pNew(new NamePropsType);
+        std::pair<ElemsType::iterator, bool> aRes = maElemProps.insert(aElem, pNew);
+        if (!aRes.second)
+            // insertion failed.
+            return;
+        itElem = aRes.first;
+    }
+
+    rtl::OUString aClass(pClassName, nClassName, RTL_TEXTENCODING_UTF8);
+    aClass = aClass.toAsciiLowerCase();
+    insertProp(*itElem->second, aClass, aProp, aValue);
+}
+
+/**
+ * Finds the best-matching value of a property for an element/class pair.
+ *
+ * The stores are searched from most to least specific:
+ *  1. element + class (maElemProps),
+ *  2. class for all elements (maGlobalProps),
+ *  3. element without class (maElemGlobalProps).
+ *
+ * Names are compared as given; add() stores element and class names in
+ * lower case.
+ *
+ * @return the stored value, or a reference to an empty string when no
+ *         store has the property.
+ */
+const rtl::OUString& ScHTMLStyles::getPropertyValue(
+    const rtl::OUString& rElem, const rtl::OUString& rClass, const rtl::OUString& rPropName) const
+{
+    // First, look into the element-class storage.
+    {
+        ElemsType::const_iterator itr = maElemProps.find(rElem);
+        if (itr != maElemProps.end())
+        {
+            const NamePropsType* pClasses = itr->second;
+            NamePropsType::const_iterator itr2 = pClasses->find(rClass);
+            if (itr2 != pClasses->end())
+            {
+                const PropsType* pProps = itr2->second;
+                PropsType::const_iterator itr3 = pProps->find(rPropName);
+                if (itr3 != pProps->end())
+                    return itr3->second;
+            }
+        }
+    }
+    // Next, look into the class global storage.
+    {
+        NamePropsType::const_iterator itr = maGlobalProps.find(rClass);
+        if (itr != maGlobalProps.end())
+        {
+            const PropsType* pProps = itr->second;
+            PropsType::const_iterator itr2 = pProps->find(rPropName);
+            if (itr2 != pProps->end())
+                return itr2->second;
+        }
+    }
+    // As the last resort, look into the element global storage.  add()
+    // keys this store by element name, so it has to be searched with
+    // rElem; searching it with the class name never matches element-only
+    // rules such as "td { ... }".
+    {
+        NamePropsType::const_iterator itr = maElemGlobalProps.find(rElem);
+        if (itr != maElemGlobalProps.end())
+        {
+            const PropsType* pProps = itr->second;
+            PropsType::const_iterator itr2 = pProps->find(rPropName);
+            if (itr2 != pProps->end())
+                return itr2->second;
+        }
+    }
+
+    return maEmpty; // nothing found.
+}
+
+/**
+ * Adds the pair (aProp, aValue) to the property set stored under aName
+ * in rStore, creating that property set first when aName is new.  The
+ * pair goes in through the map's insert(), so a value already stored for
+ * aProp is presumably kept rather than replaced.
+ */
+void ScHTMLStyles::insertProp(
+    NamePropsType& rStore, const rtl::OUString& aName,
+    const rtl::OUString& aProp, const rtl::OUString& aValue)
+{
+    NamePropsType::iterator itPos = rStore.find(aName);
+    const bool bKnownName = (itPos != rStore.end());
+    if (!bKnownName)
+    {
+        // No property set for this name yet: create an empty one.
+        std::auto_ptr<PropsType> pNew(new PropsType);
+        std::pair<NamePropsType::iterator, bool> aRes = rStore.insert(aName, pNew);
+        if (!aRes.second)
+            // insertion failed.
+            return;
+
+        itPos = aRes.first;
+    }
+
+    itPos->second->insert(PropsType::value_type(aProp, aValue));
+}
SV_IMPL_VARARR_SORT( ScHTMLColOffset, sal_uLong );
@@ -91,10 +205,21 @@ ScHTMLParser::~ScHTMLParser()
{
}
+/** Returns a mutable reference to the maStyles member of this parser. */
+ScHTMLStyles& ScHTMLParser::GetStyles()
+{
+ return maStyles;
+}
+
+/**
+ * Returns the document mpDoc points to.  mpDoc is dereferenced without a
+ * null check.
+ */
+ScDocument& ScHTMLParser::GetDoc()
+{
+ return *mpDoc;
+}
// ============================================================================
-ScHTMLLayoutParser::ScHTMLLayoutParser( EditEngine* pEditP, const String& rBaseURL, const Size& aPageSizeP, ScDocument* pDocP ) :
+ScHTMLLayoutParser::ScHTMLLayoutParser(
+ EditEngine* pEditP, const String& rBaseURL, const Size& aPageSizeP,
+ ScDocument* pDocP ) :
ScHTMLParser( pEditP, pDocP ),
aPageSize( aPageSizeP ),
aBaseURL( rBaseURL ),
@@ -1867,6 +1992,7 @@ ScHTMLTable::ScHTMLTable( ScHTMLTable& rParentTable, const ImportInfo& rInfo, bo
mrEEParseList( rParentTable.mrEEParseList ),
mpCurrEntryList( 0 ),
maSize( 1, 1 ),
+ mpParser(rParentTable.mpParser),
mbBorderOn( false ),
mbPreFormText( bPreFormText ),
mbRowOn( false ),
@@ -1902,7 +2028,7 @@ ScHTMLTable::ScHTMLTable(
SfxItemPool& rPool,
EditEngine& rEditEngine,
::std::vector< ScEEParseEntry* >& rEEParseList,
- ScHTMLTableId& rnUnusedId
+ ScHTMLTableId& rnUnusedId, ScHTMLParser* pParser
) :
mpParentTable( 0 ),
maTableId( rnUnusedId ),
@@ -1911,6 +2037,7 @@ ScHTMLTable::ScHTMLTable(
mrEEParseList( rEEParseList ),
mpCurrEntryList( 0 ),
maSize( 1, 1 ),
+ mpParser(pParser),
mbBorderOn( false ),
mbPreFormText( false ),
mbRowOn( false ),
@@ -2044,6 +2171,52 @@ void ScHTMLTable::RowOff( const ImportInfo& rInfo )
CreateNewEntry( rInfo );
}
+namespace {
+
+/**
+ * Decode a number format string stored in Excel-generated HTML's CSS
+ * region.
+ *
+ * A backslash followed by a run of digits is replaced by the single
+ * character whose code is that run read as a hexadecimal number.  A
+ * backslash followed by anything else is dropped and the character after
+ * it is kept.  All other characters are copied unchanged.
+ *
+ * NOTE(review): only '0'-'9' are accepted as escape digits although the
+ * run is converted as hexadecimal, so the letters a-f never take part in
+ * an escape - confirm this matches what Excel writes.
+ *
+ * @param rFmt encoded number format string.
+ * @return the decoded format string.
+ */
+rtl::OUString decodeNumberFormat(const rtl::OUString& rFmt)
+{
+ rtl::OUStringBuffer aBuf;
+ const sal_Unicode* p = rFmt.getStr();
+ sal_Int32 n = rFmt.getLength();
+ for (sal_Int32 i = 0; i < n; ++i, ++p)
+ {
+ if (*p == '\\')
+ {
+ // Skip '\'.
+ ++i;
+ ++p;
+
+ // Parse all subsequent digits until first non-digit is found.
+ sal_Int32 nDigitCount = 0;
+ const sal_Unicode* p1 = p;
+ for (; i < n; ++i, ++p, ++nDigitCount)
+ {
+ if (*p < '0' || '9' < *p)
+ {
+ // Step back by one so that the increment of the outer loop
+ // lands on this non-digit and it is handled by the next
+ // outer iteration.
+ --i;
+ --p;
+ break;
+ }
+
+ }
+ if (nDigitCount)
+ {
+ // The digit run [p1, p1 + nDigitCount) is a hexadecimal code.
+ sal_Int32 nVal = rtl::OUString(p1, nDigitCount).toInt32(16);
+ aBuf.append(static_cast<sal_Unicode>(nVal));
+ }
+ }
+ else
+ aBuf.append(*p);
+ }
+ return aBuf.makeStringAndClear();
+}
+
+}
+
void ScHTMLTable::DataOn( const ImportInfo& rInfo )
{
PushEntry( rInfo, true );
@@ -2072,6 +2245,38 @@ void ScHTMLTable::DataOn( const ImportInfo& rInfo )
}
ImplDataOn( aSpanSize );
+
+ const HTMLOptions& rOptions = static_cast<HTMLParser*>(rInfo.pParser)->GetOptions();
+ HTMLOptions::const_iterator itr = rOptions.begin(), itrEnd = rOptions.end();
+ for (; itr != itrEnd; ++itr)
+ {
+ if (itr->GetToken() == HTML_O_CLASS)
+ {
+ // This <td> has class property. Pick up the number format
+ // associated with this class (if any).
+ rtl::OUString aElem(RTL_CONSTASCII_USTRINGPARAM("td"));
+ rtl::OUString aClass = itr->GetString();
+ rtl::OUString aProp(RTL_CONSTASCII_USTRINGPARAM("mso-number-format"));
+ const ScHTMLStyles& rStyles = mpParser->GetStyles();
+ const rtl::OUString& rVal = rStyles.getPropertyValue(aElem, aClass, aProp);
+ rtl::OUString aNumFmt = decodeNumberFormat(rVal);
+
+ sal_uInt32 nNumberFormat = GetFormatTable()->GetEntryKey(aNumFmt);
+ bool bValidFmt = false;
+ if ( nNumberFormat == NUMBERFORMAT_ENTRY_NOT_FOUND )
+ {
+ xub_StrLen nErrPos = 0;
+ short nDummy;
+ bValidFmt = GetFormatTable()->PutEntry(aNumFmt, nErrPos, nDummy, nNumberFormat);
+ }
+ else
+ bValidFmt = true;
+
+ if (bValidFmt)
+ mxDataItemSet->Put( SfxUInt32Item(ATTR_VALUE_FORMAT, nNumberFormat) );
+ }
+ }
+
ProcessFormatOptions( *mxDataItemSet, rInfo );
CreateNewEntry( rInfo );
mxCurrEntry->pValStr = pValStr.release();
@@ -2224,6 +2429,11 @@ void ScHTMLTable::ApplyCellBorders( ScDocument* pDoc, const ScAddress& rFirstPos
aIter->ApplyCellBorders( pDoc, rFirstPos );
}
+/**
+ * Returns whatever GetFormatTable() of the parser's document returns.
+ * mpParser is dereferenced without a null check.
+ */
+SvNumberFormatter* ScHTMLTable::GetFormatTable()
+{
+ return mpParser->GetDoc().GetFormatTable();
+}
+
// ----------------------------------------------------------------------------
bool ScHTMLTable::IsEmptyCell() const
@@ -2690,9 +2900,10 @@ ScHTMLGlobalTable::ScHTMLGlobalTable(
SfxItemPool& rPool,
EditEngine& rEditEngine,
::std::vector< ScEEParseEntry* >& rEEParseList,
- ScHTMLTableId& rnUnusedId
+ ScHTMLTableId& rnUnusedId,
+ ScHTMLParser* pParser
) :
- ScHTMLTable( rPool, rEditEngine, rEEParseList, rnUnusedId )
+ ScHTMLTable( rPool, rEditEngine, rEEParseList, rnUnusedId, pParser )
{
}
@@ -2717,7 +2928,8 @@ ScHTMLQueryParser::ScHTMLQueryParser( EditEngine* pEditEngine, ScDocument* pDoc
mnUnusedId( SC_HTML_GLOBAL_TABLE ),
mbTitleOn( false )
{
- mxGlobTable.reset( new ScHTMLGlobalTable( *pPool, *pEdit, maList, mnUnusedId ) );
+ mxGlobTable.reset(
+ new ScHTMLGlobalTable(*pPool, *pEdit, maList, mnUnusedId, this));
mpCurrTable = mxGlobTable.get();
}
@@ -2779,6 +2991,9 @@ void ScHTMLQueryParser::ProcessToken( const ImportInfo& rInfo )
case HTML_TITLE_ON: TitleOn( rInfo ); break; // <title>
case HTML_TITLE_OFF: TitleOff( rInfo ); break; // </title>
+ case HTML_STYLE_ON: break;
+ case HTML_STYLE_OFF: ParseStyle(rInfo.aText); break;
+
// --- body handling ---
case HTML_BODY_ON: mpCurrTable->BodyOn( rInfo ); break; // <body>
case HTML_BODY_OFF: mpCurrTable->BodyOff( rInfo ); break; // </body>
@@ -2956,6 +3171,109 @@ void ScHTMLQueryParser::CloseTable( const ImportInfo& rInfo )
mpCurrTable = mpCurrTable->CloseTable( rInfo );
}
+namespace {
+
+/**
+ * Handler class for the CSS parser.
+ */
+/**
+ * Handler class for the CSS parser.
+ *
+ * Collects the selector names of the current rule and, each time a
+ * property ends, stores that property in ScHTMLStyles once per collected
+ * selector.  Only the most recently reported value of a property is kept.
+ */
+class CSSHandler
+{
+    /** Non-owning (pointer, length) pair referring to parser input. */
+    struct MemStr
+    {
+        const char* mp;
+        size_t mn;
+
+        MemStr() : mp(NULL), mn(0) {}
+        MemStr(const char* p, size_t n) : mp(p), mn(n) {}
+    };
+
+    typedef std::pair<MemStr, MemStr> SelectorName; // element : class
+    typedef std::vector<SelectorName> SelectorNames;
+
+    SelectorNames maSelectorNames; /// current selector names.
+    MemStr maPropName;             /// current property name.
+    MemStr maPropValue;            /// current property value.
+
+    ScHTMLStyles& mrStyles;
+
+public:
+    CSSHandler(ScHTMLStyles& rStyles) : mrStyles(rStyles) {}
+
+    void begin_parse() {}
+
+    void end_parse() {}
+
+    void at_rule_name(const char* /*p*/, size_t /*n*/)
+    {
+        // For now, we ignore at-rule properties.
+    }
+
+    /** Remembers one selector of the rule being parsed. */
+    void selector_name(const char* p_elem, size_t n_elem, const char* p_class, size_t n_class)
+    {
+        maSelectorNames.push_back(
+            SelectorName(MemStr(p_elem, n_elem), MemStr(p_class, n_class)));
+    }
+
+    void begin_block() {}
+
+    /** The rule is complete; its selectors no longer apply. */
+    void end_block()
+    {
+        maSelectorNames.clear();
+    }
+
+    void begin_property() {}
+
+    void property_name(const char* p, size_t n)
+    {
+        maPropName = MemStr(p, n);
+    }
+
+    void value(const char* p, size_t n)
+    {
+        maPropValue = MemStr(p, n);
+    }
+
+    /** Stores the current property for every selector of the rule. */
+    void end_property()
+    {
+        for (size_t i = 0, nCount = maSelectorNames.size(); i < nCount; ++i)
+        {
+            // Add this property to the collection for each selector.
+            const MemStr& rElem = maSelectorNames[i].first;
+            const MemStr& rClass = maSelectorNames[i].second;
+            rtl::OUString aName(maPropName.mp, maPropName.mn, RTL_TEXTENCODING_UTF8);
+            rtl::OUString aValue(maPropValue.mp, maPropValue.mn, RTL_TEXTENCODING_UTF8);
+            mrStyles.add(rElem.mp, rElem.mn, rClass.mp, rClass.mn, aName, aValue);
+        }
+
+        // Reset for the next property.
+        maPropName = MemStr();
+        maPropValue = MemStr();
+    }
+};
+
+}
+
+/**
+ * Parses the text of a <style> element as CSS and records the properties
+ * found in the style collection of this parser.
+ *
+ * A CSS parse error is swallowed; whatever was recorded before the error
+ * stays in the collection.
+ */
+void ScHTMLQueryParser::ParseStyle(const rtl::OUString& rStrm)
+{
+    // The CSS parser reads bytes, so it is given a UTF-8 copy of the text.
+    // The copy must outlive the parser, which only keeps a pointer to it.
+    const rtl::OString aUtf8 = rtl::OUStringToOString(rStrm, RTL_TEXTENCODING_UTF8);
+
+    CSSHandler aHandler(GetStyles());
+    orcus::css_parser<CSSHandler> aCssParser(aUtf8.getStr(), aUtf8.getLength(), aHandler);
+    try
+    {
+        aCssParser.parse();
+    }
+    catch (const orcus::css_parse_error&)
+    {
+        // Parsing of CSS failed.  Do nothing for now.
+    }
+}
+
// ----------------------------------------------------------------------------
IMPL_LINK( ScHTMLQueryParser, HTMLImportHdl, const ImportInfo*, pInfo )
diff --git a/sc/source/filter/inc/htmlimp.hxx b/sc/source/filter/inc/htmlimp.hxx
index 355a2dd..ecb6c19 100644
--- a/sc/source/filter/inc/htmlimp.hxx
+++ b/sc/source/filter/inc/htmlimp.hxx
@@ -39,7 +39,7 @@ private:
static void InsertRangeName( ScDocument* pDoc, const String& rName, const ScRange& rRange );
public:
- ScHTMLImport( ScDocument* pDoc, const String& rBaseURL, const ScRange& rRange, sal_Bool bCalcWidthHeight = sal_True );
+ ScHTMLImport( ScDocument* pDoc, const String& rBaseURL, const ScRange& rRange, bool bCalcWidthHeight );
virtual ~ScHTMLImport();
const ScHTMLParser* GetParser() const { return (ScHTMLParser*)mpParser; }
diff --git a/sc/source/filter/inc/htmlpars.hxx b/sc/source/filter/inc/htmlpars.hxx
index c1ab90e..85dbd99 100644
--- a/sc/source/filter/inc/htmlpars.hxx
+++ b/sc/source/filter/inc/htmlpars.hxx
@@ -35,6 +35,8 @@
#include <vector>
#include <list>
#include <map>
+#include <boost/ptr_container/ptr_map.hpp>
+#include <boost/unordered_map.hpp>
#include "rangelst.hxx"
#include "eeparser.hxx"
@@ -51,9 +53,40 @@ const sal_uInt16 SC_HTML_OFFSET_TOLERANCE_LARGE = 10; // nested
class ScHTMLTable;
+/**
+ * Collection of HTML style data parsed from the content of <style>
+ * elements.
+ *
+ * Properties are kept in three stores depending on which parts of the
+ * selector were given: class only, element only, or element plus class.
+ */
+class ScHTMLStyles
+{
+ /// property name -> property value.
+ typedef ::boost::unordered_map<rtl::OUString, rtl::OUString, rtl::OUStringHash> PropsType;
+ /// element or class name -> its properties.
+ typedef ::boost::ptr_map<rtl::OUString, PropsType> NamePropsType;
+ /// element name -> (class name -> properties).
+ typedef ::boost::ptr_map<rtl::OUString, NamePropsType> ElemsType;
+
+ NamePropsType maGlobalProps; /// global properties (for a given class for all elements)
+ NamePropsType maElemGlobalProps; /// element global properties (no class specified)
+ ElemsType maElemProps; /// element to class to properties (both element and class are given)
+ const rtl::OUString maEmpty; /// just a persistent empty string.
+public:
+ /**
+ * Store one property for a selector.  pElemName and/or pClassName may
+ * be NULL when that part of the selector is absent; the n* arguments
+ * give the lengths of the corresponding names.
+ */
+ void add(const char* pElemName, size_t nElemName, const char* pClassName, size_t nClassName,
+ const rtl::OUString& aProp, const rtl::OUString& aValue);
+
+ /**
+ * Find best-matching property value for given element and class names.
+ * The element/class store is consulted first, then the class-wide
+ * store, then the element-wide store.  A reference to an empty string
+ * is returned when nothing is found.
+ */
+ const rtl::OUString& getPropertyValue(
+ const rtl::OUString& rElem, const rtl::OUString& rClass, const rtl::OUString& rPropName) const;
+
+private:
+ /**
+ * Add the pair (aProp, aValue) to the property set stored under aName
+ * in rProps, creating that set when it does not exist yet.
+ */
+ static void insertProp(
+ NamePropsType& rProps, const rtl::OUString& aName,
+ const rtl::OUString& aProp, const rtl::OUString& aValue);
+};
+
/** Base class for HTML parser classes. */
class ScHTMLParser : public ScEEParser
{
+ ScHTMLStyles maStyles;
protected:
sal_uInt32 maFontHeights[ SC_HTML_FONTSIZES ];
ScDocument* mpDoc; /// The destination document.
@@ -64,6 +97,9 @@ public:
virtual sal_uLong Read( SvStream& rStrm, const String& rBaseURL ) = 0;
+ ScHTMLStyles& GetStyles();
+ ScDocument& GetDoc();
+
/** Returns the "global table" which contains the entire HTML document. */
virtual const ScHTMLTable* GetGlobalTable() const = 0;
};
@@ -436,6 +472,8 @@ public:
/** Applies border formatting to the passed document. */
void ApplyCellBorders( ScDocument* pDoc, const ScAddress& rFirstPos ) const;
+ SvNumberFormatter* GetFormatTable();
+
protected:
/** Creates a new HTML table without parent.
@descr This constructor is used to create the "global table". */
@@ -443,7 +481,7 @@ protected:
SfxItemPool& rPool,
EditEngine& rEditEngine,
::std::vector< ScEEParseEntry* >& rEEParseList,
- ScHTMLTableId& rnUnusedId );
+ ScHTMLTableId& rnUnusedId, ScHTMLParser* pParser );
/** Fills all empty cells in this and nested tables with dummy parse entries. */
void FillEmptyCells();
@@ -550,6 +588,7 @@ private:
ScHTMLSize maSize; /// Size of the table.
ScHTMLPos maCurrCell; /// Address of current cell to fill.
ScHTMLPos maDocBasePos; /// Resulting base address in a Calc document.
+ ScHTMLParser* mpParser;
bool mbBorderOn; /// true = Table borders on.
bool mbPreFormText; /// true = Table from preformatted text (<pre> tag).
bool mbRowOn; /// true = Inside of <tr> </tr>.
@@ -567,7 +606,7 @@ public:
SfxItemPool& rPool,
EditEngine& rEditEngine,
::std::vector< ScEEParseEntry* >& rEEParseList,
- ScHTMLTableId& rnUnusedId );
+ ScHTMLTableId& rnUnusedId, ScHTMLParser* pParser );
virtual ~ScHTMLGlobalTable();
@@ -620,6 +659,8 @@ private:
/** Closes the current table, regardless on opening tag. */
void CloseTable( const ImportInfo& rInfo );
+ void ParseStyle(const rtl::OUString& rStrm);
+
DECL_LINK( HTMLImportHdl, const ImportInfo* );
private:
diff --git a/sc/source/filter/rtf/eeimpars.cxx b/sc/source/filter/rtf/eeimpars.cxx
index 6aae696..933e54c 100644
--- a/sc/source/filter/rtf/eeimpars.cxx
+++ b/sc/source/filter/rtf/eeimpars.cxx
@@ -160,7 +160,7 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
}
ScDocumentPool* pDocPool = mpDoc->GetPool();
ScRangeName* pRangeNames = mpDoc->GetRangeName();
- for ( size_t i = 0, nListSize = mpParser->ListSize(); i < nListSize; ++i )
+ for ( size_t i = 0, n = mpParser->ListSize(); i < n; ++i )
{
pE = mpParser->ListEntry( i );
SCROW nRow = nStartRow + pE->nRow;
@@ -274,6 +274,10 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
const SfxPoolItem* pPosture;
if ( rESet.GetItemState( ATTR_FONT_POSTURE, false, &pPosture) != SFX_ITEM_SET )
pPosture = 0;
+ // Number format
+ const SfxPoolItem* pNumFmt = NULL;
+ if ( rESet.GetItemState(ATTR_VALUE_FORMAT, false, &pNumFmt) == SFX_ITEM_SET )
+ rSet.Put(*pNumFmt);
if ( pFont || pHeight || pWeight || pPosture )
{
String aStr( mpEngine->GetText( pE->aSel ) );
@@ -358,10 +362,21 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
aStr.EraseLeadingAndTrailingChars();
}
+ bool bTextFormat = false;
+
+ const SfxPoolItem* pNumFmt = NULL;
+ if (rSet.GetItemState(ATTR_VALUE_FORMAT, false, &pNumFmt) == SFX_ITEM_SET)
+ {
+ sal_uInt32 nNumFmt = static_cast<const SfxUInt32Item*>(pNumFmt)->GetValue();
+ sal_uInt16 nType = pFormatter->GetType(nNumFmt);
+ if (nType == NUMBERFORMAT_TEXT)
+ // Format is set to Text.
+ bTextFormat = true;
+ }
+
// TODO: RTF import should follow the language tag,
// currently this follows the HTML options for both, HTML
// and RTF.
- bool bEnUsRecognized = false;
if (bNumbersEnglishUS)
{
pFormatter->ChangeIntl( LANGUAGE_ENGLISH_US);
@@ -369,13 +384,14 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
double fEnVal = 0.0;
if (pFormatter->IsNumberFormat( aStr, nIndex, fEnVal))
{
- bEnUsRecognized = true;
sal_uInt32 nNewIndex =
pFormatter->GetFormatForLanguageIfBuiltIn(
nIndex, LANGUAGE_SYSTEM);
OSL_ENSURE( nNewIndex != nIndex, "ScEEImport::WriteToDocument: NumbersEnglishUS not a built-in format?");
pFormatter->GetInputLineString( fEnVal, nNewIndex, aStr);
}
+ else
+ bTextFormat = true;
pFormatter->ChangeIntl( LANGUAGE_SYSTEM);
}
@@ -384,7 +400,7 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
aStr.SearchAndReplaceAll( (sal_Unicode)'\t', (sal_Unicode)' ' );
aStr.SearchAndReplaceAll( (sal_Unicode)'\n', (sal_Unicode)' ' );
- if (bNumbersEnglishUS && !bEnUsRecognized)
+ if (bTextFormat)
mpDoc->PutCell( nCol, nRow, nTab, new ScStringCell( aStr));
else
{
More information about the Libreoffice-commits
mailing list