[Libreoffice-commits] .: sc/inc sc/source

Kohei Yoshida kohei at kemper.freedesktop.org
Wed Aug 3 18:34:57 PDT 2011


 sc/inc/orcus/README                |    5 
 sc/inc/orcus/css_parser.hpp        |  513 +++++++++++++++++++++++++++++++++++++
 sc/source/filter/html/htmlimp.cxx  |    5 
 sc/source/filter/html/htmlpars.cxx |  328 +++++++++++++++++++++++
 sc/source/filter/inc/htmlimp.hxx   |    2 
 sc/source/filter/inc/htmlpars.hxx  |   45 +++
 sc/source/filter/rtf/eeimpars.cxx  |   24 +
 7 files changed, 907 insertions(+), 15 deletions(-)

New commits:
commit d6a0565436c75706189a1dd7e79e93da0f96132b
Author: Kohei Yoshida <kohei.yoshida at suse.com>
Date:   Thu Jul 28 00:46:55 2011 -0400

    Parse CSS in the <style> content and set number formats to cells.
    
    Pick up number formats specified in the CSS content of Excel
    generated HTML documents.  This makes use of a template-based CSS
    parser from the orcus project.

diff --git a/sc/inc/orcus/README b/sc/inc/orcus/README
new file mode 100644
index 0000000..3ada1c3
--- /dev/null
+++ b/sc/inc/orcus/README
@@ -0,0 +1,5 @@
+The headers in this directory are directly copied from the orcus project[1].
+When modifying any of these files, please ping me so that the changes can be
+upstreamed.
+
+[1] http://gitorious.org/orcus
diff --git a/sc/inc/orcus/css_parser.hpp b/sc/inc/orcus/css_parser.hpp
new file mode 100644
index 0000000..7a1b3e5
--- /dev/null
+++ b/sc/inc/orcus/css_parser.hpp
@@ -0,0 +1,513 @@
+/*************************************************************************
+ *
+ * Copyright (c) 2011 Kohei Yoshida
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ ************************************************************************/
+
+#ifndef __ORCUS_CSS_PARSER_HPP__
+#define __ORCUS_CSS_PARSER_HPP__
+
+#define ORCUS_DEBUG_CSS 0
+
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <string>
+#include <cassert>
+#include <sstream>
+
+#if ORCUS_DEBUG_CSS
+#include <iostream>
+#endif
+
+namespace orcus {
+
+class css_parse_error : public std::exception
+{
+    std::string m_msg;
+public:
+    css_parse_error(const std::string& msg) : m_msg(msg) {}
+    virtual ~css_parse_error() throw() {}
+    virtual const char* what() const throw() { return m_msg.c_str(); }
+};
+
+template<typename _Handler>
+class css_parser
+{
+public:
+    typedef _Handler handler_type; // receives the parse events (selector_name, value, ...).
+
+    css_parser(const char* p, size_t n, handler_type& hdl); // p: CSS text, n: its length in chars.
+    void parse();
+
+private:
+    // Handlers - at the time a handler is called the current position is
+    // expected to point to the first unprocessed non-blank character, and
+    // each handler must set the current position to the next unprocessed
+    // non-blank character when it finishes.
+    void rule();
+    void at_rule_name();
+    void selector_name();
+    void property_name();
+    void property();
+    void quoted_value();
+    void value();
+    void name_sep();
+    void property_sep();
+    void block();
+
+    void identifier(const char*& p, size_t& len);
+
+    void skip_blanks();
+    void skip_blanks_reverse();
+    void shrink_stream();
+    void next();
+    char cur_char() const;
+
+    size_t remaining_size() const { return m_length - m_pos - 1; } // offset to the last char; wraps if has_char() is false.
+    bool has_char() const { return m_pos < m_length; }
+
+    static bool is_blank(char c)
+    {
+        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f'; // CR/FF too, so CRLF input parses.
+    }
+
+    static bool is_alpha(char c)
+    {
+        if ('a' <= c && c <= 'z')
+            return true;
+        if ('A' <= c && c <= 'Z')
+            return true;
+        return false;
+    }
+
+    static bool is_name_char(char c)
+    {
+        switch (c)
+        {
+            case '-':
+                return true;
+        }
+
+        return false;
+    }
+
+    static bool is_numeric(char c)
+    {
+        if ('0' <= c && c <= '9')
+            return true;
+        return false;
+    }
+
+    handler_type& m_handler;
+    const char* mp_char; // current character.
+    size_t m_pos;        // index of the current character.
+    size_t m_length;     // end of the (possibly shrunk) stream.
+};
+
+template<typename _Handler>
+css_parser<_Handler>::css_parser(const char* p, size_t n, handler_type& hdl) :
+    m_handler(hdl), mp_char(p), m_pos(0), m_length(n) {}
+
+template<typename _Handler>
+void css_parser<_Handler>::parse()
+{
+    shrink_stream();
+
+#if ORCUS_DEBUG_CSS
+    std::cout << "compressed: '";
+    const char* p = mp_char;
+    for (size_t i = m_pos; i < m_length; ++i, ++p)
+        std::cout << *p;
+    std::cout << "'" << std::endl;
+#endif
+    m_handler.begin_parse();
+    for (; has_char(); next())
+        rule();
+    m_handler.end_parse();
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::rule()
+{
+    // <name> , ... , <name> { <properties> }
+    while (has_char())
+    {
+        char c = cur_char();
+        if (is_alpha(c) || c == '.' || c == '@')
+        {
+            selector_name();
+        }
+        else if (c == ',')
+        {
+            name_sep();
+        }
+        else if (c == '{')
+        {
+            block();
+        }
+        else
+        {
+            std::ostringstream os;
+            os << "failed to parse '" << c << "'";
+            throw css_parse_error(os.str());
+        }
+    }
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::at_rule_name()
+{
+    assert(has_char());
+    assert(cur_char() == '@');
+    next();
+    char c = cur_char();
+    if (!is_alpha(c))
+        throw css_parse_error("first character of an at-rule name must be an alphabet.");
+
+    const char* p;
+    size_t len;
+    identifier(p, len);
+    skip_blanks();
+
+    m_handler.at_rule_name(p, len);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p, len);
+    std::cout << "at-rule name: " << foo.c_str() << std::endl;
+#endif
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::selector_name()
+{
+    // <element name> '.' <class name>
+
+    assert(has_char());
+    char c = cur_char();
+    if (c == '@')
+    {
+        // This is the name of an at-rule.
+        at_rule_name();
+        return;
+    }
+
+    if (!is_alpha(c) && c != '.')
+        throw css_parse_error("first character of a name must be an alphabet or a dot.");
+
+    const char* p_elem = NULL;
+    const char* p_class = NULL;
+    size_t len_elem = 0;
+    size_t len_class = 0;
+    if (c != '.')
+        identifier(p_elem, len_elem);
+
+    if (cur_char() == '.')
+    {
+        next();
+        identifier(p_class, len_class);
+    }
+    skip_blanks();
+
+    m_handler.selector_name(p_elem, len_elem, p_class, len_class);
+#if ORCUS_DEBUG_CSS
+    std::string elem_name(p_elem, len_elem), class_name(p_class, len_class);
+    std::cout << "selector name: (element)'" << elem_name.c_str() << "' (class)'" << class_name.c_str() << "'" << std::endl;
+#endif
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::property_name()
+{
+    assert(has_char());
+    char c = cur_char();
+    if (!is_alpha(c) && c != '.')
+        throw css_parse_error("first character of a name must be an alphabet or a dot.");
+
+    const char* p;
+    size_t len;
+    identifier(p, len);
+    skip_blanks();
+
+    m_handler.property_name(p, len);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p, len);
+    std::cout << "property name: " << foo.c_str() << std::endl;
+#endif
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::property()
+{
+    // <name> : <value> , ... , <value>   (ended by ';', or by '}' for the last one)
+    m_handler.begin_property();
+    property_name();
+    if (cur_char() != ':')
+        throw css_parse_error("':' expected.");
+    next();
+    skip_blanks();
+    while (has_char())
+    {
+        value();
+        char c = cur_char();
+        if (c == ',')
+        {
+            // separated by commas.
+            next();
+            skip_blanks();
+        }
+        else if (c == ';' || c == '}') // leave '}' for block() when the final ';' is omitted.
+            break;
+    }
+    skip_blanks();
+    m_handler.end_property();
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::quoted_value()
+{
+    assert(cur_char() == '"');
+    next(); // step over the opening quote.
+    const char* p = mp_char;
+    size_t len = 0; // start at zero so that an empty string ("") is handled.
+    for (; has_char(); next())
+    {
+        if (cur_char() == '"')
+        {
+            // End quote reached.
+            break;
+        }
+        ++len;
+    }
+
+    if (!has_char() || cur_char() != '"') // don't read the current char once the stream has run out.
+        throw css_parse_error("end quote has never been reached.");
+
+    next();
+    skip_blanks();
+
+    m_handler.value(p, len);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p, len);
+    std::cout << "quoted value: " << foo.c_str() << std::endl;
+#endif
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::value()
+{
+    assert(has_char());
+    char c = cur_char();
+    if (c == '"')
+    {
+        quoted_value();
+        return;
+    }
+
+    if (!is_alpha(c) && !is_numeric(c) && c != '-' && c != '+' && c != '.')
+    {
+        std::ostringstream os;
+        os << "illegal first character of a value '" << c << "'";
+        throw css_parse_error(os.str());
+    }
+
+    const char* p = mp_char;
+    size_t len = 1;
+    for (next(); has_char(); next())
+    {
+        c = cur_char();
+        if (!is_alpha(c) && !is_name_char(c) && !is_numeric(c) && c != '.')
+            break;
+        ++len;
+    }
+    skip_blanks();
+
+    m_handler.value(p, len);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p, len);
+    std::cout << "value: " << foo.c_str() << std::endl;
+#endif
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::name_sep()
+{
+    assert(cur_char() == ',');
+#if ORCUS_DEBUG_CSS
+    std::cout << "," << std::endl;
+#endif
+    next();
+    skip_blanks();
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::property_sep()
+{
+#if ORCUS_DEBUG_CSS
+    std::cout << ";" << std::endl;
+#endif
+    next();
+    skip_blanks();
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::block()
+{
+    // '{' <property> ';' ... ';' <property> '}'
+
+    assert(cur_char() == '{');
+#if ORCUS_DEBUG_CSS
+    std::cout << "{" << std::endl;
+#endif
+    m_handler.begin_block();
+
+    next();
+    skip_blanks();
+
+    // parse properties.
+    while (has_char())
+    {
+        property();
+        if (cur_char() != ';')
+            break;
+        property_sep();
+        if (cur_char() == '}')
+            // ';' after the last property.  This is optional but allowed.
+            break;
+    }
+
+    if (cur_char() != '}')
+        throw css_parse_error("} expected.");
+
+    m_handler.end_block();
+
+    next();
+    skip_blanks();
+
+#if ORCUS_DEBUG_CSS
+    std::cout << "}" << std::endl;
+#endif
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::identifier(const char*& p, size_t& len)
+{
+    p = mp_char;
+    len = 1;
+    for (next(); has_char(); next())
+    {
+        char c = cur_char();
+        if (!is_alpha(c) && !is_name_char(c) && !is_numeric(c))
+            break;
+        ++len;
+    }
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::skip_blanks()
+{
+    for (; has_char(); next())
+    {
+        if (!is_blank(*mp_char))
+            break;
+    }
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::skip_blanks_reverse()
+{
+    const char* p = mp_char + remaining_size();
+    for (; p != mp_char; --p, --m_length)
+    {
+        if (!is_blank(*p))
+            break;
+    }
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::shrink_stream()
+{
+    // Skip any leading blanks.
+    skip_blanks();
+    // remaining_size() wraps around when no character is left (empty or all-blank input).
+    if (!has_char() || !remaining_size())
+        return;
+
+    // Skip any trailing blanks.
+    skip_blanks_reverse();
+
+    // Skip leading <!-- if present.
+
+    const char* com_open = "<!--";
+    size_t com_open_len = std::strlen(com_open);
+    if (remaining_size() < com_open_len)
+        // Not enough stream left.  Bail out.
+        return;
+
+    const char* p = mp_char;
+    for (size_t i = 0; i < com_open_len; ++i, ++p)
+    {
+        if (*p != com_open[i])
+            return;
+        next();
+    }
+    mp_char = p;
+
+    // Skip leading blanks once again.
+    skip_blanks();
+
+    // Skip trailing --> if present.
+    const char* com_close = "-->";
+    size_t com_close_len = std::strlen(com_close);
+    size_t n = remaining_size();
+    if (n < com_close_len)
+        // Not enough stream left.  Bail out.
+        return;
+
+    p = mp_char + n; // move to the last char.
+    for (size_t i = com_close_len; i > 0; --i, --p)
+    {
+        if (*p != com_close[i-1])
+            return;
+    }
+    m_length -= com_close_len;
+
+    skip_blanks_reverse();
+}
+
+template<typename _Handler>
+void css_parser<_Handler>::next()
+{
+    ++m_pos;
+    ++mp_char;
+}
+
+template<typename _Handler>
+char css_parser<_Handler>::cur_char() const
+{
+    return *mp_char;
+}
+
+}
+
+#endif
diff --git a/sc/source/filter/html/htmlimp.cxx b/sc/source/filter/html/htmlimp.cxx
index c0f052a..c174b57 100644
--- a/sc/source/filter/html/htmlimp.cxx
+++ b/sc/source/filter/html/htmlimp.cxx
@@ -77,7 +77,7 @@ ScEEAbsImport *ScFormatFilterPluginImpl::CreateHTMLImport( ScDocument* pDocP, co
     return new ScHTMLImport( pDocP, rBaseURL, rRange, bCalcWidthHeight );
 }
 
-ScHTMLImport::ScHTMLImport( ScDocument* pDocP, const String& rBaseURL, const ScRange& rRange, sal_Bool bCalcWidthHeight ) :
+ScHTMLImport::ScHTMLImport( ScDocument* pDocP, const String& rBaseURL, const ScRange& rRange, bool bCalcWidthHeight ) :
     ScEEImport( pDocP, rRange )
 {
     Size aPageSize;
@@ -150,8 +150,7 @@ void ScHTMLImport::WriteToDocument(
     pGlobTable->ApplyCellBorders( mpDoc, maRange.aStart );
 
     // correct cell borders for merged cells
-    size_t ListSize = pParser->ListSize();
-    for ( size_t i = 0; i < ListSize; ++i )
+    for ( size_t i = 0, n = pParser->ListSize(); i < n; ++i )
     {
         const ScEEParseEntry* pEntry = pParser->ListEntry( i );
         if( (pEntry->nColOverlap > 1) || (pEntry->nRowOverlap > 1) )
diff --git a/sc/source/filter/html/htmlpars.cxx b/sc/source/filter/html/htmlpars.cxx
index 4105782..18bf94a 100644
--- a/sc/source/filter/html/htmlpars.cxx
+++ b/sc/source/filter/html/htmlpars.cxx
@@ -49,6 +49,7 @@
 #include <editeng/justifyitem.hxx>
 #include <sfx2/objsh.hxx>
 #include <svl/eitem.hxx>
+#include <svl/intitem.hxx>
 #include <svtools/filter.hxx>
 #include <svtools/parhtml.hxx>
 #include <svtools/htmlkywd.hxx>
@@ -64,12 +65,125 @@
 #include "document.hxx"
 #include "rangelst.hxx"
 
+#include <orcus/css_parser.hpp>
+
 #include <com/sun/star/document/XDocumentProperties.hpp>
 #include <com/sun/star/document/XDocumentPropertiesSupplier.hpp>
 
 using ::editeng::SvxBorderLine;
 using namespace ::com::sun::star;
 
+void ScHTMLStyles::add(const char* pElemName, size_t nElemName, const char* pClassName, size_t nClassName,
+                       const rtl::OUString& aProp, const rtl::OUString& aValue)
+{
+    if (pElemName)
+    {
+        rtl::OUString aElem(pElemName, nElemName, RTL_TEXTENCODING_UTF8);
+        aElem = aElem.toAsciiLowerCase();
+        if (pClassName)
+        {
+            // Both element and class names given.
+
+            ElemsType::iterator itrElem = maElemProps.find(aElem);
+            if (itrElem == maElemProps.end())
+            {
+                // new element
+                std::auto_ptr<NamePropsType> p(new NamePropsType);
+                std::pair<ElemsType::iterator, bool> r = maElemProps.insert(aElem, p);
+                if (!r.second)
+                    // insertion failed.
+                    return;
+                itrElem = r.first;
+            }
+
+            NamePropsType* pClsProps = itrElem->second;
+            rtl::OUString aClass(pClassName, nClassName, RTL_TEXTENCODING_UTF8);
+            aClass = aClass.toAsciiLowerCase();
+            insertProp(*pClsProps, aClass, aProp, aValue);
+        }
+        else
+        {
+            // Element name only. Add it to the element global.
+            insertProp(maElemGlobalProps, aElem, aProp, aValue);
+        }
+    }
+    else
+    {
+        if (pClassName)
+        {
+            // Class name only. Add it to the global.
+            rtl::OUString aClass(pClassName, nClassName, RTL_TEXTENCODING_UTF8);
+            aClass = aClass.toAsciiLowerCase();
+            insertProp(maGlobalProps, aClass, aProp, aValue);
+        }
+    }
+}
+
+const rtl::OUString& ScHTMLStyles::getPropertyValue(
+    const rtl::OUString& rElem, const rtl::OUString& rClass, const rtl::OUString& rPropName) const
+{
+    // First, look into the element-class storage.
+    {
+        ElemsType::const_iterator itr = maElemProps.find(rElem);
+        if (itr != maElemProps.end())
+        {
+            const NamePropsType* pClasses = itr->second;
+            NamePropsType::const_iterator itr2 = pClasses->find(rClass);
+            if (itr2 != pClasses->end())
+            {
+                const PropsType* pProps = itr2->second;
+                PropsType::const_iterator itr3 = pProps->find(rPropName);
+                if (itr3 != pProps->end())
+                    return itr3->second;
+            }
+        }
+    }
+    // Next, look into the class global storage.
+    {
+        NamePropsType::const_iterator itr = maGlobalProps.find(rClass);
+        if (itr != maGlobalProps.end())
+        {
+            const PropsType* pProps = itr->second;
+            PropsType::const_iterator itr2 = pProps->find(rPropName);
+            if (itr2 != pProps->end())
+                return itr2->second;
+        }
+    }
+    // As the last resort, look into the element global storage (keyed by element name).
+    {
+        NamePropsType::const_iterator itr = maElemGlobalProps.find(rElem);
+        if (itr != maElemGlobalProps.end())
+        {
+            const PropsType* pProps = itr->second;
+            PropsType::const_iterator itr2 = pProps->find(rPropName);
+            if (itr2 != pProps->end())
+                return itr2->second;
+        }
+    }
+
+    return maEmpty; // nothing found.
+}
+
+void ScHTMLStyles::insertProp(
+    NamePropsType& rStore, const rtl::OUString& aName,
+    const rtl::OUString& aProp, const rtl::OUString& aValue)
+{
+    NamePropsType::iterator itr = rStore.find(aName);
+    if (itr == rStore.end())
+    {
+        // new element
+        std::auto_ptr<PropsType> p(new PropsType);
+        std::pair<NamePropsType::iterator, bool> r = rStore.insert(aName, p);
+        if (!r.second)
+            // insertion failed.
+            return;
+
+        itr = r.first;
+    }
+
+    PropsType* pProps = itr->second;
+    pProps->insert(PropsType::value_type(aProp, aValue));
+}
 
 SV_IMPL_VARARR_SORT( ScHTMLColOffset, sal_uLong );
 
@@ -91,10 +205,21 @@ ScHTMLParser::~ScHTMLParser()
 {
 }
 
+ScHTMLStyles& ScHTMLParser::GetStyles()
+{
+    return maStyles;
+}
+
+ScDocument& ScHTMLParser::GetDoc()
+{
+    return *mpDoc;
+}
 
 // ============================================================================
 
-ScHTMLLayoutParser::ScHTMLLayoutParser( EditEngine* pEditP, const String& rBaseURL, const Size& aPageSizeP, ScDocument* pDocP ) :
+ScHTMLLayoutParser::ScHTMLLayoutParser(
+    EditEngine* pEditP, const String& rBaseURL, const Size& aPageSizeP,
+    ScDocument* pDocP ) :
         ScHTMLParser( pEditP, pDocP ),
         aPageSize( aPageSizeP ),
         aBaseURL( rBaseURL ),
@@ -1867,6 +1992,7 @@ ScHTMLTable::ScHTMLTable( ScHTMLTable& rParentTable, const ImportInfo& rInfo, bo
     mrEEParseList( rParentTable.mrEEParseList ),
     mpCurrEntryList( 0 ),
     maSize( 1, 1 ),
+    mpParser(rParentTable.mpParser),
     mbBorderOn( false ),
     mbPreFormText( bPreFormText ),
     mbRowOn( false ),
@@ -1902,7 +2028,7 @@ ScHTMLTable::ScHTMLTable(
     SfxItemPool& rPool,
     EditEngine& rEditEngine,
     ::std::vector< ScEEParseEntry* >& rEEParseList,
-    ScHTMLTableId& rnUnusedId
+    ScHTMLTableId& rnUnusedId, ScHTMLParser* pParser
 ) :
     mpParentTable( 0 ),
     maTableId( rnUnusedId ),
@@ -1911,6 +2037,7 @@ ScHTMLTable::ScHTMLTable(
     mrEEParseList( rEEParseList ),
     mpCurrEntryList( 0 ),
     maSize( 1, 1 ),
+    mpParser(pParser),
     mbBorderOn( false ),
     mbPreFormText( false ),
     mbRowOn( false ),
@@ -2044,6 +2171,52 @@ void ScHTMLTable::RowOff( const ImportInfo& rInfo )
     CreateNewEntry( rInfo );
 }
 
+namespace {
+
+/**
+ * Decode a number format string stored in Excel-generated HTML's CSS
+ * region.
+ */
+rtl::OUString decodeNumberFormat(const rtl::OUString& rFmt)
+{
+    rtl::OUStringBuffer aBuf;
+    const sal_Unicode* p = rFmt.getStr();
+    sal_Int32 n = rFmt.getLength();
+    for (sal_Int32 i = 0; i < n; ++i, ++p)
+    {
+        if (*p == '\\')
+        {
+            // Skip '\'.
+            ++i;
+            ++p;
+
+            // Parse all subsequent digits until first non-digit is found.
+            sal_Int32 nDigitCount = 0;
+            const sal_Unicode* p1 = p;
+            for (; i < n; ++i, ++p, ++nDigitCount)
+            {
+                if (*p < '0' || '9' < *p)
+                {
+                    --i;
+                    --p;
+                    break;
+                }
+
+            }
+            if (nDigitCount)
+            {
+                sal_Int32 nVal = rtl::OUString(p1, nDigitCount).toInt32(16);
+                aBuf.append(static_cast<sal_Unicode>(nVal));
+            }
+        }
+        else
+            aBuf.append(*p);
+    }
+    return aBuf.makeStringAndClear();
+}
+
+}
+
 void ScHTMLTable::DataOn( const ImportInfo& rInfo )
 {
     PushEntry( rInfo, true );
@@ -2072,6 +2245,38 @@ void ScHTMLTable::DataOn( const ImportInfo& rInfo )
         }
 
         ImplDataOn( aSpanSize );
+
+        const HTMLOptions& rOptions = static_cast<HTMLParser*>(rInfo.pParser)->GetOptions();
+        HTMLOptions::const_iterator itr = rOptions.begin(), itrEnd = rOptions.end();
+        for (; itr != itrEnd; ++itr)
+        {
+            if (itr->GetToken() == HTML_O_CLASS)
+            {
+                // This <td> has class property.  Pick up the number format
+                // associated with this class (if any).
+                rtl::OUString aElem(RTL_CONSTASCII_USTRINGPARAM("td"));
+                rtl::OUString aClass = itr->GetString();
+                rtl::OUString aProp(RTL_CONSTASCII_USTRINGPARAM("mso-number-format"));
+                const ScHTMLStyles& rStyles = mpParser->GetStyles();
+                const rtl::OUString& rVal = rStyles.getPropertyValue(aElem, aClass, aProp);
+                rtl::OUString aNumFmt = decodeNumberFormat(rVal);
+
+                sal_uInt32 nNumberFormat = GetFormatTable()->GetEntryKey(aNumFmt);
+                bool bValidFmt = false;
+                if ( nNumberFormat == NUMBERFORMAT_ENTRY_NOT_FOUND )
+                {
+                    xub_StrLen nErrPos  = 0;
+                    short nDummy;
+                    bValidFmt = GetFormatTable()->PutEntry(aNumFmt, nErrPos, nDummy, nNumberFormat);
+                }
+                else
+                    bValidFmt = true;
+
+                if (bValidFmt)
+                    mxDataItemSet->Put( SfxUInt32Item(ATTR_VALUE_FORMAT, nNumberFormat) );
+            }
+        }
+
         ProcessFormatOptions( *mxDataItemSet, rInfo );
         CreateNewEntry( rInfo );
         mxCurrEntry->pValStr = pValStr.release();
@@ -2224,6 +2429,11 @@ void ScHTMLTable::ApplyCellBorders( ScDocument* pDoc, const ScAddress& rFirstPos
         aIter->ApplyCellBorders( pDoc, rFirstPos );
 }
 
+SvNumberFormatter* ScHTMLTable::GetFormatTable()
+{
+    return mpParser->GetDoc().GetFormatTable();
+}
+
 // ----------------------------------------------------------------------------
 
 bool ScHTMLTable::IsEmptyCell() const
@@ -2690,9 +2900,10 @@ ScHTMLGlobalTable::ScHTMLGlobalTable(
     SfxItemPool& rPool,
     EditEngine& rEditEngine,
     ::std::vector< ScEEParseEntry* >& rEEParseList,
-    ScHTMLTableId& rnUnusedId
+    ScHTMLTableId& rnUnusedId,
+    ScHTMLParser* pParser
 ) :
-    ScHTMLTable( rPool, rEditEngine, rEEParseList, rnUnusedId )
+    ScHTMLTable( rPool, rEditEngine, rEEParseList, rnUnusedId, pParser )
 {
 }
 
@@ -2717,7 +2928,8 @@ ScHTMLQueryParser::ScHTMLQueryParser( EditEngine* pEditEngine, ScDocument* pDoc
     mnUnusedId( SC_HTML_GLOBAL_TABLE ),
     mbTitleOn( false )
 {
-    mxGlobTable.reset( new ScHTMLGlobalTable( *pPool, *pEdit, maList, mnUnusedId ) );
+    mxGlobTable.reset(
+        new ScHTMLGlobalTable(*pPool, *pEdit, maList, mnUnusedId, this));
     mpCurrTable = mxGlobTable.get();
 }
 
@@ -2779,6 +2991,9 @@ void ScHTMLQueryParser::ProcessToken( const ImportInfo& rInfo )
         case HTML_TITLE_ON:         TitleOn( rInfo );               break;  // <title>
         case HTML_TITLE_OFF:        TitleOff( rInfo );              break;  // </title>
 
+        case HTML_STYLE_ON:                                         break;
+        case HTML_STYLE_OFF:        ParseStyle(rInfo.aText);        break;
+
 // --- body handling ---
         case HTML_BODY_ON:          mpCurrTable->BodyOn( rInfo );   break;  // <body>
         case HTML_BODY_OFF:         mpCurrTable->BodyOff( rInfo );  break;  // </body>
@@ -2956,6 +3171,109 @@ void ScHTMLQueryParser::CloseTable( const ImportInfo& rInfo )
     mpCurrTable = mpCurrTable->CloseTable( rInfo );
 }
 
+namespace {
+
+/**
+ * Handler class for the CSS parser.
+ */
+class CSSHandler
+{
+    struct MemStr
+    {
+        const char* mp;
+        size_t      mn;
+
+        MemStr() : mp(NULL), mn(0) {}
+        MemStr(const char* p, size_t n) : mp(p), mn(n) {}
+        MemStr(const MemStr& r) : mp(r.mp), mn(r.mn) {}
+        MemStr& operator=(const MemStr& r)
+        {
+            mp = r.mp;
+            mn = r.mn;
+            return *this;
+        }
+    };
+
+    typedef std::pair<MemStr, MemStr> SelectorName; // element : class
+    typedef std::vector<SelectorName> SelectorNames;
+    SelectorNames maSelectorNames; /// current selector names.
+    MemStr maPropName;  /// current property name.
+    MemStr maPropValue; /// current property value.
+
+    ScHTMLStyles& mrStyles;
+public:
+    CSSHandler(ScHTMLStyles& rStyles) : mrStyles(rStyles) {}
+
+    void at_rule_name(const char* /*p*/, size_t /*n*/)
+    {
+        // For now, we ignore at-rule properties.
+    }
+
+    void selector_name(const char* p_elem, size_t n_elem, const char* p_class, size_t n_class)
+    {
+        MemStr aElem(p_elem, n_elem), aClass(p_class, n_class);
+        SelectorName aName(aElem, aClass);
+        maSelectorNames.push_back(aName);
+    }
+
+    void property_name(const char* p, size_t n)
+    {
+        maPropName = MemStr(p, n);
+    }
+
+    void value(const char* p, size_t n)
+    {
+        maPropValue = MemStr(p, n);
+    }
+
+    void begin_parse() {}
+
+    void end_parse() {}
+
+    void begin_block() {}
+
+    void end_block()
+    {
+        maSelectorNames.clear();
+    }
+
+    void begin_property() {}
+
+    void end_property()
+    {
+        SelectorNames::const_iterator itr = maSelectorNames.begin(), itrEnd = maSelectorNames.end();
+        for (; itr != itrEnd; ++itr)
+        {
+            // Add this property to the collection for each selector.
+            const SelectorName& rSelName = *itr;
+            const MemStr& rElem = rSelName.first;
+            const MemStr& rClass = rSelName.second;
+            rtl::OUString aName(maPropName.mp, maPropName.mn, RTL_TEXTENCODING_UTF8);
+            rtl::OUString aValue(maPropValue.mp, maPropValue.mn, RTL_TEXTENCODING_UTF8);
+            mrStyles.add(rElem.mp, rElem.mn, rClass.mp, rClass.mn, aName, aValue);
+        }
+        maPropName = MemStr();
+        maPropValue = MemStr();
+    }
+};
+
+}
+
+void ScHTMLQueryParser::ParseStyle(const rtl::OUString& rStrm)
+{
+    rtl::OString aStr = rtl::OUStringToOString(rStrm, RTL_TEXTENCODING_UTF8);
+    CSSHandler aHdl(GetStyles());
+    orcus::css_parser<CSSHandler> aParser(aStr.getStr(), aStr.getLength(), aHdl);
+    try
+    {
+        aParser.parse();
+    }
+    catch (const orcus::css_parse_error&)
+    {
+        // Parsing of CSS failed.  Do nothing for now.
+    }
+}
+
 // ----------------------------------------------------------------------------
 
 IMPL_LINK( ScHTMLQueryParser, HTMLImportHdl, const ImportInfo*, pInfo )
diff --git a/sc/source/filter/inc/htmlimp.hxx b/sc/source/filter/inc/htmlimp.hxx
index 355a2dd..ecb6c19 100644
--- a/sc/source/filter/inc/htmlimp.hxx
+++ b/sc/source/filter/inc/htmlimp.hxx
@@ -39,7 +39,7 @@ private:
     static void			InsertRangeName( ScDocument* pDoc, const String& rName, const ScRange& rRange );
 
 public:
-    ScHTMLImport( ScDocument* pDoc, const String& rBaseURL, const ScRange& rRange, sal_Bool bCalcWidthHeight = sal_True );
+    ScHTMLImport( ScDocument* pDoc, const String& rBaseURL, const ScRange& rRange, bool bCalcWidthHeight );
     virtual ~ScHTMLImport();
     const ScHTMLParser* GetParser() const { return (ScHTMLParser*)mpParser; }
 
diff --git a/sc/source/filter/inc/htmlpars.hxx b/sc/source/filter/inc/htmlpars.hxx
index c1ab90e..85dbd99 100644
--- a/sc/source/filter/inc/htmlpars.hxx
+++ b/sc/source/filter/inc/htmlpars.hxx
@@ -35,6 +35,8 @@
 #include <vector>
 #include <list>
 #include <map>
+#include <boost/ptr_container/ptr_map.hpp>
+#include <boost/unordered_map.hpp>
 
 #include "rangelst.hxx"
 #include "eeparser.hxx"
@@ -51,9 +53,40 @@ const sal_uInt16 SC_HTML_OFFSET_TOLERANCE_LARGE = 10;   // nested
 
 class ScHTMLTable;
 
+/**
+ * Collection of HTML style data parsed from the content of <style>
+ * elements.
+ */
+class ScHTMLStyles
+{
+    typedef ::boost::unordered_map<rtl::OUString, rtl::OUString, rtl::OUStringHash> PropsType;
+    typedef ::boost::ptr_map<rtl::OUString, PropsType> NamePropsType;
+    typedef ::boost::ptr_map<rtl::OUString, NamePropsType> ElemsType;
+
+    NamePropsType maGlobalProps;     /// global properties (for a given class for all elements)
+    NamePropsType maElemGlobalProps; /// element global properties (no class specified)
+    ElemsType maElemProps;           /// element to class to properties (both element and class are given)
+    const rtl::OUString maEmpty;     /// just a persistent empty string.
+public:
+    void add(const char* pElemName, size_t nElemName, const char* pClassName, size_t nClassName,
+             const rtl::OUString& aProp, const rtl::OUString& aValue);
+
+    /**
+     * Find best-matching property value for given element and class names.
+     */
+    const rtl::OUString& getPropertyValue(
+        const rtl::OUString& rElem, const rtl::OUString& rClass, const rtl::OUString& rPropName) const;
+
+private:
+    static void insertProp(
+        NamePropsType& rProps, const rtl::OUString& aName,
+        const rtl::OUString& aProp, const rtl::OUString& aValue);
+};
+
 /** Base class for HTML parser classes. */
 class ScHTMLParser : public ScEEParser
 {
+    ScHTMLStyles                maStyles;
 protected:
     sal_uInt32                  maFontHeights[ SC_HTML_FONTSIZES ];
     ScDocument*                 mpDoc;          /// The destination document.
@@ -64,6 +97,9 @@ public:
 
     virtual sal_uLong		        Read( SvStream& rStrm, const String& rBaseURL  ) = 0;
 
+    ScHTMLStyles&               GetStyles();
+    ScDocument&                 GetDoc();
+
     /** Returns the "global table" which contains the entire HTML document. */
     virtual const ScHTMLTable*  GetGlobalTable() const = 0;
 };
@@ -436,6 +472,8 @@ public:
     /** Applies border formatting to the passed document. */
     void                ApplyCellBorders( ScDocument* pDoc, const ScAddress& rFirstPos ) const;
 
+    SvNumberFormatter* GetFormatTable();
+
 protected:
     /** Creates a new HTML table without parent.
         @descr  This constructor is used to create the "global table". */
@@ -443,7 +481,7 @@ protected:
                             SfxItemPool& rPool,
                             EditEngine& rEditEngine,
                             ::std::vector< ScEEParseEntry* >& rEEParseList,
-                            ScHTMLTableId& rnUnusedId );
+                            ScHTMLTableId& rnUnusedId, ScHTMLParser* pParser );
 
     /** Fills all empty cells in this and nested tables with dummy parse entries. */
     void                FillEmptyCells();
@@ -550,6 +588,7 @@ private:
     ScHTMLSize          maSize;             /// Size of the table.
     ScHTMLPos           maCurrCell;         /// Address of current cell to fill.
     ScHTMLPos           maDocBasePos;       /// Resulting base address in a Calc document.
+    ScHTMLParser*       mpParser;
     bool                mbBorderOn;         /// true = Table borders on.
     bool                mbPreFormText;      /// true = Table from preformatted text (<pre> tag).
     bool                mbRowOn;            /// true = Inside of <tr> </tr>.
@@ -567,7 +606,7 @@ public:
                             SfxItemPool& rPool,
                             EditEngine& rEditEngine,
                             ::std::vector< ScEEParseEntry* >& rEEParseList,
-                            ScHTMLTableId& rnUnusedId );
+                            ScHTMLTableId& rnUnusedId, ScHTMLParser* pParser );
 
     virtual             ~ScHTMLGlobalTable();
 
@@ -620,6 +659,8 @@ private:
     /** Closes the current table, regardless on opening tag. */
     void                CloseTable( const ImportInfo& rInfo );
 
+    void                ParseStyle(const rtl::OUString& rStrm);
+
     DECL_LINK( HTMLImportHdl, const ImportInfo* );
 
 private:
diff --git a/sc/source/filter/rtf/eeimpars.cxx b/sc/source/filter/rtf/eeimpars.cxx
index 6aae696..933e54c 100644
--- a/sc/source/filter/rtf/eeimpars.cxx
+++ b/sc/source/filter/rtf/eeimpars.cxx
@@ -160,7 +160,7 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
     }
     ScDocumentPool* pDocPool = mpDoc->GetPool();
     ScRangeName* pRangeNames = mpDoc->GetRangeName();
-    for ( size_t i = 0, nListSize = mpParser->ListSize(); i < nListSize; ++i )
+    for ( size_t i = 0, n = mpParser->ListSize(); i < n; ++i )
     {
         pE = mpParser->ListEntry( i );
         SCROW nRow = nStartRow + pE->nRow;
@@ -274,6 +274,10 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
                 const SfxPoolItem* pPosture;
                 if ( rESet.GetItemState( ATTR_FONT_POSTURE, false, &pPosture) != SFX_ITEM_SET )
                     pPosture = 0;
+                // Number format
+                const SfxPoolItem* pNumFmt = NULL;
+                if ( rESet.GetItemState(ATTR_VALUE_FORMAT, false, &pNumFmt) == SFX_ITEM_SET )
+                    rSet.Put(*pNumFmt);
                 if ( pFont || pHeight || pWeight || pPosture )
                 {
                     String aStr( mpEngine->GetText( pE->aSel ) );
@@ -358,10 +362,21 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
                         aStr.EraseLeadingAndTrailingChars();
                     }
 
+                    bool bTextFormat = false;
+
+                    const SfxPoolItem* pNumFmt = NULL;
+                    if (rSet.GetItemState(ATTR_VALUE_FORMAT, false, &pNumFmt) == SFX_ITEM_SET)
+                    {
+                        sal_uInt32 nNumFmt = static_cast<const SfxUInt32Item*>(pNumFmt)->GetValue();
+                        sal_uInt16 nType = pFormatter->GetType(nNumFmt);
+                        if (nType == NUMBERFORMAT_TEXT)
+                            // Format is set to Text.
+                            bTextFormat = true;
+                    }
+
                     // TODO: RTF import should follow the language tag,
                     // currently this follows the HTML options for both, HTML
                     // and RTF.
-                    bool bEnUsRecognized = false;
                     if (bNumbersEnglishUS)
                     {
                         pFormatter->ChangeIntl( LANGUAGE_ENGLISH_US);
@@ -369,13 +384,14 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
                         double fEnVal = 0.0;
                         if (pFormatter->IsNumberFormat( aStr, nIndex, fEnVal))
                         {
-                            bEnUsRecognized = true;
                             sal_uInt32 nNewIndex =
                                 pFormatter->GetFormatForLanguageIfBuiltIn(
                                         nIndex, LANGUAGE_SYSTEM);
                             OSL_ENSURE( nNewIndex != nIndex, "ScEEImport::WriteToDocument: NumbersEnglishUS not a built-in format?");
                             pFormatter->GetInputLineString( fEnVal, nNewIndex, aStr);
                         }
+                        else
+                            bTextFormat = true;
                         pFormatter->ChangeIntl( LANGUAGE_SYSTEM);
                     }
 
@@ -384,7 +400,7 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor,
                     aStr.SearchAndReplaceAll( (sal_Unicode)'\t', (sal_Unicode)' ' );
                     aStr.SearchAndReplaceAll( (sal_Unicode)'\n', (sal_Unicode)' ' );
 
-                    if (bNumbersEnglishUS && !bEnUsRecognized)
+                    if (bTextFormat)
                         mpDoc->PutCell( nCol, nRow, nTab, new ScStringCell( aStr));
                     else
                     {


More information about the Libreoffice-commits mailing list