[Libreoffice-commits] core.git: filter/Library_htmlfd.mk filter/Module_filter.mk filter/source postprocess/Rdb_services.mk Repository.mk solenv/gbuild

Maxim Monastirsky momonasmon at gmail.com
Thu Jan 23 06:49:48 PST 2014


 Repository.mk                                         |    1 
 filter/Library_htmlfd.mk                              |   36 ++
 filter/Module_filter.mk                               |    1 
 filter/source/config/fragments/types/generic_HTML.xcu |    2 
 filter/source/htmlfilterdetect/fdcomp.cxx             |   36 ++
 filter/source/htmlfilterdetect/filterdetect.cxx       |  232 ++++++++++++++++++
 filter/source/htmlfilterdetect/filterdetect.hxx       |   64 ++++
 filter/source/htmlfilterdetect/htmlfd.component       |   15 +
 postprocess/Rdb_services.mk                           |    1 
 solenv/gbuild/extensions/pre_MergedLibsList.mk        |    1 
 10 files changed, 388 insertions(+), 1 deletion(-)

New commits:
commit cc2893834d8ac699dbb38b152f21f17f3debb06b
Author: Maxim Monastirsky <momonasmon at gmail.com>
Date:   Mon Jan 20 10:17:05 2014 +0200

    related: fdo#73682 Introduce HTML detection service
    
    Change-Id: I66bb579019ce8411b821c623955a454fd81cf811
    Reviewed-on: https://gerrit.libreoffice.org/7600
    Reviewed-by: Kohei Yoshida <libreoffice at kohei.us>
    Tested-by: Kohei Yoshida <libreoffice at kohei.us>

diff --git a/Repository.mk b/Repository.mk
index 6c4d488..7066001 100644
--- a/Repository.mk
+++ b/Repository.mk
@@ -270,6 +270,7 @@ $(eval $(call gb_Helper_register_libraries_for_install,OOOLIBS,ooo, \
 	$(if $(ENABLE_DIRECTX),gdipluscanvas) \
 	guesslang \
 	$(if $(filter DESKTOP,$(BUILD_TYPE)),helplinker) \
+	htmlfd \
 	i18npool \
 	i18nsearch \
 	hyphen \
diff --git a/filter/Library_htmlfd.mk b/filter/Library_htmlfd.mk
new file mode 100644
index 0000000..a147509
--- /dev/null
+++ b/filter/Library_htmlfd.mk
@@ -0,0 +1,36 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#*************************************************************************
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+#*************************************************************************
+
+$(eval $(call gb_Library_Library,htmlfd))
+
+$(eval $(call gb_Library_set_componentfile,htmlfd,filter/source/htmlfilterdetect/htmlfd))
+# The boost headers must be attached to htmlfd (not xmlfd): filterdetect.cxx includes <boost/scoped_ptr.hpp>.
+$(eval $(call gb_Library_use_external,htmlfd,boost_headers))
+
+$(eval $(call gb_Library_use_sdk_api,htmlfd))
+
+$(eval $(call gb_Library_use_libraries,htmlfd,\
+	ucbhelper \
+	cppuhelper \
+	cppu \
+	sal \
+	tl \
+	utl \
+	svt \
+	$(gb_UWINAPI) \
+))
+
+$(eval $(call gb_Library_add_exception_objects,htmlfd,\
+	filter/source/htmlfilterdetect/fdcomp \
+	filter/source/htmlfilterdetect/filterdetect \
+))
+
+# vim: set noet sw=4 ts=4:
diff --git a/filter/Module_filter.mk b/filter/Module_filter.mk
index 403184a..58307b4 100644
--- a/filter/Module_filter.mk
+++ b/filter/Module_filter.mk
@@ -34,6 +34,7 @@ $(eval $(call gb_Module_add_targets,filter,\
 	    Library_exp) \
 	Library_filterconfig \
 	Library_flash \
+	Library_htmlfd \
 	Library_icd \
 	Library_icg \
 	Library_idx \
diff --git a/filter/source/config/fragments/types/generic_HTML.xcu b/filter/source/config/fragments/types/generic_HTML.xcu
index ede6d2b..58ffedc 100644
--- a/filter/source/config/fragments/types/generic_HTML.xcu
+++ b/filter/source/config/fragments/types/generic_HTML.xcu
@@ -16,7 +16,7 @@
  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 -->
     <node oor:name="generic_HTML" oor:op="replace" >
-        <prop oor:name="DetectService"><value>com.sun.star.text.FormatDetector</value></prop>
+        <prop oor:name="DetectService"><value>com.sun.star.comp.filters.HtmlFilterDetect</value></prop>
         <prop oor:name="URLPattern"><value>private:factory/swriter/web*</value></prop>
         <prop oor:name="Extensions"><value>html htm</value></prop>
         <prop oor:name="MediaType"><value>text/html</value></prop>
diff --git a/filter/source/htmlfilterdetect/fdcomp.cxx b/filter/source/htmlfilterdetect/fdcomp.cxx
new file mode 100644
index 0000000..40360e9
--- /dev/null
+++ b/filter/source/htmlfilterdetect/fdcomp.cxx
@@ -0,0 +1,36 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <sal/config.h>
+
+#include <cppuhelper/factory.hxx>
+#include <cppuhelper/implementationentry.hxx>
+#include <sal/types.h>
+
+#include "filterdetect.hxx"
+
+namespace {
+// Table passed to cppu::component_getFactoryHelper() below: one HtmlFilterDetect entry, then an all-zero entry.
+static cppu::ImplementationEntry const services[] = {
+    { &HtmlFilterDetect_createInstance, &HtmlFilterDetect_getImplementationName,
+      &HtmlFilterDetect_getSupportedServiceNames,
+      &cppu::createSingleComponentFactory, 0, 0 },
+    { 0, 0, 0, 0, 0, 0 }
+};
+
+}
+// extern "C" function that forwards its arguments plus the services table to cppu::component_getFactoryHelper().
+extern "C" SAL_DLLPUBLIC_EXPORT void * SAL_CALL htmlfd_component_getFactory(
+    char const * pImplName, void * pServiceManager, void * pRegistryKey)
+{
+    return cppu::component_getFactoryHelper(
+        pImplName, pServiceManager, pRegistryKey, services);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/filter/source/htmlfilterdetect/filterdetect.cxx b/filter/source/htmlfilterdetect/filterdetect.cxx
new file mode 100644
index 0000000..140912d
--- /dev/null
+++ b/filter/source/htmlfilterdetect/filterdetect.cxx
@@ -0,0 +1,232 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "filterdetect.hxx"
+
+#include <svtools/htmltokn.h>
+#include <tools/urlobj.hxx>
+#include <ucbhelper/content.hxx>
+#include <unotools/mediadescriptor.hxx>
+#include <unotools/ucbstreamhelper.hxx>
+
+#include <com/sun/star/io/XInputStream.hpp>
+#include <cppuhelper/supportsservice.hxx>
+
+#include <boost/scoped_ptr.hpp>
+
+using com::sun::star::io::XInputStream;
+using com::sun::star::uno::Sequence;
+using com::sun::star::uno::Reference;
+using com::sun::star::uno::Any;
+using com::sun::star::uno::XComponentContext;
+using com::sun::star::uno::XInterface;
+using com::sun::star::uno::Exception;
+using com::sun::star::uno::RuntimeException;
+using com::sun::star::ucb::XCommandEnvironment;
+
+using namespace com::sun::star;
+using namespace com::sun::star::beans;
+
+namespace {
+
+enum DetectPhase { // Scanner state of isHTMLStream()
+    BeforeTag, // No '<' seen yet
+    TagOpened, // '<' seen, tag name not yet started
+    InTagName  // Reading the tag name
+};
+
+bool isHTMLStream(const OString& aStreamHeader)
+{
+    // Returns true if the header is "<!" at its very start, or optional whitespace followed by
+    // '<' and a name GetHTMLToken() knows. detect() hands in the header already lower-cased.
+    const char* pHeader = aStreamHeader.getStr();
+    const int   nLength = aStreamHeader.getLength();
+    int nStartOfTagIndex = 0;
+    int i = 0;
+    DetectPhase dp = BeforeTag;
+    for ( i = 0; i < nLength; ++i, ++pHeader )
+    {
+        char c = *pHeader;
+        if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' )
+        {   // CR and FF are whitespace too; otherwise "<html\r\n" yields the unknown name "html\r"
+            if ( dp == TagOpened )
+                return false; // Invalid: Should start with a tag name
+            else if ( dp == InTagName )
+                break; // End of tag name reached
+        }
+        else if ( c == '<' )
+        {
+            if ( dp == BeforeTag )
+                dp = TagOpened;
+            else
+                return false; // Invalid: Nested '<'
+        }
+        else if ( c == '>' )
+        {
+            if ( dp == InTagName )
+                break; // End of tag name reached
+            else
+                return false; // Invalid: Empty tag or before '<'
+        }
+        else if ( c == '!' )
+        {
+            if ( i == 1 && dp == TagOpened )
+                return true; // "<!" at the very beginning of the file
+            else
+                return false; // Invalid: '!' before '<' or inside tag name
+        }
+        else
+        {
+            if ( dp == BeforeTag )
+                return false; // Invalid: Should start with a tag
+            else if ( dp == TagOpened )
+            {
+                nStartOfTagIndex = i;
+                dp = InTagName;
+            }
+        }
+    }
+
+    if ( dp != InTagName )
+        return false; // No tag name at all (empty, whitespace-only or a lone '<')
+
+    // The string following '<' has to be a known HTML token.
+    return GetHTMLToken( OStringToOUString( aStreamHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex ),
+                                            RTL_TEXTENCODING_ASCII_US ) ) != 0;
+}
+
+}
+// XExtendedFilterDetection: returns "generic_HTML" if the stream starts like HTML, else an empty string.
+OUString SAL_CALL HtmlFilterDetect::detect(Sequence<PropertyValue>& lDescriptor)
+    throw (RuntimeException)
+{
+    OUString sUrl;
+    OUString sDocService;
+    OString  resultString;
+    Reference<XInputStream> xInStream;
+
+    const PropertyValue *pValue = lDescriptor.getConstArray();
+    sal_Int32 nLength  = lDescriptor.getLength();
+    sal_Int32 location = nLength;
+    // "location" ends up as the index of the document-service entry, or stays nLength if there is none.
+    for ( sal_Int32 i = 0; i < nLength; ++i )
+    {
+        if ( pValue[i].Name == utl::MediaDescriptor::PROP_URL() )
+            pValue[i].Value >>= sUrl;
+        else if ( pValue[i].Name == utl::MediaDescriptor::PROP_INPUTSTREAM() )
+            pValue[i].Value >>= xInStream;
+        else if ( pValue[i].Name == utl::MediaDescriptor::PROP_DOCUMENTSERVICE() )
+        {
+            location = i;
+            pValue[i].Value >>= sDocService;
+        }
+    }
+    // Any UNO Exception thrown below is caught at the end and leads to the empty ("no match") result.
+    try
+    {
+        if ( !xInStream.is() )
+        {
+            ucbhelper::Content aContent( sUrl, Reference<XCommandEnvironment>(), mxCtx );
+            xInStream = aContent.openStream();
+            if ( !xInStream.is() )
+                return OUString();
+        }
+
+        boost::scoped_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
+        if ( !pInStream || pInStream->GetError() )
+            return OUString();
+        // NOTE(review): the position after StartReadingUnicodeText() serves as encoding hint (presumably BOM length) - confirm.
+        pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
+        sal_Size nUniPos = pInStream->Tell();
+
+        const sal_uInt16 nSize = 4096;
+
+        if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
+            resultString = read_uInt8s_ToOString( *pInStream, nSize );
+        else // UTF-16
+            resultString = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
+        // Scan a lower-cased copy of the header for a leading HTML tag.
+        if ( isHTMLStream( resultString.toAsciiLowerCase() ) )
+        {
+            // Some Apps/Web services use ".xls" extension to indicate that
+            // the given file should be opened by a spreadsheet software
+            if ( sDocService.isEmpty() )
+            {
+                INetURLObject aParser( sUrl );
+                OUString aExt = aParser.getExtension( INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET );
+                aExt = aExt.toAsciiLowerCase();
+
+                if ( aExt == "xls" )
+                {
+                    if ( location == lDescriptor.getLength() )
+                    {
+                        lDescriptor.realloc( location + 1 );
+                        lDescriptor[location].Name = utl::MediaDescriptor::PROP_DOCUMENTSERVICE();
+                    }
+                    lDescriptor[location].Value <<= OUString( "com.sun.star.sheet.SpreadsheetDocument" );
+                }
+            }
+            return OUString( "generic_HTML" );
+        }
+    }
+    catch (const Exception &)
+    {
+        OSL_FAIL( "An Exception occurred while opening File stream" );
+    }
+
+    return OUString(); // Failed
+}
+
+// XInitialization
+
+void SAL_CALL HtmlFilterDetect::initialize(const Sequence<Any>& /*aArguments*/)
+    throw (Exception, RuntimeException)
+{ // No-op: the arguments are not used.
+}
+
+OUString HtmlFilterDetect_getImplementationName()
+{ // Implementation name; also referenced from the services table in fdcomp.cxx.
+    return OUString( "com.sun.star.comp.filters.HtmlFilterDetect" );
+}
+
+Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames()
+{
+    // Two names: the ExtendedTypeDetection service and this implementation's own name.
+    Sequence<OUString> aNames(2);
+    aNames[0] = "com.sun.star.document.ExtendedTypeDetection";
+    aNames[1] = "com.sun.star.comp.filters.HtmlFilterDetect";
+    return aNames;
+}
+
+Reference<XInterface> HtmlFilterDetect_createInstance(const Reference<XComponentContext>& rCtx)
+{ // Creates a new detector for rCtx and returns it through its cppu::OWeakObject base.
+    return static_cast<cppu::OWeakObject*>( new HtmlFilterDetect( rCtx ) );
+}
+
+// XServiceInfo
+
+OUString SAL_CALL HtmlFilterDetect::getImplementationName()
+    throw (RuntimeException)
+{ // Forwards to the free function of the same purpose.
+    return HtmlFilterDetect_getImplementationName();
+}
+
+sal_Bool SAL_CALL HtmlFilterDetect::supportsService(const OUString& rServiceName)
+    throw (RuntimeException)
+{ // Delegates the check to the cppu::supportsService() helper.
+    return cppu::supportsService( this, rServiceName );
+}
+
+Sequence<OUString> SAL_CALL HtmlFilterDetect::getSupportedServiceNames()
+    throw (RuntimeException)
+{ // Forwards to the free function of the same purpose.
+    return HtmlFilterDetect_getSupportedServiceNames();
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/filter/source/htmlfilterdetect/filterdetect.hxx b/filter/source/htmlfilterdetect/filterdetect.hxx
new file mode 100644
index 0000000..631d4d3
--- /dev/null
+++ b/filter/source/htmlfilterdetect/filterdetect.hxx
@@ -0,0 +1,64 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#ifndef INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX
+#define INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX
+
+#include <com/sun/star/document/XExtendedFilterDetection.hpp>
+#include <com/sun/star/lang/XInitialization.hpp>
+#include <com/sun/star/lang/XServiceInfo.hpp>
+#include <com/sun/star/uno/XComponentContext.hpp>
+
+#include <cppuhelper/implbase3.hxx>
+// Filter-detection component for HTML: implements XExtendedFilterDetection, XInitialization and XServiceInfo.
+class HtmlFilterDetect : public cppu::WeakImplHelper3<
+    com::sun::star::document::XExtendedFilterDetection,
+    com::sun::star::lang::XInitialization,
+    com::sun::star::lang::XServiceInfo>
+{
+    com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext> mxCtx;
+    // mxCtx is set by the constructor; detect() passes it to ucbhelper::Content (see filterdetect.cxx).
+public:
+    // Stores the given component context in mxCtx.
+    HtmlFilterDetect(const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& xCtx) :
+        mxCtx(xCtx) {}
+    virtual ~HtmlFilterDetect() {}
+
+    // XExtendedFilterDetection
+    // Returns "generic_HTML" when the content looks like HTML, otherwise an empty string.
+    virtual OUString SAL_CALL detect(com::sun::star::uno::Sequence<com::sun::star::beans::PropertyValue>& lDescriptor)
+        throw (com::sun::star::uno::RuntimeException);
+
+    // XInitialization
+    // Defined as an empty function in filterdetect.cxx.
+    virtual void SAL_CALL initialize(const ::com::sun::star::uno::Sequence<com::sun::star::uno::Any>& aArguments)
+        throw (com::sun::star::uno::Exception, com::sun::star::uno::RuntimeException);
+
+    // XServiceInfo
+
+    virtual OUString SAL_CALL getImplementationName()
+        throw (com::sun::star::uno::RuntimeException);
+
+    virtual sal_Bool SAL_CALL supportsService(const OUString& ServiceName)
+        throw (com::sun::star::uno::RuntimeException);
+
+    virtual com::sun::star::uno::Sequence<OUString> SAL_CALL getSupportedServiceNames()
+        throw (com::sun::star::uno::RuntimeException);
+};
+
+OUString HtmlFilterDetect_getImplementationName();
+
+com::sun::star::uno::Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames();
+
+com::sun::star::uno::Reference<com::sun::star::uno::XInterface>
+HtmlFilterDetect_createInstance(const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& rCtx);
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/filter/source/htmlfilterdetect/htmlfd.component b/filter/source/htmlfilterdetect/htmlfd.component
new file mode 100644
index 0000000..32c41b8
--- /dev/null
+++ b/filter/source/htmlfilterdetect/htmlfd.component
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ -->
+
+<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@"
+    prefix="htmlfd" xmlns="http://openoffice.org/2010/uno-components">
+  <implementation name="com.sun.star.comp.filters.HtmlFilterDetect">
+    <service name="com.sun.star.document.ExtendedTypeDetection"/>
+  </implementation>
+</component>
diff --git a/postprocess/Rdb_services.mk b/postprocess/Rdb_services.mk
index cd8e3c9..b0c8a10 100755
--- a/postprocess/Rdb_services.mk
+++ b/postprocess/Rdb_services.mk
@@ -29,6 +29,7 @@ $(eval $(call gb_Rdb_add_components,services,\
 	filter/source/config/cache/filterconfig1 \
 	filter/source/flash/flash \
 	filter/source/graphic/graphicfilter \
+	filter/source/htmlfilterdetect/htmlfd \
 	filter/source/msfilter/msfilter \
 	filter/source/odfflatxml/odfflatxml \
 	filter/source/pdf/pdffilter \
diff --git a/solenv/gbuild/extensions/pre_MergedLibsList.mk b/solenv/gbuild/extensions/pre_MergedLibsList.mk
index 9cc2079..ba7ad86 100644
--- a/solenv/gbuild/extensions/pre_MergedLibsList.mk
+++ b/solenv/gbuild/extensions/pre_MergedLibsList.mk
@@ -46,6 +46,7 @@ gb_EXTRAMERGEDLIBS := \
 	graphicfilter \
 	guesslang \
 	$(if $(ENABLE_JAVA),hsqldb) \
+	htmlfd \
 	hyphen \
 	icd \
 	icg \


More information about the Libreoffice-commits mailing list