479c87e
From df7ff240db01ee0e993c7cbc30d3370d6d1d0956 Mon Sep 17 00:00:00 2001
63d924a
From: David Tardon <dtardon@redhat.com>
63d924a
Date: Tue, 8 Jul 2014 17:01:27 +0200
63d924a
Subject: [PATCH] avoid problems detecting HTML files with .xls ext.
63d924a
479c87e
(cherry picked from commit 86c6f18c2766aad43d6e3bfcf3530e40440ebca7)
479c87e
Signed-off-by: David Tardon <dtardon@redhat.com>
479c87e
479c87e
Conflicts:
479c87e
	filter/source/textfilterdetect/filterdetect.cxx
479c87e
63d924a
Change-Id: I9955223aac20f3f640fde51bb7231666c269ca70
63d924a
---
479c87e
 filter/Configuration_filter.mk                     |   1 +
479c87e
 filter/source/config/fragments/types/calc_HTML.xcu |  35 ++++
479c87e
 sc/Library_scd.mk                                  |   1 +
479c87e
 sc/inc/htmlfilterdetect.hxx                        |  80 +++++++++
479c87e
 sc/source/filter/html/htmlfilterdetect.cxx         | 180 +++++++++++++++++++++
479c87e
 sc/source/ui/unoobj/detreg.cxx                     |   9 ++
479c87e
 sc/util/scd.component                              |   3 +
479c87e
 7 files changed, 309 insertions(+)
63d924a
 create mode 100644 filter/source/config/fragments/types/calc_HTML.xcu
479c87e
 create mode 100644 sc/inc/htmlfilterdetect.hxx
479c87e
 create mode 100644 sc/source/filter/html/htmlfilterdetect.cxx
63d924a
63d924a
diff --git a/filter/Configuration_filter.mk b/filter/Configuration_filter.mk
479c87e
index fe84350..36cf294 100644
63d924a
--- a/filter/Configuration_filter.mk
63d924a
+++ b/filter/Configuration_filter.mk
479c87e
@@ -514,6 +514,7 @@ $(call filter_Configuration_add_ui_filters,fcfg_langpack,filter/source/config/fr
63d924a
 $(call filter_Configuration_add_types,fcfg_langpack,fcfg_calc_types.xcu,filter/source/config/fragments/types,\
63d924a
 	calc_DIF \
63d924a
 	calc_ODS_FlatXML \
63d924a
+	calc_HTML \
63d924a
 	generic_HTML \
63d924a
 	generic_Text \
63d924a
 	calc_Lotus \
63d924a
diff --git a/filter/source/config/fragments/types/calc_HTML.xcu b/filter/source/config/fragments/types/calc_HTML.xcu
63d924a
new file mode 100644
479c87e
index 0000000..f4682da
63d924a
--- /dev/null
63d924a
+++ b/filter/source/config/fragments/types/calc_HTML.xcu
63d924a
@@ -0,0 +1,35 @@
63d924a
+<!--
63d924a
+ * This file is part of the LibreOffice project.
63d924a
+ *
63d924a
+ * This Source Code Form is subject to the terms of the Mozilla Public
63d924a
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
63d924a
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
63d924a
+ *
63d924a
+ * This file incorporates work covered by the following license notice:
63d924a
+ *
63d924a
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
63d924a
+ *   contributor license agreements. See the NOTICE file distributed
63d924a
+ *   with this work for additional information regarding copyright
63d924a
+ *   ownership. The ASF licenses this file to you under the Apache
63d924a
+ *   License, Version 2.0 (the "License"); you may not use this file
63d924a
+ *   except in compliance with the License. You may obtain a copy of
63d924a
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
63d924a
+-->
63d924a
+    <!-- HTML files are sometimes saved by other applications with an .xls
63d924a
+    extension. Allow to detect these early to avoid going through the
63d924a
+    whole list of detectors. This also avoids the risk of misdetection
63d924a
+    as something else, as there are some formats that are text files and
63d924a
+    the detection is just a heuristic (e.g., wp1 or wp42 supported by
63d924a
+    libwpd). -->
63d924a
+    <node oor:name="calc_HTML" oor:op="replace" >
479c87e
+        <prop oor:name="DetectService"><value>com.sun.star.comp.calc.HtmlFilterDetect</value></prop>
63d924a
+        <prop oor:name="URLPattern"/>
63d924a
+        <prop oor:name="Extensions"><value>xls</value></prop>
63d924a
+        <prop oor:name="MediaType"><value>text/html</value></prop>
63d924a
+        <prop oor:name="Preferred"><value>false</value></prop>
63d924a
+        <prop oor:name="PreferredFilter"/>
63d924a
+        <prop oor:name="UIName">
63d924a
+            <value>HTML Table</value>
63d924a
+        </prop>
63d924a
+        <prop oor:name="ClipboardFormat"/>
63d924a
+    </node>
479c87e
diff --git a/sc/Library_scd.mk b/sc/Library_scd.mk
479c87e
index 4d02ae1..1b4d035 100644
479c87e
--- a/sc/Library_scd.mk
479c87e
+++ b/sc/Library_scd.mk
479c87e
@@ -37,6 +37,7 @@ $(eval $(call gb_Library_use_libraries,scd,\
479c87e
 ))
479c87e
 
479c87e
 $(eval $(call gb_Library_add_exception_objects,scd,\
479c87e
+	sc/source/filter/html/htmlfilterdetect \
479c87e
 	sc/source/ui/unoobj/detreg \
479c87e
 	sc/source/ui/unoobj/scdetect \
479c87e
 	sc/source/ui/unoobj/exceldetect \
479c87e
diff --git a/sc/inc/htmlfilterdetect.hxx b/sc/inc/htmlfilterdetect.hxx
479c87e
new file mode 100644
479c87e
index 0000000..f131e89
479c87e
--- /dev/null
479c87e
+++ b/sc/inc/htmlfilterdetect.hxx
479c87e
@@ -0,0 +1,80 @@
479c87e
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
479c87e
+/*
479c87e
+ * This file is part of the LibreOffice project.
479c87e
+ *
479c87e
+ * This Source Code Form is subject to the terms of the Mozilla Public
479c87e
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
479c87e
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
479c87e
+ *
479c87e
+ * This file incorporates work covered by the following license notice:
479c87e
+ *
479c87e
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
479c87e
+ *   contributor license agreements. See the NOTICE file distributed
479c87e
+ *   with this work for additional information regarding copyright
479c87e
+ *   ownership. The ASF licenses this file to you under the Apache
479c87e
+ *   License, Version 2.0 (the "License"); you may not use this file
479c87e
+ *   except in compliance with the License. You may obtain a copy of
479c87e
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
479c87e
+ */
479c87e
+
479c87e
+#ifndef INCLUDED_SC_INC_HTMLFILTERDETECT_HXX
479c87e
+#define INCLUDED_SC_INC_HTMLFILTERDETECT_HXX
479c87e
+
479c87e
+#include <com/sun/star/document/XExtendedFilterDetection.hpp>
479c87e
+#include <com/sun/star/lang/XInitialization.hpp>
479c87e
+#include <com/sun/star/lang/XServiceInfo.hpp>
479c87e
+#include <com/sun/star/uno/XComponentContext.hpp>
479c87e
+
479c87e
+#include <cppuhelper/implbase3.hxx>
479c87e
+
479c87e
+namespace sc
479c87e
+{
479c87e
+
479c87e
+class HtmlFilterDetect : public cppu::WeakImplHelper3<
479c87e
+    com::sun::star::document::XExtendedFilterDetection,
479c87e
+    com::sun::star::lang::XInitialization,
479c87e
+    com::sun::star::lang::XServiceInfo>
479c87e
+{
479c87e
+    com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext> mxCxt;
479c87e
+
479c87e
+public:
479c87e
+
479c87e
+    HtmlFilterDetect (const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& xCxt);
479c87e
+    virtual ~HtmlFilterDetect();
479c87e
+
479c87e
+    // XExtendedFilterDetection
479c87e
+
479c87e
+    virtual OUString SAL_CALL detect(com::sun::star::uno::Sequence<com::sun::star::beans::PropertyValue>& lDescriptor)
479c87e
+            throw( com::sun::star::uno::RuntimeException, std::exception ) SAL_OVERRIDE;
479c87e
+
479c87e
+    // XInitialization
479c87e
+
479c87e
+    virtual void SAL_CALL initialize( const ::com::sun::star::uno::Sequence<com::sun::star::uno::Any>& aArguments)
479c87e
+        throw (com::sun::star::uno::Exception, com::sun::star::uno::RuntimeException, std::exception) SAL_OVERRIDE;
479c87e
+
479c87e
+    // XServiceInfo
479c87e
+
479c87e
+    virtual OUString SAL_CALL getImplementationName()
479c87e
+        throw (com::sun::star::uno::RuntimeException, std::exception) SAL_OVERRIDE;
479c87e
+
479c87e
+    virtual sal_Bool SAL_CALL supportsService(const OUString& ServiceName)
479c87e
+        throw (com::sun::star::uno::RuntimeException, std::exception) SAL_OVERRIDE;
479c87e
+
479c87e
+    virtual com::sun::star::uno::Sequence<OUString> SAL_CALL getSupportedServiceNames()
479c87e
+        throw (com::sun::star::uno::RuntimeException, std::exception) SAL_OVERRIDE;
479c87e
+};
479c87e
+
479c87e
+OUString HtmlFilterDetect_getImplementationName();
479c87e
+
479c87e
+bool HtmlFilterDetect_supportsService(const OUString& ServiceName);
479c87e
+
479c87e
+com::sun::star::uno::Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames();
479c87e
+
479c87e
+com::sun::star::uno::Reference<com::sun::star::uno::XInterface>
479c87e
+HtmlFilterDetect_createInstance(const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& rCxt);
479c87e
+
479c87e
+}
479c87e
+
479c87e
+#endif
479c87e
+
479c87e
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
479c87e
diff --git a/sc/source/filter/html/htmlfilterdetect.cxx b/sc/source/filter/html/htmlfilterdetect.cxx
479c87e
new file mode 100644
479c87e
index 0000000..f2f3db5
479c87e
--- /dev/null
479c87e
+++ b/sc/source/filter/html/htmlfilterdetect.cxx
479c87e
@@ -0,0 +1,180 @@
479c87e
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
479c87e
+/*
479c87e
+ * This file is part of the LibreOffice project.
479c87e
+ *
479c87e
+ * This Source Code Form is subject to the terms of the Mozilla Public
479c87e
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
479c87e
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
479c87e
+ */
479c87e
+
479c87e
+#include "htmlfilterdetect.hxx"
479c87e
+
479c87e
+#include <svtools/htmltokn.h>
479c87e
+#include <ucbhelper/content.hxx>
479c87e
+#include <unotools/mediadescriptor.hxx>
479c87e
+#include <unotools/ucbstreamhelper.hxx>
479c87e
+
479c87e
+#include <com/sun/star/lang/XMultiServiceFactory.hpp>
479c87e
+#include <com/sun/star/io/XInputStream.hpp>
479c87e
+#include <cppuhelper/supportsservice.hxx>
479c87e
+#include <boost/scoped_ptr.hpp>
479c87e
+
479c87e
+#define CALC_HTML_FILTER   "calc_HTML_WebQuery"
479c87e
+
479c87e
+namespace sc
479c87e
+{
479c87e
+
479c87e
+using namespace ::com::sun::star;
479c87e
+using utl::MediaDescriptor;
479c87e
+
479c87e
+namespace {
479c87e
+
479c87e
+bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
479c87e
+{
479c87e
+    boost::scoped_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
479c87e
+    if ( !pInStream || pInStream->GetError() )
479c87e
+        // No stream
479c87e
+        return false;
479c87e
+
479c87e
+    // Read the stream header
479c87e
+    pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
479c87e
+    const sal_Size nUniPos = pInStream->Tell();
479c87e
+    const sal_uInt16 nSize = 4096;
479c87e
+
479c87e
+    OString sHeader;
479c87e
+    if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
479c87e
+        sHeader = read_uInt8s_ToOString( *pInStream, nSize );
479c87e
+    else // UTF-16 (nUniPos = 2)
479c87e
+        sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
479c87e
+
479c87e
+    // Now check whether the stream begins with a known HTML tag.
479c87e
+    enum DetectPhase { BeforeTag, TagOpened, InTagName };
479c87e
+    DetectPhase dp = BeforeTag;
479c87e
+
479c87e
+    const char* pHeader = sHeader.getStr();
479c87e
+    const int   nLength = sHeader.getLength();
479c87e
+    int i = 0, nStartOfTagIndex = 0;
479c87e
+
479c87e
+    for ( i = 0; i < nLength; ++i, ++pHeader )
479c87e
+    {
479c87e
+        char c = *pHeader;
479c87e
+        if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' )
479c87e
+        {
479c87e
+            if ( dp == TagOpened )
479c87e
+                return false; // Invalid: Should start with a tag name
479c87e
+            else if ( dp == InTagName )
479c87e
+                break; // End of tag name reached
479c87e
+        }
479c87e
+        else if ( c == '<' )
479c87e
+        {
479c87e
+            if ( dp == BeforeTag )
479c87e
+                dp = TagOpened;
479c87e
+            else
479c87e
+                return false; // Invalid: Nested '<'
479c87e
+        }
479c87e
+        else if ( c == '>' )
479c87e
+        {
479c87e
+            if ( dp == InTagName )
479c87e
+                break; // End of tag name reached
479c87e
+            else
479c87e
+                return false; // Invalid: Empty tag or before '<'
479c87e
+        }
479c87e
+        else if ( c == '!' )
479c87e
+        {
479c87e
+            if ( dp == TagOpened )
479c87e
+                return true; // "<!" right after '<' (e.g. a DOCTYPE declaration or a comment): accept as HTML
479c87e
+            else
479c87e
+                return false; // Invalid: '!' before '<' or inside tag name
479c87e
+        }
479c87e
+        else
479c87e
+        {
479c87e
+            if ( dp == BeforeTag )
479c87e
+                return false; // Invalid: Should start with a tag
479c87e
+            else if ( dp == TagOpened )
479c87e
+            {
479c87e
+                nStartOfTagIndex = i;
479c87e
+                dp = InTagName;
479c87e
+            }
479c87e
+        }
479c87e
+    }
479c87e
+
479c87e
+    // The string following '<' has to be a known HTML token.
479c87e
+    OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex );
479c87e
+    if ( GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != 0 )
479c87e
+        return true;
479c87e
+
479c87e
+    return false;
479c87e
+}
479c87e
+
479c87e
+}
479c87e
+
479c87e
+HtmlFilterDetect::HtmlFilterDetect(const uno::Reference<uno::XComponentContext>& xCxt) :
479c87e
+    mxCxt(xCxt) {}
479c87e
+
479c87e
+HtmlFilterDetect::~HtmlFilterDetect() {}
479c87e
+
479c87e
+OUString SAL_CALL HtmlFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor) throw (uno::RuntimeException, std::exception)
479c87e
+{
479c87e
+    MediaDescriptor aMediaDesc(lDescriptor);
479c87e
+
479c87e
+    OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME(), OUString() );
479c87e
+
479c87e
+    uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY);
479c87e
+    if (!xInStream.is() || !IsHTMLStream(xInStream))
479c87e
+        return OUString();
479c87e
+
479c87e
+    aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER);
479c87e
+
479c87e
+    aMediaDesc >> lDescriptor;
479c87e
+    return aType;
479c87e
+}
479c87e
+
479c87e
+// XInitialization
479c87e
+
479c87e
+void SAL_CALL HtmlFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/)
479c87e
+    throw (uno::Exception, uno::RuntimeException, std::exception)
479c87e
+{
479c87e
+}
479c87e
+
479c87e
+OUString HtmlFilterDetect_getImplementationName()
479c87e
+{
479c87e
+    return OUString("com.sun.star.comp.calc.HtmlFilterDetect");
479c87e
+}
479c87e
+
479c87e
+uno::Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames()
479c87e
+{
479c87e
+    uno::Sequence<OUString> aRet(2);
479c87e
+    OUString* pArray = aRet.getArray();
479c87e
+    pArray[0] = "com.sun.star.document.ExtendedTypeDetection";
479c87e
+    pArray[1] = "com.sun.star.comp.filters.HtmlFilterDetect";
479c87e
+    return aRet;
479c87e
+}
479c87e
+
479c87e
+uno::Reference<uno::XInterface> HtmlFilterDetect_createInstance(
479c87e
+    const uno::Reference<uno::XComponentContext> & rCxt)
479c87e
+{
479c87e
+    return (cppu::OWeakObject*) new HtmlFilterDetect(rCxt);
479c87e
+}
479c87e
+
479c87e
+// XServiceInfo
479c87e
+OUString SAL_CALL HtmlFilterDetect::getImplementationName()
479c87e
+    throw (uno::RuntimeException, std::exception)
479c87e
+{
479c87e
+    return HtmlFilterDetect_getImplementationName();
479c87e
+}
479c87e
+
479c87e
+sal_Bool SAL_CALL HtmlFilterDetect::supportsService(const OUString& rServiceName)
479c87e
+    throw (uno::RuntimeException, std::exception)
479c87e
+{
479c87e
+    return cppu::supportsService(this, rServiceName);
479c87e
+}
479c87e
+
479c87e
+uno::Sequence<OUString> SAL_CALL HtmlFilterDetect::getSupportedServiceNames()
479c87e
+    throw (uno::RuntimeException, std::exception)
479c87e
+{
479c87e
+    return HtmlFilterDetect_getSupportedServiceNames();
479c87e
+}
479c87e
+
479c87e
+}
479c87e
+
479c87e
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
479c87e
diff --git a/sc/source/ui/unoobj/detreg.cxx b/sc/source/ui/unoobj/detreg.cxx
479c87e
index 6edc743..f840ac1 100644
479c87e
--- a/sc/source/ui/unoobj/detreg.cxx
479c87e
+++ b/sc/source/ui/unoobj/detreg.cxx
479c87e
@@ -18,6 +18,7 @@
479c87e
  */
479c87e
 
479c87e
 
479c87e
+#include "htmlfilterdetect.hxx"
479c87e
 #include "scdetect.hxx"
479c87e
 #include "exceldetect.hxx"
479c87e
 #include <cppuhelper/implementationentry.hxx>
479c87e
@@ -42,6 +43,14 @@ static const cppu::ImplementationEntry spServices[] =
479c87e
         0, 0
479c87e
     },
63d924a
 
479c87e
+    {
479c87e
+        sc::HtmlFilterDetect_createInstance,
479c87e
+        sc::HtmlFilterDetect_getImplementationName,
479c87e
+        sc::HtmlFilterDetect_getSupportedServiceNames,
479c87e
+        cppu::createSingleComponentFactory,
479c87e
+        0, 0
479c87e
+    },
479c87e
+
479c87e
     { 0, 0, 0, 0, 0, 0 }
479c87e
 };
63d924a
 
479c87e
diff --git a/sc/util/scd.component b/sc/util/scd.component
479c87e
index 767429a..76ed959 100644
479c87e
--- a/sc/util/scd.component
479c87e
+++ b/sc/util/scd.component
479c87e
@@ -25,4 +25,7 @@
479c87e
   <implementation name="com.sun.star.comp.calc.ExcelBiffFormatDetector">
479c87e
     <service name="com.sun.star.frame.ExtendedTypeDetection"/>
479c87e
   </implementation>
479c87e
+  <implementation name="com.sun.star.comp.calc.HtmlFilterDetect">
479c87e
+    <service name="com.sun.star.frame.ExtendedTypeDetection"/>
479c87e
+  </implementation>
479c87e
 </component>
63d924a
-- 
63d924a
1.9.3
63d924a