File indexing completed on 2025-01-12 13:05:51

0001 /* This file is part of the KDE project
0002    Copyright (C) 2001 Eva Brucherseifer <eva@kde.org>
0003    Copyright (C) 2005 Bram Schoenmakers <bramschoenmakers@kde.nl>
0004    based on kspread csv export filter by David Faure
0005 
0006    This library is free software; you can redistribute it and/or
0007    modify it under the terms of the GNU Library General Public
0008    License as published by the Free Software Foundation; either
0009    version 2 of the License, or (at your option) any later version.
0010 
0011    This library is distributed in the hope that it will be useful,
0012    but WITHOUT ANY WARRANTY; without even the implied warranty of
0013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0014    Library General Public License for more details.
0015 
0016    You should have received a copy of the GNU Library General Public License
0017    along with this library; see the file COPYING.LIB.  If not, write to
0018    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0019  * Boston, MA 02110-1301, USA.
0020 */
0021 
0022 #include "htmlimport.h"
0023 
0024 #include "HtmlImportDebug.h"
0025 //#include <exportdialog.h>
0026 
0027 #include <QFile>
0028 #include <QFileInfo>
0029 #include <QTextCodec>
0030 #include <QTextStream>
0031 #include <QByteArray>
0032 #include <QEventLoop>
0033 #include <kpluginfactory.h>
0034 #include <KoFilterChain.h>
0035 #include <KoXmlWriter.h>
0036 #include <KoOdfWriteStore.h>
0037 #include <KoGenStyles.h>
0038 #include <KoGenStyle.h>
0039 
0040 #include <khtml_part.h>
0041 #include <khtmlview.h>
0042 #include <dom/dom_text.h>
0043 #include <dom/dom2_views.h>
0044 #include <dom/dom_doc.h>
0045 #include <dom/dom_element.h>
0046 #include <dom/dom_string.h>
0047 //#include <dom/html_table.h>
0048 //#include <dom/html_misc.h>
0049 
0050 //using namespace Calligra::Sheets;
0051 
// Declares the plugin factory class HTMLImportFactory and registers HTMLImport
// with it; "calligra_filter_html2ods.json" is the metadata file handed to the macro.
K_PLUGIN_FACTORY_WITH_JSON(HTMLImportFactory, "calligra_filter_html2ods.json",
                           registerPlugin<HTMLImport>();)
0054 
0055 HTMLImport::HTMLImport(QObject* parent, const QVariantList&)
0056     : KoFilter(parent)
0057 {
0058 }
0059 
0060 HTMLImport::~HTMLImport()
0061 {
0062 }
0063 
0064 KoFilter::ConversionStatus HTMLImport::convert(const QByteArray& from, const QByteArray& to)
0065 {
0066     if (to != "application/vnd.oasis.opendocument.spreadsheet" || from != "text/html") {
0067         warnHtml << "Invalid mimetypes " << to << " " << from;
0068         return KoFilter::NotImplemented;
0069     }
0070 
0071     QString inputFile = m_chain->inputFile();
0072     QString outputFile = m_chain->outputFile();
0073     debugHtml<<"inputFile="<<inputFile<<"outputFile="<<outputFile;
0074 
0075     // check if the inout file exists
0076     m_inputDir = QFileInfo(m_chain->inputFile()).dir();
0077     if(!m_inputDir.exists())
0078         return KoFilter::StupidError;
0079 
0080     // create output store
0081     KoStore* storeout = KoStore::createStore(outputFile, KoStore::Write, "application/vnd.oasis.opendocument.spreadsheet", KoStore::Zip);
0082     if (!storeout)
0083         return KoFilter::FileNotFound;
0084 
0085     KoOdfWriteStore oasisStore(storeout);
0086     m_manifestWriter = oasisStore.manifestWriter("application/vnd.oasis.opendocument.spreadsheet");
0087     m_store = &oasisStore;
0088 
0089     m_mainStyles = new KoGenStyles();
0090 
0091     KoXmlWriter* bodyWriter = m_store->bodyWriter();
0092     m_store->contentWriter(); // we need to create the instance even if the contentWriter is not used
0093 
0094     bodyWriter->startElement("office:body");
0095     KoFilter::ConversionStatus result = loadUrl(QUrl::fromLocalFile(m_chain->inputFile()));
0096     if(result != KoFilter::OK)
0097         warnHtml << "Failed to load url=" << m_chain->inputFile();
0098     bodyWriter->endElement(); // office:body
0099 
0100     if(m_store->closeContentWriter())
0101         m_manifestWriter->addManifestEntry("content.xml", "text/xml");
0102 
0103     if(createStyle())
0104         m_manifestWriter->addManifestEntry("styles.xml", "text/xml");
0105 
0106     if(createMeta())
0107         m_manifestWriter->addManifestEntry("meta.xml", "text/xml");
0108 
0109     m_store->closeManifestWriter();
0110     delete storeout;
0111     m_manifestWriter = 0;
0112     m_store = 0;
0113     return result;
0114 }
0115 
0116 bool HTMLImport::createStyle()
0117 {
0118     if (!m_store->store()->open("styles.xml"))
0119         return false;
0120     KoStoreDevice dev(m_store->store());
0121     KoXmlWriter* stylesWriter = new KoXmlWriter(&dev);
0122 
0123     stylesWriter->startDocument("office:document-styles");
0124     stylesWriter->startElement("office:document-styles");
0125     stylesWriter->addAttribute("xmlns:office", "urn:oasis:names:tc:opendocument:xmlns:office:1.0");
0126     stylesWriter->addAttribute("xmlns:style", "urn:oasis:names:tc:opendocument:xmlns:style:1.0");
0127     stylesWriter->addAttribute("xmlns:text", "urn:oasis:names:tc:opendocument:xmlns:text:1.0");
0128     stylesWriter->addAttribute("xmlns:table", "urn:oasis:names:tc:opendocument:xmlns:table:1.0");
0129     stylesWriter->addAttribute("xmlns:draw", "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0");
0130     stylesWriter->addAttribute("xmlns:fo", "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0");
0131     stylesWriter->addAttribute("xmlns:svg", "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0");
0132     stylesWriter->addAttribute("office:version", "1.0");
0133 
0134     m_mainStyles->saveOdfStyles(KoGenStyles::MasterStyles, stylesWriter);
0135     m_mainStyles->saveOdfStyles(KoGenStyles::DocumentStyles, stylesWriter); // office:style
0136     m_mainStyles->saveOdfStyles(KoGenStyles::DocumentAutomaticStyles, stylesWriter); // office:automatic-styles
0137 
0138     stylesWriter->endElement();  // office:document-styles
0139     stylesWriter->endDocument();
0140 
0141     delete stylesWriter;
0142     return m_store->store()->close();
0143 }
0144 
0145 bool HTMLImport::createMeta()
0146 {
0147     if (!m_store->store()->open("meta.xml"))
0148         return false;
0149 
0150     KoStoreDevice dev(m_store->store());
0151     KoXmlWriter* metaWriter = new KoXmlWriter(&dev);
0152     metaWriter->startDocument("office:document-meta");
0153     metaWriter->startElement("office:document-meta");
0154     metaWriter->addAttribute("xmlns:office", "urn:oasis:names:tc:opendocument:xmlns:office:1.0");
0155     metaWriter->addAttribute("xmlns:xlink", "http://www.w3.org/1999/xlink");
0156     metaWriter->addAttribute("xmlns:dc", "http://purl.org/dc/elements/1.1/");
0157     metaWriter->addAttribute("xmlns:meta", "urn:oasis:names:tc:opendocument:xmlns:meta:1.0");
0158     metaWriter->startElement("office:meta");
0159 
0160     //metaWriter->startElement("dc:title");
0161     //metaWriter->addTextNode(workbook->property(Workbook::PIDSI_TITLE).toString());
0162     //metaWriter->endElement();
0163 
0164     //metaWriter->startElement("dc:subject", false);
0165     //metaWriter->addTextNode(workbook->property(Workbook::PIDSI_SUBJECT).toString());
0166     //metaWriter->endElement();
0167 
0168     metaWriter->endElement(); // office:meta
0169     metaWriter->endElement(); // office:document-meta
0170     metaWriter->endDocument();
0171 
0172     delete metaWriter;
0173     return m_store->store()->close();
0174 }
0175 
0176 KoFilter::ConversionStatus HTMLImport::loadUrl(const QUrl &url)
0177 {
0178     debugHtml << url;
0179 
0180     KoXmlWriter* bodyWriter = m_store->bodyWriter();
0181     //KoXmlWriter* contentWriter = m_store->contentWriter();
0182 
0183     QStringList sheets;
0184     {
0185         KHTMLPart html;
0186         html.view()->resize(600, 530);
0187         html.setAutoloadImages(false);
0188         html.setJScriptEnabled(false);
0189         html.setPluginsEnabled(false);
0190         html.setJavaEnabled(false);
0191         html.setMetaRefreshEnabled(false);
0192 
0193         QEventLoop loop;
0194         connect(&html, SIGNAL(completed()), &loop, SLOT(quit()));
0195         QMetaObject::invokeMethod(&html,"openUrl", Qt::QueuedConnection, Q_ARG(QUrl,url));
0196         //if (!html.openUrl(url)) { warnHtml << "Failed loadUrl" << url; return KoFilter::StupidError; }
0197         loop.exec(QEventLoop::ExcludeUserInputEvents);
0198 
0199         // body
0200         DOM::Document doc = html.document();
0201         DOM::NodeList body = doc.getElementsByTagName("body");
0202         DOM::Node docbody = body.item(0);
0203         if (!docbody.isNull()) {
0204             m_states.push(InBody);
0205             bodyWriter->startElement("office:spreadsheet");
0206             parseNode(docbody);
0207             bodyWriter->endElement(); // office:spreadsheet
0208             m_states.pop();
0209         }
0210 
0211         // frames
0212         DOM::NodeList frameset = doc.getElementsByTagName("frameset");
0213         DOM::Node frame = frameset.item(0);
0214         if (!frame.isNull()) {
0215             for(uint i = 0; i < frameset.length(); ++i) {
0216                 for (DOM::Node n = frameset.item(i).firstChild(); !n.isNull(); n = n.nextSibling()) {
0217                     DOM::Element f = n;
0218                     if(!f.isNull() && f.nodeName().lower() == "frame" && f.getAttribute("name").string() == "frSheet")
0219                         sheets.append(f.getAttribute("src").string());
0220                 }
0221             }
0222         }
0223     }
0224 
0225     // the KHTMLPart and DOM::Document are no more and we can call us recursivly now.
0226     if(!sheets.isEmpty()) {
0227         m_states.push(InFrameset);
0228         foreach(const QString &src, sheets) {
0229             const QUrl u = QUrl::fromLocalFile(QFileInfo(m_inputDir, src).absoluteFilePath());
0230             loadUrl(u);
0231         }
0232         m_states.pop();
0233     }
0234 
0235     return KoFilter::OK;
0236 }
0237 
0238 void HTMLImport::parseNode(DOM::Node node)
0239 {
0240     KoXmlWriter* bodyWriter = m_store->bodyWriter();
0241     //KoXmlWriter* contentWriter = m_store->contentWriter();
0242 
0243     // check if this is a text node.
0244     DOM::Text t = node;
0245     if (!t.isNull()) {
0246         if(!m_states.isEmpty() && m_states.top() == InCell) {
0247             const QString s = t.data().string().trimmed();
0248             if(!s.isEmpty()) {
0249                 //debugHtml<<"TEXT tagname=" << node.nodeName() << "TEXT="<<t.data().string();
0250                 bodyWriter->addAttribute("office:value-type", "string");
0251                 bodyWriter->addAttribute("office:string-value", s);
0252             }
0253         }
0254         return; // no children anymore...
0255     }
0256 
0257     DOM::DOMString tag = node.nodeName().lower();
0258 
0259     if(tag == "table") {
0260         m_states.push(InTable);
0261         bodyWriter->startElement("table:table");
0262 
0263         // hack to get some name defined
0264         static int sheetCount = 0;
0265         bodyWriter->addAttribute("table:name", QString("Sheet %1").arg(++sheetCount));
0266     }
0267     else if(tag == "tr") {
0268         m_states.push(InRow);
0269         bodyWriter->startElement("table:table-row");
0270         //xmlWriter->addAttribute("table:number-columns-spanned", );
0271         //xmlWriter->addAttribute("table:number-rows-spanned", );
0272     }
0273     else if(tag == "td") {
0274         m_states.push(InCell);
0275         bodyWriter->startElement("table:table-cell");
0276     } else {
0277         m_states.push(InNone);
0278     }
0279 
0280     //debugHtml<<"...START nodeName="<<node.nodeName();
0281 
0282     DOM::Element e = node;
0283     bool go_recursive = true;
0284     if (!e.isNull()) {
0285         //parseStyle(e); // get the CSS information
0286         go_recursive = parseTag(e); // get the tag information
0287     }
0288     if (go_recursive) {
0289         for (DOM::Node n = node.firstChild(); !n.isNull(); n = n.nextSibling()) {
0290             parseNode(n);
0291         }
0292     }
0293 
0294     State state = m_states.pop();
0295     if(state == InTable || state == InRow || state == InCell) {
0296         bodyWriter->endElement();
0297     }
0298 
0299     //debugHtml<<"...END nodeName="<<node.nodeName();
0300 }
0301 
0302 bool HTMLImport::parseTag(DOM::Element element)
0303 {
0304     DOM::DOMString tag = element.tagName().lower();
0305 
0306     // Don't handle the content of comment- or script-nodes.
0307     if (element.nodeType() == DOM::Node::COMMENT_NODE || tag == "script") {
0308         return false;
0309     }
0310 
0311     return true;
0312 }
0313 
0314 #include <htmlimport.moc>