File indexing completed on 2025-01-12 13:05:51
0001 /* This file is part of the KDE project 0002 Copyright (C) 2001 Eva Brucherseifer <eva@kde.org> 0003 Copyright (C) 2005 Bram Schoenmakers <bramschoenmakers@kde.nl> 0004 based on kspread csv export filter by David Faure 0005 0006 This library is free software; you can redistribute it and/or 0007 modify it under the terms of the GNU Library General Public 0008 License as published by the Free Software Foundation; either 0009 version 2 of the License, or (at your option) any later version. 0010 0011 This library is distributed in the hope that it will be useful, 0012 but WITHOUT ANY WARRANTY; without even the implied warranty of 0013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0014 Library General Public License for more details. 0015 0016 You should have received a copy of the GNU Library General Public License 0017 along with this library; see the file COPYING.LIB. If not, write to 0018 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0019 * Boston, MA 02110-1301, USA. 0020 */ 0021 0022 #include "htmlimport.h" 0023 0024 #include "HtmlImportDebug.h" 0025 //#include <exportdialog.h> 0026 0027 #include <QFile> 0028 #include <QFileInfo> 0029 #include <QTextCodec> 0030 #include <QTextStream> 0031 #include <QByteArray> 0032 #include <QEventLoop> 0033 #include <kpluginfactory.h> 0034 #include <KoFilterChain.h> 0035 #include <KoXmlWriter.h> 0036 #include <KoOdfWriteStore.h> 0037 #include <KoGenStyles.h> 0038 #include <KoGenStyle.h> 0039 0040 #include <khtml_part.h> 0041 #include <khtmlview.h> 0042 #include <dom/dom_text.h> 0043 #include <dom/dom2_views.h> 0044 #include <dom/dom_doc.h> 0045 #include <dom/dom_element.h> 0046 #include <dom/dom_string.h> 0047 //#include <dom/html_table.h> 0048 //#include <dom/html_misc.h> 0049 0050 //using namespace Calligra::Sheets; 0051 0052 K_PLUGIN_FACTORY_WITH_JSON(HTMLImportFactory, "calligra_filter_html2ods.json", 0053 registerPlugin<HTMLImport>();) 0054 0055 HTMLImport::HTMLImport(QObject* parent, const QVariantList&) 0056 : KoFilter(parent) 0057 { 0058 } 0059 0060 HTMLImport::~HTMLImport() 0061 { 0062 } 0063 0064 KoFilter::ConversionStatus HTMLImport::convert(const QByteArray& from, const QByteArray& to) 0065 { 0066 if (to != "application/vnd.oasis.opendocument.spreadsheet" || from != "text/html") { 0067 warnHtml << "Invalid mimetypes " << to << " " << from; 0068 return KoFilter::NotImplemented; 0069 } 0070 0071 QString inputFile = m_chain->inputFile(); 0072 QString outputFile = m_chain->outputFile(); 0073 debugHtml<<"inputFile="<<inputFile<<"outputFile="<<outputFile; 0074 0075 // check if the inout file exists 0076 m_inputDir = QFileInfo(m_chain->inputFile()).dir(); 0077 if(!m_inputDir.exists()) 0078 return KoFilter::StupidError; 0079 0080 // create output store 0081 KoStore* storeout = KoStore::createStore(outputFile, KoStore::Write, "application/vnd.oasis.opendocument.spreadsheet", KoStore::Zip); 0082 if (!storeout) 0083 return KoFilter::FileNotFound; 0084 0085 KoOdfWriteStore oasisStore(storeout); 0086 m_manifestWriter = oasisStore.manifestWriter("application/vnd.oasis.opendocument.spreadsheet"); 0087 m_store = &oasisStore; 0088 0089 m_mainStyles = new KoGenStyles(); 0090 0091 KoXmlWriter* bodyWriter = m_store->bodyWriter(); 0092 m_store->contentWriter(); // we need to create the instance even if the contentWriter is not used 0093 0094 bodyWriter->startElement("office:body"); 0095 KoFilter::ConversionStatus result = loadUrl(QUrl::fromLocalFile(m_chain->inputFile())); 0096 if(result != KoFilter::OK) 0097 warnHtml << "Failed to load url=" << m_chain->inputFile(); 0098 bodyWriter->endElement(); // office:body 0099 0100 if(m_store->closeContentWriter()) 0101 m_manifestWriter->addManifestEntry("content.xml", "text/xml"); 0102 0103 if(createStyle()) 0104 m_manifestWriter->addManifestEntry("styles.xml", "text/xml"); 0105 0106 if(createMeta()) 0107 m_manifestWriter->addManifestEntry("meta.xml", "text/xml"); 0108 0109 m_store->closeManifestWriter(); 0110 delete storeout; 0111 m_manifestWriter = 0; 0112 m_store = 0; 0113 return result; 0114 } 0115 0116 bool HTMLImport::createStyle() 0117 { 0118 if (!m_store->store()->open("styles.xml")) 0119 return false; 0120 KoStoreDevice dev(m_store->store()); 0121 KoXmlWriter* stylesWriter = new KoXmlWriter(&dev); 0122 0123 stylesWriter->startDocument("office:document-styles"); 0124 stylesWriter->startElement("office:document-styles"); 0125 stylesWriter->addAttribute("xmlns:office", "urn:oasis:names:tc:opendocument:xmlns:office:1.0"); 0126 stylesWriter->addAttribute("xmlns:style", "urn:oasis:names:tc:opendocument:xmlns:style:1.0"); 0127 stylesWriter->addAttribute("xmlns:text", "urn:oasis:names:tc:opendocument:xmlns:text:1.0"); 0128 stylesWriter->addAttribute("xmlns:table", "urn:oasis:names:tc:opendocument:xmlns:table:1.0"); 0129 stylesWriter->addAttribute("xmlns:draw", "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"); 0130 stylesWriter->addAttribute("xmlns:fo", "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0"); 0131 stylesWriter->addAttribute("xmlns:svg", "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0"); 0132 stylesWriter->addAttribute("office:version", "1.0"); 0133 0134 m_mainStyles->saveOdfStyles(KoGenStyles::MasterStyles, stylesWriter); 0135 m_mainStyles->saveOdfStyles(KoGenStyles::DocumentStyles, stylesWriter); // office:style 0136 m_mainStyles->saveOdfStyles(KoGenStyles::DocumentAutomaticStyles, stylesWriter); // office:automatic-styles 0137 0138 stylesWriter->endElement(); // office:document-styles 0139 stylesWriter->endDocument(); 0140 0141 delete stylesWriter; 0142 return m_store->store()->close(); 0143 } 0144 0145 bool HTMLImport::createMeta() 0146 { 0147 if (!m_store->store()->open("meta.xml")) 0148 return false; 0149 0150 KoStoreDevice dev(m_store->store()); 0151 KoXmlWriter* metaWriter = new KoXmlWriter(&dev); 0152 metaWriter->startDocument("office:document-meta"); 0153 metaWriter->startElement("office:document-meta"); 0154 metaWriter->addAttribute("xmlns:office", "urn:oasis:names:tc:opendocument:xmlns:office:1.0"); 0155 metaWriter->addAttribute("xmlns:xlink", "http://www.w3.org/1999/xlink"); 0156 metaWriter->addAttribute("xmlns:dc", "http://purl.org/dc/elements/1.1/"); 0157 metaWriter->addAttribute("xmlns:meta", "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"); 0158 metaWriter->startElement("office:meta"); 0159 0160 //metaWriter->startElement("dc:title"); 0161 //metaWriter->addTextNode(workbook->property(Workbook::PIDSI_TITLE).toString()); 0162 //metaWriter->endElement(); 0163 0164 //metaWriter->startElement("dc:subject", false); 0165 //metaWriter->addTextNode(workbook->property(Workbook::PIDSI_SUBJECT).toString()); 0166 //metaWriter->endElement(); 0167 0168 metaWriter->endElement(); // office:meta 0169 metaWriter->endElement(); // office:document-meta 0170 metaWriter->endDocument(); 0171 0172 delete metaWriter; 0173 return m_store->store()->close(); 0174 } 0175 0176 KoFilter::ConversionStatus HTMLImport::loadUrl(const QUrl &url) 0177 { 0178 debugHtml << url; 0179 0180 KoXmlWriter* bodyWriter = m_store->bodyWriter(); 0181 //KoXmlWriter* contentWriter = m_store->contentWriter(); 0182 0183 QStringList sheets; 0184 { 0185 KHTMLPart html; 0186 html.view()->resize(600, 530); 0187 html.setAutoloadImages(false); 0188 html.setJScriptEnabled(false); 0189 html.setPluginsEnabled(false); 0190 html.setJavaEnabled(false); 0191 html.setMetaRefreshEnabled(false); 0192 0193 QEventLoop loop; 0194 connect(&html, SIGNAL(completed()), &loop, SLOT(quit())); 0195 QMetaObject::invokeMethod(&html,"openUrl", Qt::QueuedConnection, Q_ARG(QUrl,url)); 0196 //if (!html.openUrl(url)) { warnHtml << "Failed loadUrl" << url; return KoFilter::StupidError; } 0197 loop.exec(QEventLoop::ExcludeUserInputEvents); 0198 0199 // body 0200 DOM::Document doc = html.document(); 0201 DOM::NodeList body = doc.getElementsByTagName("body"); 0202 DOM::Node docbody = body.item(0); 0203 if (!docbody.isNull()) { 0204 m_states.push(InBody); 0205 bodyWriter->startElement("office:spreadsheet"); 0206 parseNode(docbody); 0207 bodyWriter->endElement(); // office:spreadsheet 0208 m_states.pop(); 0209 } 0210 0211 // frames 0212 DOM::NodeList frameset = doc.getElementsByTagName("frameset"); 0213 DOM::Node frame = frameset.item(0); 0214 if (!frame.isNull()) { 0215 for(uint i = 0; i < frameset.length(); ++i) { 0216 for (DOM::Node n = frameset.item(i).firstChild(); !n.isNull(); n = n.nextSibling()) { 0217 DOM::Element f = n; 0218 if(!f.isNull() && f.nodeName().lower() == "frame" && f.getAttribute("name").string() == "frSheet") 0219 sheets.append(f.getAttribute("src").string()); 0220 } 0221 } 0222 } 0223 } 0224 0225 // the KHTMLPart and DOM::Document are no more and we can call us recursivly now. 0226 if(!sheets.isEmpty()) { 0227 m_states.push(InFrameset); 0228 foreach(const QString &src, sheets) { 0229 const QUrl u = QUrl::fromLocalFile(QFileInfo(m_inputDir, src).absoluteFilePath()); 0230 loadUrl(u); 0231 } 0232 m_states.pop(); 0233 } 0234 0235 return KoFilter::OK; 0236 } 0237 0238 void HTMLImport::parseNode(DOM::Node node) 0239 { 0240 KoXmlWriter* bodyWriter = m_store->bodyWriter(); 0241 //KoXmlWriter* contentWriter = m_store->contentWriter(); 0242 0243 // check if this is a text node. 0244 DOM::Text t = node; 0245 if (!t.isNull()) { 0246 if(!m_states.isEmpty() && m_states.top() == InCell) { 0247 const QString s = t.data().string().trimmed(); 0248 if(!s.isEmpty()) { 0249 //debugHtml<<"TEXT tagname=" << node.nodeName() << "TEXT="<<t.data().string(); 0250 bodyWriter->addAttribute("office:value-type", "string"); 0251 bodyWriter->addAttribute("office:string-value", s); 0252 } 0253 } 0254 return; // no children anymore... 0255 } 0256 0257 DOM::DOMString tag = node.nodeName().lower(); 0258 0259 if(tag == "table") { 0260 m_states.push(InTable); 0261 bodyWriter->startElement("table:table"); 0262 0263 // hack to get some name defined 0264 static int sheetCount = 0; 0265 bodyWriter->addAttribute("table:name", QString("Sheet %1").arg(++sheetCount)); 0266 } 0267 else if(tag == "tr") { 0268 m_states.push(InRow); 0269 bodyWriter->startElement("table:table-row"); 0270 //xmlWriter->addAttribute("table:number-columns-spanned", ); 0271 //xmlWriter->addAttribute("table:number-rows-spanned", ); 0272 } 0273 else if(tag == "td") { 0274 m_states.push(InCell); 0275 bodyWriter->startElement("table:table-cell"); 0276 } else { 0277 m_states.push(InNone); 0278 } 0279 0280 //debugHtml<<"...START nodeName="<<node.nodeName(); 0281 0282 DOM::Element e = node; 0283 bool go_recursive = true; 0284 if (!e.isNull()) { 0285 //parseStyle(e); // get the CSS information 0286 go_recursive = parseTag(e); // get the tag information 0287 } 0288 if (go_recursive) { 0289 for (DOM::Node n = node.firstChild(); !n.isNull(); n = n.nextSibling()) { 0290 parseNode(n); 0291 } 0292 } 0293 0294 State state = m_states.pop(); 0295 if(state == InTable || state == InRow || state == InCell) { 0296 bodyWriter->endElement(); 0297 } 0298 0299 //debugHtml<<"...END nodeName="<<node.nodeName(); 0300 } 0301 0302 bool HTMLImport::parseTag(DOM::Element element) 0303 { 0304 DOM::DOMString tag = element.tagName().lower(); 0305 0306 // Don't handle the content of comment- or script-nodes. 0307 if (element.nodeType() == DOM::Node::COMMENT_NODE || tag == "script") { 0308 return false; 0309 } 0310 0311 return true; 0312 } 0313 0314 #include <htmlimport.moc>