File indexing completed on 2025-03-16 12:49:36
0001 /* 0002 This file is part of a KMetaData File Extractor 0003 SPDX-FileCopyrightText: 2013 Denis Steckelmacher <steckdenis@yahoo.fr> 0004 0005 SPDX-License-Identifier: LGPL-2.1-or-later 0006 */ 0007 0008 #include "officeextractor.h" 0009 #include "kfilemetadata_debug.h" 0010 0011 #include <QRegularExpression> 0012 #include <QStandardPaths> 0013 0014 #include <QProcess> 0015 0016 using namespace KFileMetaData; 0017 0018 OfficeExtractor::OfficeExtractor(QObject* parent) 0019 : ExtractorPlugin(parent) 0020 { 0021 // Find the executables of catdoc, catppt and xls2csv. If an executable cannot 0022 // be found, indexing its corresponding MIME type will be disabled 0023 findExe(QStringLiteral("application/msword"), QStringLiteral("catdoc"), m_catdoc); 0024 findExe(QStringLiteral("application/vnd.ms-excel"), QStringLiteral("xls2csv"), m_xls2csv); 0025 findExe(QStringLiteral("application/vnd.ms-powerpoint"), QStringLiteral("catppt"), m_catppt); 0026 } 0027 0028 void OfficeExtractor::findExe(const QString& mimeType, const QString& name, QString& fullPath) 0029 { 0030 fullPath = QStandardPaths::findExecutable(name); 0031 0032 if (!fullPath.isEmpty()) { 0033 m_available_mime_types << mimeType; 0034 } 0035 } 0036 0037 QStringList OfficeExtractor::mimetypes() const 0038 { 0039 return m_available_mime_types; 0040 } 0041 0042 0043 void OfficeExtractor::extract(ExtractionResult* result) 0044 { 0045 QStringList args; 0046 QString contents; 0047 0048 args << QStringLiteral("-s") << QStringLiteral("cp1252"); // FIXME: Store somewhere a map between the user's language and the encoding of the Windows files it may use ? 0049 args << QStringLiteral("-d") << QStringLiteral("utf8"); 0050 0051 const QString fileUrl = result->inputUrl(); 0052 const QString mimeType = result->inputMimetype(); 0053 if (mimeType == QLatin1String("application/msword")) { 0054 result->addType(Type::Document); 0055 0056 args << QStringLiteral("-w"); 0057 contents = textFromFile(fileUrl, m_catdoc, args); 0058 0059 // Now that we have the plain text content, count words, lines and characters 0060 // (original code from plaintextextractor.cpp, authored by Vishesh Handa) 0061 int lines = contents.count(QLatin1Char('\n')); 0062 int words = contents.count(QRegularExpression(QStringLiteral("\\b\\w+\\b"), QRegularExpression::UseUnicodePropertiesOption)); 0063 0064 result->add(Property::WordCount, words); 0065 result->add(Property::LineCount, lines); 0066 } else if (mimeType == QLatin1String("application/vnd.ms-excel")) { 0067 result->addType(Type::Document); 0068 result->addType(Type::Spreadsheet); 0069 0070 args << QStringLiteral("-c") << QStringLiteral(" "); 0071 args << QStringLiteral("-b") << QStringLiteral(" "); 0072 args << QStringLiteral("-q") << QStringLiteral("0"); 0073 contents = textFromFile(fileUrl, m_xls2csv, args); 0074 } else if (mimeType == QLatin1String("application/vnd.ms-powerpoint")) { 0075 result->addType(Type::Document); 0076 result->addType(Type::Presentation); 0077 0078 contents = textFromFile(fileUrl, m_catppt, args); 0079 } 0080 0081 if (contents.isEmpty()) { 0082 return; 0083 } 0084 0085 result->append(contents); 0086 0087 return; 0088 } 0089 0090 QString OfficeExtractor::textFromFile(const QString& fileUrl, const QString& command, QStringList& arguments) 0091 { 0092 const QString exec = QStandardPaths::findExecutable(command); 0093 if (exec.isEmpty()) { 0094 qCDebug(KFILEMETADATA_LOG) << "Could not find executable in PATH:" << command; 0095 return {}; 0096 } 0097 0098 arguments << fileUrl; 0099 0100 // Start a process and read its standard output 0101 QProcess process; 0102 0103 process.setReadChannel(QProcess::StandardOutput); 0104 process.start(exec, arguments, QIODevice::ReadOnly); 0105 process.waitForFinished(); 0106 0107 if (process.exitStatus() != QProcess::NormalExit || process.exitCode() != 0) { 0108 return QString(); 0109 } else { 0110 return QString::fromUtf8(process.readAll()); 0111 } 0112 } 0113 0114 #include "moc_officeextractor.cpp"