File indexing completed on 2024-05-12 15:37:06

0001 /*
0002     This file is part of a KMetaData File Extractor
0003     SPDX-FileCopyrightText: 2013 Denis Steckelmacher <steckdenis@yahoo.fr>
0004 
0005     SPDX-License-Identifier: LGPL-2.1-or-later
0006 */
0007 
0008 #include "officeextractor.h"
0009 #include "kfilemetadata_debug.h"
0010 
0011 #include <QRegularExpression>
0012 #include <QStandardPaths>
0013 
0014 #include <QProcess>
0015 
0016 using namespace KFileMetaData;
0017 
0018 OfficeExtractor::OfficeExtractor(QObject* parent)
0019     : ExtractorPlugin(parent)
0020 {
0021     // Find the executables of catdoc, catppt and xls2csv. If an executable cannot
0022     // be found, indexing its corresponding MIME type will be disabled
0023     findExe(QStringLiteral("application/msword"), QStringLiteral("catdoc"), m_catdoc);
0024     findExe(QStringLiteral("application/vnd.ms-excel"), QStringLiteral("xls2csv"), m_xls2csv);
0025     findExe(QStringLiteral("application/vnd.ms-powerpoint"), QStringLiteral("catppt"), m_catppt);
0026 }
0027 
0028 void OfficeExtractor::findExe(const QString& mimeType, const QString& name, QString& fullPath)
0029 {
0030     fullPath = QStandardPaths::findExecutable(name);
0031 
0032     if (!fullPath.isEmpty()) {
0033         m_available_mime_types << mimeType;
0034     }
0035 }
0036 
0037 QStringList OfficeExtractor::mimetypes() const
0038 {
0039     return m_available_mime_types;
0040 }
0041 
0042 
0043 void OfficeExtractor::extract(ExtractionResult* result)
0044 {
0045     QStringList args;
0046     QString contents;
0047 
0048     args << QStringLiteral("-s") << QStringLiteral("cp1252"); // FIXME: Store somewhere a map between the user's language and the encoding of the Windows files it may use ?
0049     args << QStringLiteral("-d") << QStringLiteral("utf8");
0050 
0051     const QString fileUrl = result->inputUrl();
0052     const QString mimeType = result->inputMimetype();
0053     if (mimeType == QLatin1String("application/msword")) {
0054         result->addType(Type::Document);
0055 
0056         args << QStringLiteral("-w");
0057         contents = textFromFile(fileUrl, m_catdoc, args);
0058 
0059         // Now that we have the plain text content, count words, lines and characters
0060         // (original code from plaintextextractor.cpp, authored by Vishesh Handa)
0061         int lines = contents.count(QLatin1Char('\n'));
0062         int words = contents.count(QRegularExpression(QStringLiteral("\\b\\w+\\b"), QRegularExpression::UseUnicodePropertiesOption));
0063 
0064         result->add(Property::WordCount, words);
0065         result->add(Property::LineCount, lines);
0066     } else if (mimeType == QLatin1String("application/vnd.ms-excel")) {
0067         result->addType(Type::Document);
0068         result->addType(Type::Spreadsheet);
0069 
0070         args << QStringLiteral("-c") << QStringLiteral(" ");
0071         args << QStringLiteral("-b") << QStringLiteral(" ");
0072         args << QStringLiteral("-q") << QStringLiteral("0");
0073         contents = textFromFile(fileUrl, m_xls2csv, args);
0074     } else if (mimeType == QLatin1String("application/vnd.ms-powerpoint")) {
0075         result->addType(Type::Document);
0076         result->addType(Type::Presentation);
0077 
0078         contents = textFromFile(fileUrl, m_catppt, args);
0079     }
0080 
0081     if (contents.isEmpty()) {
0082         return;
0083     }
0084 
0085     result->append(contents);
0086 
0087     return;
0088 }
0089 
0090 QString OfficeExtractor::textFromFile(const QString& fileUrl, const QString& command, QStringList& arguments)
0091 {
0092     const QString exec = QStandardPaths::findExecutable(command);
0093     if (exec.isEmpty()) {
0094         qCDebug(KFILEMETADATA_LOG) << "Could not find executable in PATH:" << command;
0095         return {};
0096     }
0097 
0098     arguments << fileUrl;
0099 
0100     // Start a process and read its standard output
0101     QProcess process;
0102 
0103     process.setReadChannel(QProcess::StandardOutput);
0104     process.start(exec, arguments, QIODevice::ReadOnly);
0105     process.waitForFinished();
0106 
0107     if (process.exitStatus() != QProcess::NormalExit || process.exitCode() != 0) {
0108         return QString();
0109     } else {
0110         return QString::fromUtf8(process.readAll());
0111     }
0112 }
0113 
0114 #include "moc_officeextractor.cpp"