File indexing completed on 2024-05-12 15:37:06

0001 /*
0002     SPDX-FileCopyrightText: 2012 Vishesh Handa <me@vhanda.in>
0003 
0004     SPDX-License-Identifier: LGPL-2.1-or-later
0005 */
0006 
0007 
0008 #include "plaintextextractor.h"
0009 
0010 #include <QFile>
0011 #include <QTextCodec>
0012 #include <QDebug>
0013 
0014 #include <fstream>
0015 
0016 #if defined(Q_OS_LINUX) || defined(__GLIBC__)
0017     #include <sys/types.h>
0018     #include <sys/stat.h>
0019     #include <fcntl.h>
0020     #include <unistd.h>
0021 #endif
0022 
0023 using namespace KFileMetaData;
0024 
0025 PlainTextExtractor::PlainTextExtractor(QObject* parent)
0026     : ExtractorPlugin(parent)
0027 {
0028 
0029 }
0030 
0031 const QStringList supportedMimeTypes = {
0032     QStringLiteral("text/plain"),
0033 };
0034 
0035 QStringList PlainTextExtractor::mimetypes() const
0036 {
0037     return supportedMimeTypes;
0038 }
0039 
0040 void PlainTextExtractor::extract(ExtractionResult* result)
0041 {
0042 #if defined(Q_OS_LINUX) || defined(__GLIBC__)
0043     QByteArray filePath = QFile::encodeName(result->inputUrl());
0044 
0045 #ifdef O_NOATIME
0046     int fd = open(filePath.constData(), O_RDONLY | O_NOATIME);
0047     if (fd < 0)
0048 #else
0049     int fd;
0050 #endif
0051     {
0052         fd = open(filePath.constData(), O_RDONLY);
0053     }
0054 
0055     if (fd < 0) {
0056         return;
0057     }
0058 
0059     result->addType(Type::Text);
0060     if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
0061         close(fd);
0062         return;
0063     }
0064 
0065     QTextCodec* codec = QTextCodec::codecForLocale();
0066 
0067     char* line = nullptr;
0068     size_t len = 0;
0069     int lines = 0;
0070     int r = 0;
0071 
0072     FILE* fp = fdopen(fd, "r");
0073 
0074     while ( (r = getline(&line, &len, fp)) != -1) {
0075         QTextCodec::ConverterState state;
0076         QString text = codec->toUnicode(line, r - 1, &state);
0077 
0078         if (state.invalidChars > 0) {
0079             qDebug() << "Invalid encoding. Ignoring" << result->inputUrl();
0080             free(line);
0081             close(fd);
0082             return;
0083         }
0084         result->append(text);
0085 
0086         lines += 1;
0087     }
0088     if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
0089         result->add(Property::LineCount, lines);
0090     }
0091 
0092     free(line);
0093     close(fd);
0094 
0095 #else
0096     std::string line;
0097     int lines = 0;
0098 
0099     std::ifstream fstream(QFile::encodeName(result->inputUrl()).constData());
0100     if (!fstream.is_open()) {
0101         return;
0102     }
0103 
0104     result->addType(Type::Text);
0105     if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
0106         return;
0107     }
0108 
0109     QTextCodec* codec = QTextCodec::codecForLocale();
0110     while (std::getline(fstream, line)) {
0111         QByteArray arr = QByteArray::fromRawData(line.c_str(), line.size());
0112 
0113         QTextCodec::ConverterState state;
0114         QString text = codec->toUnicode(arr.constData(), arr.size(), &state);
0115 
0116         if (state.invalidChars > 0) {
0117             qDebug() << "Invalid encoding. Ignoring" << result->inputUrl();
0118             return;
0119         }
0120         result->append(text);
0121 
0122         lines += 1;
0123     }
0124 
0125     result->add(Property::LineCount, lines);
0126 #endif
0127 }
0128 
0129 #include "moc_plaintextextractor.cpp"