src/buffer/katetextloader.h

0001 /*
0002     SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org>
0003
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006
0007 #ifndef KATE_TEXTLOADER_H
0008 #define KATE_TEXTLOADER_H
0009
0010 #include <QCryptographicHash>
0011 #include <QFile>
0012 #include <QMimeDatabase>
0013 #include <QString>
0014 #include <QStringDecoder>
0015
0016 #include <KCompressionDevice>
0017 #include <KEncodingProber>
0018
0019 #include "katetextbuffer.h"
0020
0021 namespace Kate
0022 {
0023 /**
0024  * loader block size, load 256 kb at once per default
0025  * if file size is smaller, fall back to file size
0026  * must be a multiple of 2
0027  */
0028 static const qint64 KATE_FILE_LOADER_BS = 256 * 1024;
0029
0030 /**
0031  * File Loader, will handle reading of files + detecting encoding
0032  */
0033 class TextLoader
0034 {
0035 public:
0036     /**
0037      * Construct file loader for given file.
0038      * @param filename file to open
0039      * @param proberType prober type
0040      */
0041     TextLoader(const QString &filename, KEncodingProber::ProberType proberType)
0042         : m_eof(false) // default to not eof
0043         , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline
0044         , m_lastWasR(false) // we have not found a \r as last char
0045         , m_position(0)
0046         , m_lastLineStart(0)
0047         , m_eol(TextBuffer::eolUnknown) // no eol type detected atm
0048         , m_buffer(KATE_FILE_LOADER_BS, 0)
0049         , m_digest(QCryptographicHash::Sha1)
0050         , m_bomFound(false)
0051         , m_firstRead(true)
0052         , m_proberType(proberType)
0053         , m_fileSize(0)
0054     {
0055         // try to get mimetype for on the fly decompression, don't rely on filename!
0056         QFile testMime(filename);
0057         if (testMime.open(QIODevice::ReadOnly)) {
0058             m_fileSize = testMime.size();
0059         }
0060         m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(filename, &testMime).name();
0061
0062         // construct filter device
0063         KCompressionDevice::CompressionType compressionType = KCompressionDevice::compressionTypeForMimeType(m_mimeType);
0064         m_file = new KCompressionDevice(filename, compressionType);
0065     }
0066
0067     /**
0068      * Destructor
0069      */
0070     ~TextLoader()
0071     {
0072         delete m_file;
0073     }
0074
0075     /**
0076      * open file with given codec
0077      * @param codec codec to use, if 0, will do some auto-detect or fallback
0078      * @return success
0079      */
0080     bool open(const QString &codec)
0081     {
0082         m_codec = codec;
0083         m_eof = false;
0084         m_lastWasEndOfLine = true;
0085         m_lastWasR = false;
0086         m_position = 0;
0087         m_lastLineStart = 0;
0088         m_eol = TextBuffer::eolUnknown;
0089         m_text.clear();
0090         m_converterState = m_codec.isEmpty() ? QStringDecoder() : QStringDecoder(m_codec.toUtf8().constData());
0091         m_bomFound = false;
0092         m_firstRead = true;
0093
0094         // init the hash with the git header
0095         const QString header = QStringLiteral("blob %1").arg(m_fileSize);
0096         m_digest.reset();
0097         m_digest.addData(QByteArray(header.toLatin1() + '\0'));
0098
0099         // if already opened, close the file...
0100         if (m_file->isOpen()) {
0101             m_file->close();
0102         }
0103
0104         return m_file->open(QIODevice::ReadOnly);
0105     }
0106
0107     /**
0108      * end of file reached?
0109      * @return end of file reached
0110      */
0111     bool eof() const
0112     {
0113         return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length());
0114     }
0115
0116     /**
0117      * Detected end of line mode for this file.
0118      * Detected during reading, is valid after complete file is read.
0119      * @return eol mode of this file
0120      */
0121     TextBuffer::EndOfLineMode eol() const
0122     {
0123         return m_eol;
0124     }
0125
0126     /**
0127      * BOM found?
0128      * @return byte order mark found?
0129      */
0130     bool byteOrderMarkFound() const
0131     {
0132         return m_bomFound;
0133     }
0134
0135     /**
0136      * mime type used to create filter dev
0137      * @return mime-type of filter device
0138      */
0139     const QString &mimeTypeForFilterDev() const
0140     {
0141         return m_mimeType;
0142     }
0143
0144     /**
0145      * internal Unicode data array
0146      * @return internal Unicode data
0147      */
0148     const QChar *unicode() const
0149     {
0150         return m_text.unicode();
0151     }
0152
0153     /**
0154      * Get codec for this loader
0155      * @return currently in use codec of this loader
0156      */
0157     QString textCodec() const
0158     {
0159         return m_codec;
0160     }
0161
0162     /**
0163      * read a line, return length + offset in Unicode data
0164      * @param offset offset into internal Unicode data for read line
0165      * @param length length of read line
0166      * @return true if no encoding errors occurred
0167      */
0168     bool readLine(int &offset, int &length)
0169     {
0170         length = 0;
0171         offset = 0;
0172         bool encodingError = false;
0173
0174         static const QLatin1Char cr(QLatin1Char('\r'));
0175         static const QLatin1Char lf(QLatin1Char('\n'));
0176
0177         /**
0178          * did we read two time but got no stuff? encoding error
0179          * fixes problem with one character latin-1 files, which lead to crash otherwise!
0180          * bug 272579
0181          */
0182         bool failedToConvertOnce = false;
0183         /**
0184          * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true
0185          * BUG: 440359
0186          */
0187         bool bomPreviouslyFound = m_bomFound;
0188
0189         /**
0190          * reading loop
0191          */
0192         while (m_position <= m_text.length()) {
0193             if (m_position == m_text.length()) {
0194                 // try to load more text if something is around
0195                 if (!m_eof) {
0196                     // kill the old lines...
0197                     m_text.remove(0, m_lastLineStart);
0198
0199                     // try to read new data
0200                     const int c = m_file->read(m_buffer.data(), m_buffer.size());
0201
0202                     // if any text is there, append it....
0203                     if (c > 0) {
0204                         // update hash sum
0205                         m_digest.addData(QByteArrayView(m_buffer.data(), c));
0206
0207                         // detect byte order marks & codec for byte order marks on first read
0208                         if (m_firstRead) {
0209                             /**
0210                              * if no codec given, do autodetection
0211                              */
0212                             if (!m_converterState.isValid()) {
0213                                 /**
0214                                  * first: try to get HTML header encoding, includes BOM handling
0215                                  */
0216                                 m_converterState = QStringDecoder::decoderForHtml(m_buffer);
0217
0218                                 /**
0219                                  * else: use KEncodingProber
0220                                  */
0221                                 if (!m_converterState.isValid()) {
0222                                     KEncodingProber prober(m_proberType);
0223                                     prober.feed(m_buffer.constData(), c);
0224
0225                                     // we found codec with some confidence?
0226                                     if (prober.confidence() > 0.5) {
0227                                         m_converterState = QStringDecoder(prober.encoding().constData());
0228                                     }
0229                                 }
0230
0231                                 // no codec, no chance, encoding error, else remember the codec name
0232                                 if (!m_converterState.isValid()) {
0233                                     return false;
0234                                 }
0235                             }
0236
0237                             // we want to convert the bom for later detection
0238                             m_converterState = QStringDecoder(m_converterState.name(), QStringConverter::Flag::ConvertInitialBom);
0239
0240                             // remember name, might have changed
0241                             m_codec = QString::fromUtf8(m_converterState.name());
0242                         }
0243
0244                         // detect broken encoding
0245                         Q_ASSERT(m_converterState.isValid());
0246                         const QString unicode = m_converterState.decode(QByteArrayView(m_buffer.data(), c));
0247                         encodingError = encodingError || m_converterState.hasError();
0248
0249                         // check and remove bom
0250                         if (m_firstRead && !unicode.isEmpty() && (unicode.front() == QChar::ByteOrderMark || unicode.front() == QChar::ByteOrderSwapped)) {
0251                             m_bomFound = true;
0252                             m_text.append(QStringView(unicode).last(unicode.size() - 1));
0253
0254                             // swapped BOM is encoding error
0255                             encodingError = encodingError || unicode.front() == QChar::ByteOrderSwapped;
0256                         } else {
0257                             m_text.append(unicode);
0258                         }
0259                         m_firstRead = false;
0260                     }
0261
0262                     // is file completely read ?
0263                     m_eof = (c == -1) || (c == 0);
0264
0265                     // recalc current pos and last pos
0266                     m_position -= m_lastLineStart;
0267                     m_lastLineStart = 0;
0268                 }
0269
0270                 // oh oh, end of file, escape !
0271                 if (m_eof && (m_position == m_text.length())) {
0272                     m_lastWasEndOfLine = false;
0273
0274                     // line data
0275                     offset = m_lastLineStart;
0276                     length = m_position - m_lastLineStart;
0277
0278                     m_lastLineStart = m_position;
0279
0280                     return !encodingError && !failedToConvertOnce;
0281                 }
0282
0283                 // empty? try again
0284                 if (m_position == m_text.length()) {
0285                     if (!bomPreviouslyFound && m_bomFound) {
0286                         // BOM was processed above, so we didn't fail to convert
0287                         bomPreviouslyFound = true;
0288                     } else {
0289                         failedToConvertOnce = true;
0290                     }
0291                     continue;
0292                 }
0293             }
0294             for (; m_position < m_text.length(); m_position++) {
0295                 QChar current_char = m_text.at(m_position);
0296                 if (current_char == lf) {
0297                     m_lastWasEndOfLine = true;
0298
0299                     if (m_lastWasR) {
0300                         m_lastLineStart++;
0301                         m_lastWasR = false;
0302                         m_eol = TextBuffer::eolDos;
0303                     } else {
0304                         // line data
0305                         offset = m_lastLineStart;
0306                         length = m_position - m_lastLineStart;
0307
0308                         m_lastLineStart = m_position + 1;
0309                         m_position++;
0310
0311                         // only win, if not dos!
0312                         if (m_eol != TextBuffer::eolDos) {
0313                             m_eol = TextBuffer::eolUnix;
0314                         }
0315
0316                         return !encodingError;
0317                     }
0318                 } else if (current_char == cr) {
0319                     m_lastWasEndOfLine = true;
0320                     m_lastWasR = true;
0321
0322                     // line data
0323                     offset = m_lastLineStart;
0324                     length = m_position - m_lastLineStart;
0325
0326                     m_lastLineStart = m_position + 1;
0327                     m_position++;
0328
0329                     // should only win of first time!
0330                     if (m_eol == TextBuffer::eolUnknown) {
0331                         m_eol = TextBuffer::eolMac;
0332                     }
0333
0334                     return !encodingError;
0335                 } else if (current_char == QChar::LineSeparator) {
0336                     m_lastWasEndOfLine = true;
0337
0338                     // line data
0339                     offset = m_lastLineStart;
0340                     length = m_position - m_lastLineStart;
0341
0342                     m_lastLineStart = m_position + 1;
0343                     m_position++;
0344
0345                     return !encodingError;
0346                 } else {
0347                     m_lastWasEndOfLine = false;
0348                     m_lastWasR = false;
0349                 }
0350             }
0351         }
0352
0353         return !encodingError;
0354     }
0355
0356     QByteArray digest()
0357     {
0358         return m_digest.result();
0359     }
0360
0361 private:
0362     QString m_codec;
0363     bool m_eof;
0364     bool m_lastWasEndOfLine;
0365     bool m_lastWasR;
0366     int m_position;
0367     int m_lastLineStart;
0368     TextBuffer::EndOfLineMode m_eol;
0369     QString m_mimeType;
0370     QIODevice *m_file;
0371     QByteArray m_buffer;
0372     QCryptographicHash m_digest;
0373     QString m_text;
0374     QStringDecoder m_converterState;
0375     bool m_bomFound;
0376     bool m_firstRead;
0377     KEncodingProber::ProberType m_proberType;
0378     quint64 m_fileSize;
0379 };
0380
0381 }
0382
0383 #endif