File indexing completed on 2024-09-08 05:08:15

0001 // clang-format off
0002 /*
0003  * KDiff3 - Text Diff And Merge Tool
0004  *
0005  * SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de
0006  * SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com
0007  * SPDX-License-Identifier: GPL-2.0-or-later
0008  */
0009 // clang-format on
0010 
0011 /* Features of class SourceData:
0012 - Read a file (from the given URL) or accept data via a string.
0013 - Allocate and free buffers as necessary.
0014 - Run a preprocessor, when specified.
0015 - Run the line-matching preprocessor, when specified.
0016 - Run other preprocessing steps: Uppercase, ignore comments,
0017                                  remove carriage return, ignore numbers.
0018 
0019 Order of operation:
0020  1. If data was given via a string then save it to a temp file. (see setData())
0021  2. If the specified file is nonlocal (URL) copy it to a temp file. (TODO revisit this)
0022  3. If a preprocessor was specified, run the input file through it.
0023  4. Read the output of the preprocessor.
0024  5. If Uppercase was specified: Turn the read data to uppercase.
0025  6. Write the result to a temp file.
0026  7. If a line-matching preprocessor was specified, run the temp file through it.
0027  8. Read the output of the line-matching preprocessor.
0028  9. If ignore numbers was specified, strip the LMPP-output of all numbers.
0029 10. If ignore comments was specified, strip the LMPP-output of comments.
0030 
0031 Optimizations: Skip unneeded steps.
0032 */
0033 #include "SourceData.h"
0034 
0035 #include "CommentParser.h"
0036 #include "compat.h"
0037 #include "diff.h"
0038 #include "EncodedDataStream.h"
0039 #include "LineRef.h"
0040 #include "Logging.h"
0041 #include "Utils.h"
0042 
0043 #include <algorithm>         // for min
0044 #include <memory>
0045 #include <optional>
0046 #include <vector>            // for vector
0047 
0048 #include <QtGlobal>
0049 
0050 #include <QByteArray>
0051 #include <QProcess>
0052 #include <QString>
0053 #include <QTemporaryFile>
0054 #include <QTextCodec>
0055 #include <QVector>
0056 
0057 extern std::unique_ptr<Options> gOptions;
0058 
0059 void SourceData::reset()
0060 {
0061     mFromClipBoard = false;
0062     mEncoding = u8"UTF-8";
0063     m_fileAccess = FileAccess();
0064     m_normalData.reset();
0065     m_lmppData.reset();
0066     if(!m_tempInputFileName.isEmpty())
0067     {
0068         m_tempFile.remove();
0069         m_tempInputFileName = "";
0070     }
0071 
0072     mErrors.clear();
0073 }
0074 
0075 void SourceData::setFilename(const QString& filename)
0076 {
0077     if(filename.isEmpty())
0078     {
0079         reset();
0080     }
0081     else
0082     {
0083         setFileAccess(FileAccess(filename));
0084     }
0085 }
0086 
0087 bool SourceData::isEmpty() const
0088 {
0089     return getFilename().isEmpty();
0090 }
0091 
0092 bool SourceData::hasData() const
0093 {
0094     return m_normalData.m_pBuf != nullptr;
0095 }
0096 
0097 bool SourceData::isValid() const
0098 {
0099     return isEmpty() || hasData();
0100 }
0101 
0102 QString SourceData::getFilename() const
0103 {
0104     return m_fileAccess.absoluteFilePath();
0105 }
0106 
0107 QString SourceData::getAliasName() const
0108 {
0109     return m_aliasName.isEmpty() ? m_fileAccess.prettyAbsPath() : m_aliasName;
0110 }
0111 
0112 void SourceData::setAliasName(const QString& name)
0113 {
0114     m_aliasName = name;
0115 }
0116 
0117 void SourceData::setFileAccess(const FileAccess& fileAccess)
0118 {
0119     mFromClipBoard = false;
0120 
0121     m_fileAccess = fileAccess;
0122     m_aliasName = QString();
0123     if(!m_tempInputFileName.isEmpty())
0124     {
0125         m_tempFile.remove();
0126         m_tempInputFileName = "";
0127     }
0128 
0129     mErrors.clear();
0130 }
0131 
0132 void SourceData::setEncoding(const char* encoding)
0133 {
0134     mEncoding = encoding;
0135 }
0136 
0137 void SourceData::setData(const QString& data)
0138 {
0139     mErrors.clear();
0140     // Create a temp file for preprocessing:
0141     if(m_tempInputFileName.isEmpty())
0142     {
0143         FileAccess::createTempFile(m_tempFile);
0144         m_tempInputFileName = m_tempFile.fileName();
0145     }
0146     m_fileAccess = FileAccess(m_tempInputFileName);
0147     QByteArray ba = QTextCodec::codecForName("UTF-8")->fromUnicode(data);
0148     bool bSuccess = m_fileAccess.writeFile(ba.constData(), ba.length());
0149     if(!bSuccess)
0150     {
0151         mErrors.append(i18n("Writing clipboard data to temp file failed."));
0152         return;
0153     }
0154     else
0155     {
0156         m_aliasName = i18n("From Clipboard");
0157         mFromClipBoard = true;
0158     }
0159 }
0160 
0161 const std::shared_ptr<LineDataVector>& SourceData::getLineDataForDiff() const
0162 {
0163     if(m_lmppData.m_pBuf == nullptr)
0164     {
0165         return m_normalData.m_v;
0166     }
0167     else
0168     {
0169         return m_lmppData.m_v;
0170     }
0171 }
0172 
0173 const std::shared_ptr<LineDataVector>& SourceData::getLineDataForDisplay() const
0174 {
0175     return m_normalData.m_v;
0176 }
0177 
0178 LineType SourceData::getSizeLines() const
0179 {
0180     return SafeInt<LineType>(m_normalData.lineCount());
0181 }
0182 
0183 qint64 SourceData::getSizeBytes() const
0184 {
0185     return m_normalData.byteCount();
0186 }
0187 
0188 const char* SourceData::getBuf() const
0189 {
0190     return m_normalData.m_pBuf.get();
0191 }
0192 
0193 const QString& SourceData::getText() const
0194 {
0195     return *m_normalData.m_unicodeBuf;
0196 }
0197 
0198 bool SourceData::isText() const
0199 {
0200     return m_normalData.isText() || m_normalData.isEmpty();
0201 }
0202 
0203 bool SourceData::isIncompleteConversion() const
0204 {
0205     return m_normalData.m_bIncompleteConversion;
0206 }
0207 
0208 bool SourceData::isFromBuffer() const
0209 {
0210     return mFromClipBoard;
0211 }
0212 
0213 bool SourceData::isBinaryEqualWith(const QSharedPointer<SourceData>& other) const
0214 {
0215     return m_fileAccess.exists() && other->m_fileAccess.exists() &&
0216            getSizeBytes() == other->getSizeBytes() &&
0217            (getSizeBytes() == 0 || memcmp(getBuf(), other->getBuf(), getSizeBytes()) == 0);
0218 }
0219 
0220 /*
0221     Warning: Do not call this function without re-running the comparison or
0222     otherwise resetting the DiffTextWindows as these store a pointer to the file
0223     data stored here.
0224 */
0225 void SourceData::FileData::reset()
0226 {
0227     m_pBuf.reset();
0228     m_v->clear();
0229     mDataSize = 0;
0230     mLineCount = 0;
0231     m_bIsText = false;
0232     m_bIncompleteConversion = false;
0233     m_eLineEndStyle = eLineEndStyleUndefined;
0234 }
0235 
0236 bool SourceData::FileData::readFile(FileAccess& file)
0237 {
0238     reset();
0239     if(file.fileName().isEmpty())
0240     {
0241         return true;
0242     }
0243 
0244     if(!file.isNormal())
0245         return true;
0246 
0247     mDataSize = file.sizeForReading();
0248     /*
0249         If the extra bytes are removed an unknown heap currption issue is triggered in the
0250         diff code. I don't have time to track this down to its true root cause.
0251     */
0252     m_pBuf = std::make_unique<char[]>(mDataSize + 100); // Alloc 100 byte extra: Safety hack, not nice but does no harm.
0253                                                         // Some extra bytes at the end of the buffer are needed by
0254                                                         // the diff algorithm. See also GnuDiff::diff_2_files().
0255     bool bSuccess = file.readFile(m_pBuf.get(), mDataSize);
0256     if(!bSuccess)
0257     {
0258         m_pBuf = nullptr;
0259         mDataSize = 0;
0260     }
0261     else
0262     {
0263         //null terminate buffer
0264         m_pBuf[mDataSize + 1] = 0;
0265         m_pBuf[mDataSize + 2] = 0;
0266         m_pBuf[mDataSize + 3] = 0;
0267         m_pBuf[mDataSize + 4] = 0;
0268     }
0269     return bSuccess;
0270 }
0271 
0272 bool SourceData::FileData::readFile(const QString& filename)
0273 {
0274     reset();
0275     if(filename.isEmpty())
0276     {
0277         return true;
0278     }
0279 
0280     FileAccess fa(filename);
0281     return readFile(fa);
0282 }
0283 
0284 bool SourceData::saveNormalDataAs(const QString& fileName)
0285 {
0286     return m_normalData.writeFile(fileName);
0287 }
0288 
0289 bool SourceData::FileData::writeFile(const QString& filename)
0290 {
0291     if(filename.isEmpty())
0292     {
0293         return true;
0294     }
0295 
0296     FileAccess fa(filename);
0297     bool bSuccess = fa.writeFile(m_pBuf.get(), mDataSize);
0298     return bSuccess;
0299 }
0300 
0301 //Deprecated
0302 void SourceData::FileData::copyBufFrom(const FileData& src) //TODO: Remove me.
0303 {
0304     reset();
0305     mDataSize = src.mDataSize;
0306     m_pBuf = std::make_unique<char[]>(mDataSize + 100);
0307     assert(src.m_pBuf != nullptr);
0308     memcpy(m_pBuf.get(), src.m_pBuf.get(), mDataSize);
0309 }
0310 
0311 std::optional<const char*> SourceData::detectEncoding(const QString& fileName)
0312 {
0313     QFile f(fileName);
0314     if(f.open(QIODevice::ReadOnly))
0315     {
0316         char buf[400];
0317 
0318         qint64 size = f.read(buf, sizeof(buf));
0319         FileOffset skipBytes = 0;
0320         return detectEncoding(buf, size, skipBytes);
0321     }
0322     return {};
0323 }
0324 
0325 void SourceData::readAndPreprocess(const char* encoding, bool bAutoDetect)
0326 {
0327     QTemporaryFile fileIn1, fileOut1;
0328     QString fileNameIn1;
0329     QString fileNameOut1;
0330     QString fileNameIn2;
0331     QString fileNameOut2;
0332 
0333     mEncoding = encoding;
0334 
0335     // Detect the input for the preprocessing operations
0336     if(!mFromClipBoard)
0337     {
0338         //Routine result of directory compare finding a file that isn't in all locations.
0339         if(!m_fileAccess.isValid()) return;
0340 
0341         assert(!m_fileAccess.exists() || !m_fileAccess.isDir());
0342         if(!m_fileAccess.isNormal())
0343         {
0344             mErrors.append(i18n("%1 is not a normal file.", m_fileAccess.prettyAbsPath()));
0345             return;
0346         }
0347 
0348         if(m_fileAccess.isLocal())
0349         {
0350             fileNameIn1 = m_fileAccess.absoluteFilePath();
0351         }
0352         else // File is not local: create a temporary local copy:
0353         {
0354             if(m_tempInputFileName.isEmpty())
0355             {
0356                 m_fileAccess.createLocalCopy();
0357                 m_tempInputFileName = m_fileAccess.getTempName();
0358             }
0359 
0360             fileNameIn1 = m_tempInputFileName;
0361         }
0362         if(bAutoDetect)
0363         {
0364             mEncoding = detectEncoding(fileNameIn1).value_or(encoding);
0365         }
0366     }
0367     else // The input was set via setData(), probably from clipboard.
0368     {
0369         /*
0370             Used to happen during early startup this is now a bug.
0371         */
0372         assert(!m_tempInputFileName.isEmpty());
0373 
0374         fileNameIn1 = m_tempInputFileName;
0375         mEncoding = "UTF-8";
0376     }
0377     const char* pEncoding1 = getEncoding();
0378     const char* pEncoding2 = getEncoding();
0379     const QString overSizedFile = i18nc("Error message. %1 = filepath", "File %1 too large to process. Skipping.", fileNameIn1);
0380 
0381     m_normalData.reset();
0382     m_lmppData.reset();
0383 
0384     FileAccess faIn(fileNameIn1);
0385     qint64 fileInSize = faIn.size();
0386 
0387     if(faIn.exists() && !faIn.isBrokenLink())
0388     {
0389         try
0390         {
0391             // Run the first preprocessor
0392             if(gOptions->m_PreProcessorCmd.isEmpty())
0393             {
0394                 // No preprocessing: Read the file directly:
0395                 if(!m_normalData.readFile(faIn))
0396                 {
0397                     mErrors.append(faIn.getStatusText());
0398                     return;
0399                 }
0400             }
0401             else
0402             {
0403                 unsigned char b;
0404                 //Don't fail the preprocessor command if the file can't be read.
0405                 if(!faIn.readFile(&b, 1))
0406                 {
0407                     mErrors.append(faIn.getStatusText());
0408                     mErrors.append(i18n("    Temp file is: %1", fileNameIn1));
0409                     return;
0410                 }
0411 
0412                 QTemporaryFile tmpInPPFile;
0413                 QString fileNameInPP = fileNameIn1;
0414 
0415                 if(pEncoding1 != gOptions->mEncodingPP)
0416                 {
0417                     // Before running the preprocessor convert to the format that the preprocessor expects.
0418                     FileAccess::createTempFile(tmpInPPFile);
0419                     fileNameInPP = tmpInPPFile.fileName();
0420                     pEncoding1 = gOptions->mEncodingPP;
0421                     convertFileEncoding(fileNameIn1, encoding, fileNameInPP, pEncoding1);
0422                 }
0423 
0424                 QString ppCmd = gOptions->m_PreProcessorCmd;
0425                 FileAccess::createTempFile(fileOut1);
0426                 fileNameOut1 = fileOut1.fileName();
0427 
0428                 QProcess ppProcess;
0429                 ppProcess.setStandardInputFile(fileNameInPP);
0430                 ppProcess.setStandardOutputFile(fileNameOut1);
0431                 QString program;
0432                 QStringList args;
0433                 QString errorReason = Utils::getArguments(ppCmd, program, args);
0434                 if(errorReason.isEmpty())
0435                 {
0436                     ppProcess.start(program, args);
0437                     ppProcess.waitForFinished(-1);
0438                 }
0439                 else
0440                     errorReason = "\n(" + errorReason + ')';
0441 
0442                 bool bSuccess = errorReason.isEmpty() && m_normalData.readFile(fileNameOut1);
0443                 if(fileInSize > 0 && (!bSuccess || m_normalData.byteCount() == 0))
0444                 {
0445                     mErrors.append(
0446                         i18n("Preprocessing possibly failed. Check this command:\n\n  %1"
0447                              "\n\nThe preprocessing command will be disabled now.",
0448                              ppCmd) +
0449                         errorReason);
0450                     gOptions->m_PreProcessorCmd = "";
0451 
0452                     pEncoding1 = getEncoding();
0453                 }
0454             }
0455         }
0456         catch(const std::bad_alloc&)
0457         {
0458             m_normalData.reset();
0459             mErrors.append(overSizedFile);
0460             return;
0461         }
0462 
0463         if(!m_normalData.preprocess(pEncoding1, false))
0464         {
0465             mErrors.append(overSizedFile);
0466             return;
0467         }
0468         //exit early for non text data further processing assumes a text file as input
0469         if(!m_normalData.isText())
0470             return;
0471 
0472         // LineMatching Preprocessor
0473         if(!gOptions->m_LineMatchingPreProcessorCmd.isEmpty())
0474         {
0475             QTemporaryFile tempOut2, fileInPP;
0476             fileNameIn2 = fileNameOut1.isEmpty() ? fileNameIn1 : fileNameOut1;
0477             QString fileNameInPP = fileNameIn2;
0478             pEncoding2 = pEncoding1;
0479             if(pEncoding2 != gOptions->mEncodingPP)
0480             {
0481                 // Before running the preprocessor convert to the format that the preprocessor expects.
0482                 FileAccess::createTempFile(fileInPP);
0483                 fileNameInPP = fileInPP.fileName();
0484                 pEncoding2 = gOptions->mEncodingPP;
0485                 convertFileEncoding(fileNameIn2, pEncoding1, fileNameInPP, pEncoding2);
0486             }
0487 
0488             QString ppCmd = gOptions->m_LineMatchingPreProcessorCmd;
0489             FileAccess::createTempFile(tempOut2);
0490             fileNameOut2 = tempOut2.fileName();
0491             QProcess ppProcess;
0492             ppProcess.setStandardInputFile(fileNameInPP);
0493             ppProcess.setStandardOutputFile(fileNameOut2);
0494             QString program;
0495             QStringList args;
0496             QString errorReason = Utils::getArguments(ppCmd, program, args);
0497             if(errorReason.isEmpty())
0498             {
0499                 ppProcess.start(program, args);
0500                 ppProcess.waitForFinished(-1);
0501             }
0502             else
0503                 errorReason = "\n(" + errorReason + ')';
0504 
0505             bool bSuccess = errorReason.isEmpty() && m_lmppData.readFile(fileNameOut2);
0506             if(FileAccess(fileNameIn2).size() > 0 && (!bSuccess || m_lmppData.byteCount() == 0))
0507             {
0508                 mErrors.append(
0509                     i18n("The line-matching-preprocessing possibly failed. Check this command:\n\n  %1"
0510                          "\n\nThe line-matching-preprocessing command will be disabled now.", ppCmd) +
0511                     errorReason);
0512                 gOptions->m_LineMatchingPreProcessorCmd = "";
0513                 if(!m_lmppData.readFile(fileNameIn2))
0514                 {
0515                     mErrors.append(i18nc("Read error message. %1 = filepath", "Failed to read file: %1", fileNameIn2));
0516                     return;
0517                 }
0518             }
0519         }
0520         else if(gOptions->ignoreComments() || gOptions->m_bIgnoreCase)
0521         {
0522             // We need a copy of the normal data.
0523             m_lmppData.copyBufFrom(m_normalData);
0524         }
0525     }
0526     else
0527     {
0528         //exit early for nonexistent files
0529         return;
0530     }
0531 
0532     if(!m_lmppData.preprocess(pEncoding2, true))
0533     {
0534         mErrors.append(overSizedFile);
0535         return;
0536     }
0537 
0538     assert(m_lmppData.isText());
0539     //TODO: Needed?
0540     if(m_lmppData.lineCount() < m_normalData.lineCount())
0541     {
0542         // Preprocessing command may result in smaller data buffer so adjust size
0543         for(qint64 i = m_lmppData.lineCount(); i < m_normalData.lineCount(); ++i)
0544         { // Set all empty lines to point to the end of the buffer.
0545             m_lmppData.m_v->push_back(LineData(m_lmppData.m_unicodeBuf, m_lmppData.m_unicodeBuf->length()));
0546         }
0547 
0548         m_lmppData.mLineCount = m_normalData.lineCount();
0549     }
0550 
0551     // Ignore comments
0552     if(gOptions->ignoreComments() && hasData())
0553     {
0554         qint64 vSize = std::min(m_normalData.lineCount(), m_lmppData.lineCount());
0555 
0556         for(qint64 i = 0; i < vSize; ++i)
0557         {
0558             //TODO: Phase this out. We should not be messing with these flags outside the parser.
0559             (*m_normalData.m_v)[i].setPureComment((*m_lmppData.m_v)[i].isPureComment());
0560             (*m_normalData.m_v)[i].setSkipable((*m_lmppData.m_v)[i].isSkipable());
0561         }
0562     }
0563 }
0564 
0565 /** Prepare the linedata vector for every input line.*/
0566 bool SourceData::FileData::preprocess(const QByteArray& encoding, bool removeComments)
0567 {
0568     if(m_pBuf == nullptr)
0569         return true;
0570 
0571     QString line;
0572     QChar curChar, prevChar = '\0';
0573     LineType lines = 0;
0574     QtSizeType lastOffset = 0;
0575     FileOffset skipBytes = 0;
0576     std::unique_ptr<CommentParser> parser(new DefaultCommentParser());
0577 
0578     // detect line end style
0579     QVector<e_LineEndStyle> vOrigDataLineEndStyle;
0580     m_eLineEndStyle = eLineEndStyleUndefined;
0581 
0582     QByteArray pCodec = detectEncoding(m_pBuf.get(), mDataSize, skipBytes).value_or(encoding);
0583     if(pCodec != encoding)
0584         skipBytes = 0;
0585 
0586     if(mDataSize - skipBytes > limits<qint32>::max())
0587     {
0588         reset();
0589         return false;
0590     }
0591 
0592     try
0593     {
0594         const QByteArray ba = QByteArray::fromRawData(m_pBuf.get() + skipBytes, (QtSizeType)(mDataSize - skipBytes));
0595         EncodedDataStream ds(ba);
0596 
0597         mHasBOM = skipBytes != 0;
0598         ds.setEncoding(encoding);
0599         ds.setGenerateByteOrderMark(skipBytes != 0);
0600 
0601         m_bIncompleteConversion = false;
0602         m_unicodeBuf->clear();
0603 
0604         assert(m_unicodeBuf->length() == 0);
0605 
0606         mHasEOLTermination = false;
0607         bool skipNextRead = false;
0608         while(!ds.atEnd())
0609         {
0610             line.clear();
0611             if(lines >= limits<LineType>::max() - 5)
0612             {
0613                 reset();
0614                 return false;
0615             }
0616 
0617             if(!skipNextRead)
0618             {
0619                 prevChar = curChar;
0620                 ds.readChar(curChar);
0621             }
0622             else
0623                 skipNextRead = false;
0624 
0625             QtSizeType firstNonwhite = 0;
0626             bool foundNonWhite = false;
0627 
0628             while(curChar != '\n' && curChar != '\r')
0629             {
0630                 if(curChar.isNull() || curChar.isNonCharacter())
0631                 {
0632                     m_v->clear();
0633                     return true;
0634                 }
0635 
0636                 if(curChar == QChar::ReplacementCharacter)
0637                     m_bIncompleteConversion = true;
0638 
0639                 line.append(curChar);
0640                 if(!curChar.isSpace() && !foundNonWhite)
0641                 {
0642                     firstNonwhite = line.length();
0643                     foundNonWhite = true;
0644                 }
0645 
0646                 if(ds.atEnd())
0647                     break;
0648 
0649                 prevChar = curChar;
0650                 ds.readChar(curChar);
0651             }
0652 
0653             switch(curChar.unicode())
0654             {
0655                 case '\n':
0656                     vOrigDataLineEndStyle.push_back(eLineEndStyleUnix);
0657                     break;
0658                 case '\r':
0659                     if((FileOffset)lastOffset < mDataSize)
0660                     {
0661                         prevChar = curChar;
0662                         ds.readChar(curChar);
0663 
0664                         if(curChar == '\n')
0665                         {
0666                             vOrigDataLineEndStyle.push_back(eLineEndStyleDos);
0667                             break;
0668                         }
0669                         //work around for lack of seek API in QDataStream
0670                         skipNextRead = true;
0671                     }
0672 
0673                     //old mac style ending.
0674                     vOrigDataLineEndStyle.push_back(eLineEndStyleOldMac);
0675                     break;
0676             }
0677             parser->processLine(line);
0678             if(removeComments)
0679                 parser->removeComment(line);
0680             //Qt6 intrudes 64bit sizes
0681             if(line.size() >= limits<LineType>::max())
0682             {
0683                 reset();
0684                 return false;
0685             }
0686 
0687             ++lines;
0688             m_v->push_back(LineData(m_unicodeBuf, lastOffset, line.length(), firstNonwhite, parser->isSkipable(), parser->isPureComment()));
0689             //The last line may not have an EOL mark. In that case don't add one to our buffer.
0690             m_unicodeBuf->append(line);
0691             if(curChar == '\n' || curChar == '\r' || prevChar == '\r')
0692             {
0693                 //kdiff3 internally uses only unix style endings for simplicity.
0694                 m_unicodeBuf->append('\n');
0695             }
0696 
0697             assert(m_unicodeBuf->length() != lastOffset);
0698             lastOffset = m_unicodeBuf->length();
0699         }
0700 
0701         /*
0702             Process trailing new line as if there were a blank non-terminated line after it.
0703             But do nothing to the data buffer since this is a phantom line needed for internal purposes.
0704         */
0705         if(curChar == '\n' || curChar == '\r')
0706         {
0707             mHasEOLTermination = true;
0708             ++lines;
0709 
0710             parser->processLine("");
0711             m_v->push_back(LineData(m_unicodeBuf, lastOffset, 0, 0, parser->isSkipable(), parser->isPureComment()));
0712         }
0713 
0714         m_v->push_back(LineData(m_unicodeBuf, lastOffset));
0715 
0716         m_bIsText = true;
0717 
0718         if(!vOrigDataLineEndStyle.isEmpty())
0719             m_eLineEndStyle = vOrigDataLineEndStyle[0];
0720 
0721         mLineCount = lines;
0722         return true;
0723     }
0724     catch(const std::bad_alloc&)
0725     {
0726         reset();
0727         return false;
0728     }
0729 }
0730 
0731 // Convert the input file from input encoding to output encoding and write it to the output file.
0732 bool SourceData::convertFileEncoding(const QString& fileNameIn, const QByteArray& pCodecIn,
0733                                      const QString& fileNameOut, const QByteArray& pCodecOut)
0734 {
0735     QFile in(fileNameIn);
0736     if(!in.open(QIODevice::ReadOnly))
0737         return false;
0738     EncodedDataStream inStream(&in);
0739     inStream.setEncoding(pCodecIn);
0740 
0741     QFile out(fileNameOut);
0742     if(!out.open(QIODevice::WriteOnly))
0743         return false;
0744     EncodedDataStream outStream(&out);
0745     outStream.setEncoding(pCodecOut);
0746 
0747     QString data;
0748     while(!inStream.atEnd())
0749     {
0750         QChar c;
0751         inStream.readChar(c);
0752         data += c;
0753     }
0754     outStream << data;
0755 
0756     return true;
0757 }
0758 
0759 std::optional<const char*> SourceData::getEncodingFromTag(const QByteArray& s, const QByteArray& encodingTag)
0760 {
0761     QtSizeType encodingPos = s.indexOf(encodingTag);
0762     if(encodingPos >= 0)
0763     {
0764         QtSizeType apostrophPos = s.indexOf('"', encodingPos + encodingTag.length());
0765         QtSizeType apostroph2Pos = s.indexOf('\'', encodingPos + encodingTag.length());
0766         char apostroph = '"';
0767         if(apostroph2Pos >= 0 && (apostrophPos < 0 || apostroph2Pos < apostrophPos))
0768         {
0769             apostroph = '\'';
0770             apostrophPos = apostroph2Pos;
0771         }
0772 
0773         QtSizeType encodingEnd = s.indexOf(apostroph, apostrophPos + 1);
0774         if(encodingEnd >= 0) // e.g.: <meta charset="utf-8"> or <?xml version="1.0" encoding="ISO-8859-1"?>
0775         {
0776             QByteArray encoding = s.mid(apostrophPos + 1, encodingEnd - (apostrophPos + 1));
0777             if(QTextCodec::codecForName(encoding))
0778                 return encoding;
0779         }
0780         else // e.g.: <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
0781         {
0782             QByteArray encoding = s.mid(encodingPos + encodingTag.length(), apostrophPos - (encodingPos + encodingTag.length()));
0783             if(QTextCodec::codecForName(encoding))
0784                 return encoding;
0785         }
0786     }
0787     return {};
0788 }
0789 
0790 std::optional<const char*> SourceData::detectEncoding(const char* buf, qint64 size, FileOffset& skipBytes)
0791 {
0792     if(size >= 2)
0793     {
0794         if(buf[0] == '\xFF' && buf[1] == '\xFE')
0795         {
0796             skipBytes = 2;
0797             return "UTF-16LE";
0798         }
0799 
0800         if(buf[0] == '\xFE' && buf[1] == '\xFF')
0801         {
0802             skipBytes = 2;
0803             return "UTF-16BE";
0804         }
0805     }
0806     if(size >= 3)
0807     {
0808         if(buf[0] == '\xEF' && buf[1] == '\xBB' && buf[2] == '\xBF')
0809         {
0810             skipBytes = 3;
0811             return "UTF-8-BOM";
0812         }
0813     }
0814     skipBytes = 0;
0815     QByteArray s;
0816     /*
0817         We don't need the whole file here just the header.
0818     */
0819     if(size <= 5000)
0820         s = QByteArray(buf, (QtSizeType)size);
0821     else
0822         s = QByteArray(buf, 5000);
0823 
0824     QtSizeType xmlHeaderPos = s.indexOf("<?xml");
0825     if(xmlHeaderPos >= 0)
0826     {
0827         QtSizeType xmlHeaderEnd = s.indexOf("?>", xmlHeaderPos);
0828         if(xmlHeaderEnd >= 0)
0829         {
0830             std::optional<const char*> encoding = getEncodingFromTag(s.mid(xmlHeaderPos, xmlHeaderEnd - xmlHeaderPos), "encoding=");
0831             if(encoding.has_value())
0832                 return encoding;
0833         }
0834     }
0835     else // HTML
0836     {
0837         QtSizeType metaHeaderPos = s.indexOf("<meta");
0838         while(metaHeaderPos >= 0)
0839         {
0840             QtSizeType metaHeaderEnd = s.indexOf(">", metaHeaderPos);
0841             if(metaHeaderEnd >= 0)
0842             {
0843                 std::optional<const char*> encoding = getEncodingFromTag(s.mid(metaHeaderPos, metaHeaderEnd - metaHeaderPos), "charset=");
0844                 if(encoding.has_value())
0845                     return encoding;
0846 
0847                 metaHeaderPos = s.indexOf("<meta", metaHeaderEnd);
0848             }
0849             else
0850                 break;
0851         }
0852     }
0853     //Attempt to detect non-bom UTF8. This is a very common encoding.
0854     return detectUTF8(s);
0855 }
0856 
0857 std::optional<const char*> SourceData::detectUTF8(const QByteArray& data)
0858 {
0859     QTextCodec* utf8 = QTextCodec::codecForName("UTF-8");
0860 
0861     QTextCodec::ConverterState state;
0862     utf8->toUnicode(data.constData(), SafeInt<qint32>(data.size()), &state);
0863 
0864     if(state.invalidChars == 0)
0865         for (qint32 i = 0; i < data.size()-state.remainingChars; i++)
0866             if ((unsigned)data.at(i) > 127)
0867                 return "UTF-8";
0868 
0869     return {};
0870 }