// File indexing completed on 2024-09-08 13:28:58
0001 // clang-format off 0002 /* 0003 * KDiff3 - Text Diff And Merge Tool 0004 * 0005 * SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de 0006 * SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com 0007 * SPDX-License-Identifier: GPL-2.0-or-later 0008 */ 0009 // clang-format on 0010 0011 /* Features of class SourceData: 0012 - Read a file (from the given URL) or accept data via a string. 0013 - Allocate and free buffers as necessary. 0014 - Run a preprocessor, when specified. 0015 - Run the line-matching preprocessor, when specified. 0016 - Run other preprocessing steps: Uppercase, ignore comments, 0017 remove carriage return, ignore numbers. 0018 0019 Order of operation: 0020 1. If data was given via a string then save it to a temp file. (see setData()) 0021 2. If the specified file is nonlocal (URL) copy it to a temp file. (TODO revisit this) 0022 3. If a preprocessor was specified, run the input file through it. 0023 4. Read the output of the preprocessor. 0024 5. If Uppercase was specified: Turn the read data to uppercase. 0025 6. Write the result to a temp file. 0026 7. If a line-matching preprocessor was specified, run the temp file through it. 0027 8. Read the output of the line-matching preprocessor. 0028 9. If ignore numbers was specified, strip the LMPP-output of all numbers. 0029 10. If ignore comments was specified, strip the LMPP-output of comments. 0030 0031 Optimizations: Skip unneeded steps. 
0032 */ 0033 #include "SourceData.h" 0034 0035 #include "CommentParser.h" 0036 #include "compat.h" 0037 #include "diff.h" 0038 #include "EncodedDataStream.h" 0039 #include "LineRef.h" 0040 #include "Logging.h" 0041 #include "Utils.h" 0042 0043 #include <algorithm> // for min 0044 #include <memory> 0045 #include <optional> 0046 #include <vector> // for vector 0047 0048 #include <QtGlobal> 0049 0050 #include <QByteArray> 0051 #include <QProcess> 0052 #include <QScopedPointer> 0053 #include <QString> 0054 #include <QTemporaryFile> 0055 #include <QTextCodec> 0056 #include <QVector> 0057 0058 extern std::unique_ptr<Options> gOptions; 0059 0060 void SourceData::reset() 0061 { 0062 mFromClipBoard = false; 0063 mEncoding = u8"UTF-8"; 0064 m_fileAccess = FileAccess(); 0065 m_normalData.reset(); 0066 m_lmppData.reset(); 0067 if(!m_tempInputFileName.isEmpty()) 0068 { 0069 m_tempFile.remove(); 0070 m_tempInputFileName = ""; 0071 } 0072 0073 mErrors.clear(); 0074 } 0075 0076 void SourceData::setFilename(const QString& filename) 0077 { 0078 if(filename.isEmpty()) 0079 { 0080 reset(); 0081 } 0082 else 0083 { 0084 setFileAccess(FileAccess(filename)); 0085 } 0086 } 0087 0088 bool SourceData::isEmpty() const 0089 { 0090 return getFilename().isEmpty(); 0091 } 0092 0093 bool SourceData::hasData() const 0094 { 0095 return m_normalData.m_pBuf != nullptr; 0096 } 0097 0098 bool SourceData::isValid() const 0099 { 0100 return isEmpty() || hasData(); 0101 } 0102 0103 QString SourceData::getFilename() const 0104 { 0105 return m_fileAccess.absoluteFilePath(); 0106 } 0107 0108 QString SourceData::getAliasName() const 0109 { 0110 return m_aliasName.isEmpty() ? 
m_fileAccess.prettyAbsPath() : m_aliasName; 0111 } 0112 0113 void SourceData::setAliasName(const QString& name) 0114 { 0115 m_aliasName = name; 0116 } 0117 0118 void SourceData::setFileAccess(const FileAccess& fileAccess) 0119 { 0120 mFromClipBoard = false; 0121 0122 m_fileAccess = fileAccess; 0123 m_aliasName = QString(); 0124 if(!m_tempInputFileName.isEmpty()) 0125 { 0126 m_tempFile.remove(); 0127 m_tempInputFileName = ""; 0128 } 0129 0130 mErrors.clear(); 0131 } 0132 0133 void SourceData::setEncoding(const char* encoding) 0134 { 0135 mEncoding = encoding; 0136 } 0137 0138 void SourceData::setData(const QString& data) 0139 { 0140 mErrors.clear(); 0141 // Create a temp file for preprocessing: 0142 if(m_tempInputFileName.isEmpty()) 0143 { 0144 FileAccess::createTempFile(m_tempFile); 0145 m_tempInputFileName = m_tempFile.fileName(); 0146 } 0147 m_fileAccess = FileAccess(m_tempInputFileName); 0148 QByteArray ba = QTextCodec::codecForName("UTF-8")->fromUnicode(data); 0149 bool bSuccess = m_fileAccess.writeFile(ba.constData(), ba.length()); 0150 if(!bSuccess) 0151 { 0152 mErrors.append(i18n("Writing clipboard data to temp file failed.")); 0153 return; 0154 } 0155 else 0156 { 0157 m_aliasName = i18n("From Clipboard"); 0158 mFromClipBoard = true; 0159 } 0160 } 0161 0162 const std::shared_ptr<LineDataVector>& SourceData::getLineDataForDiff() const 0163 { 0164 if(m_lmppData.m_pBuf == nullptr) 0165 { 0166 return m_normalData.m_v; 0167 } 0168 else 0169 { 0170 return m_lmppData.m_v; 0171 } 0172 } 0173 0174 const std::shared_ptr<LineDataVector>& SourceData::getLineDataForDisplay() const 0175 { 0176 return m_normalData.m_v; 0177 } 0178 0179 LineType SourceData::getSizeLines() const 0180 { 0181 return SafeInt<LineType>(m_normalData.lineCount()); 0182 } 0183 0184 qint64 SourceData::getSizeBytes() const 0185 { 0186 return m_normalData.byteCount(); 0187 } 0188 0189 const char* SourceData::getBuf() const 0190 { 0191 return m_normalData.m_pBuf.get(); 0192 } 0193 0194 const QString& 
SourceData::getText() const 0195 { 0196 return *m_normalData.m_unicodeBuf; 0197 } 0198 0199 bool SourceData::isText() const 0200 { 0201 return m_normalData.isText() || m_normalData.isEmpty(); 0202 } 0203 0204 bool SourceData::isIncompleteConversion() const 0205 { 0206 return m_normalData.m_bIncompleteConversion; 0207 } 0208 0209 bool SourceData::isFromBuffer() const 0210 { 0211 return mFromClipBoard; 0212 } 0213 0214 bool SourceData::isBinaryEqualWith(const QSharedPointer<SourceData>& other) const 0215 { 0216 return m_fileAccess.exists() && other->m_fileAccess.exists() && 0217 getSizeBytes() == other->getSizeBytes() && 0218 (getSizeBytes() == 0 || memcmp(getBuf(), other->getBuf(), getSizeBytes()) == 0); 0219 } 0220 0221 /* 0222 Warning: Do not call this function without re-running the comparison or 0223 otherwise resetting the DiffTextWindows as these store a pointer to the file 0224 data stored here. 0225 */ 0226 void SourceData::FileData::reset() 0227 { 0228 m_pBuf.reset(); 0229 m_v->clear(); 0230 mDataSize = 0; 0231 mLineCount = 0; 0232 m_bIsText = false; 0233 m_bIncompleteConversion = false; 0234 m_eLineEndStyle = eLineEndStyleUndefined; 0235 } 0236 0237 bool SourceData::FileData::readFile(FileAccess& file) 0238 { 0239 reset(); 0240 if(file.fileName().isEmpty()) 0241 { 0242 return true; 0243 } 0244 0245 if(!file.isNormal()) 0246 return true; 0247 0248 mDataSize = file.sizeForReading(); 0249 /* 0250 If the extra bytes are removed an unknown heap currption issue is triggered in the 0251 diff code. I don't have time to track this down to its true root cause. 0252 */ 0253 m_pBuf = std::make_unique<char[]>(mDataSize + 100); // Alloc 100 byte extra: Safety hack, not nice but does no harm. 0254 // Some extra bytes at the end of the buffer are needed by 0255 // the diff algorithm. See also GnuDiff::diff_2_files(). 
0256 bool bSuccess = file.readFile(m_pBuf.get(), mDataSize); 0257 if(!bSuccess) 0258 { 0259 m_pBuf = nullptr; 0260 mDataSize = 0; 0261 } 0262 else 0263 { 0264 //null terminate buffer 0265 m_pBuf[mDataSize + 1] = 0; 0266 m_pBuf[mDataSize + 2] = 0; 0267 m_pBuf[mDataSize + 3] = 0; 0268 m_pBuf[mDataSize + 4] = 0; 0269 } 0270 return bSuccess; 0271 } 0272 0273 bool SourceData::FileData::readFile(const QString& filename) 0274 { 0275 reset(); 0276 if(filename.isEmpty()) 0277 { 0278 return true; 0279 } 0280 0281 FileAccess fa(filename); 0282 return readFile(fa); 0283 } 0284 0285 bool SourceData::saveNormalDataAs(const QString& fileName) 0286 { 0287 return m_normalData.writeFile(fileName); 0288 } 0289 0290 bool SourceData::FileData::writeFile(const QString& filename) 0291 { 0292 if(filename.isEmpty()) 0293 { 0294 return true; 0295 } 0296 0297 FileAccess fa(filename); 0298 bool bSuccess = fa.writeFile(m_pBuf.get(), mDataSize); 0299 return bSuccess; 0300 } 0301 0302 //Deprecated 0303 void SourceData::FileData::copyBufFrom(const FileData& src) //TODO: Remove me. 
0304 { 0305 reset(); 0306 mDataSize = src.mDataSize; 0307 m_pBuf = std::make_unique<char[]>(mDataSize + 100); 0308 assert(src.m_pBuf != nullptr); 0309 memcpy(m_pBuf.get(), src.m_pBuf.get(), mDataSize); 0310 } 0311 0312 std::optional<const char*> SourceData::detectEncoding(const QString& fileName) 0313 { 0314 QFile f(fileName); 0315 if(f.open(QIODevice::ReadOnly)) 0316 { 0317 char buf[400]; 0318 0319 qint64 size = f.read(buf, sizeof(buf)); 0320 FileOffset skipBytes = 0; 0321 return detectEncoding(buf, size, skipBytes); 0322 } 0323 return {}; 0324 } 0325 0326 void SourceData::readAndPreprocess(const char* encoding, bool bAutoDetect) 0327 { 0328 QTemporaryFile fileIn1, fileOut1; 0329 QString fileNameIn1; 0330 QString fileNameOut1; 0331 QString fileNameIn2; 0332 QString fileNameOut2; 0333 0334 mEncoding = encoding; 0335 0336 // Detect the input for the preprocessing operations 0337 if(!mFromClipBoard) 0338 { 0339 //Routine result of directory compare finding a file that isn't in all locations. 0340 if(!m_fileAccess.isValid()) return; 0341 0342 assert(!m_fileAccess.exists() || !m_fileAccess.isDir()); 0343 if(!m_fileAccess.isNormal()) 0344 { 0345 mErrors.append(i18n("%1 is not a normal file.", m_fileAccess.prettyAbsPath())); 0346 return; 0347 } 0348 0349 if(m_fileAccess.isLocal()) 0350 { 0351 fileNameIn1 = m_fileAccess.absoluteFilePath(); 0352 } 0353 else // File is not local: create a temporary local copy: 0354 { 0355 if(m_tempInputFileName.isEmpty()) 0356 { 0357 m_fileAccess.createLocalCopy(); 0358 m_tempInputFileName = m_fileAccess.getTempName(); 0359 } 0360 0361 fileNameIn1 = m_tempInputFileName; 0362 } 0363 if(bAutoDetect) 0364 { 0365 mEncoding = detectEncoding(fileNameIn1).value_or(encoding); 0366 } 0367 } 0368 else // The input was set via setData(), probably from clipboard. 0369 { 0370 /* 0371 Used to happen during early startup this is now a bug. 
0372 */ 0373 assert(!m_tempInputFileName.isEmpty()); 0374 0375 fileNameIn1 = m_tempInputFileName; 0376 mEncoding = "UTF-8"; 0377 } 0378 const char* pEncoding1 = getEncoding(); 0379 const char* pEncoding2 = getEncoding(); 0380 const QString overSizedFile = i18nc("Error message. %1 = filepath", "File %1 too large to process. Skipping.", fileNameIn1); 0381 0382 m_normalData.reset(); 0383 m_lmppData.reset(); 0384 0385 FileAccess faIn(fileNameIn1); 0386 qint64 fileInSize = faIn.size(); 0387 0388 if(faIn.exists() && !faIn.isBrokenLink()) 0389 { 0390 try 0391 { 0392 // Run the first preprocessor 0393 if(gOptions->m_PreProcessorCmd.isEmpty()) 0394 { 0395 // No preprocessing: Read the file directly: 0396 if(!m_normalData.readFile(faIn)) 0397 { 0398 mErrors.append(faIn.getStatusText()); 0399 return; 0400 } 0401 } 0402 else 0403 { 0404 unsigned char b; 0405 //Don't fail the preprocessor command if the file can't be read. 0406 if(!faIn.readFile(&b, 1)) 0407 { 0408 mErrors.append(faIn.getStatusText()); 0409 mErrors.append(i18n(" Temp file is: %1", fileNameIn1)); 0410 return; 0411 } 0412 0413 QTemporaryFile tmpInPPFile; 0414 QString fileNameInPP = fileNameIn1; 0415 0416 if(pEncoding1 != gOptions->mEncodingPP) 0417 { 0418 // Before running the preprocessor convert to the format that the preprocessor expects. 
0419 FileAccess::createTempFile(tmpInPPFile); 0420 fileNameInPP = tmpInPPFile.fileName(); 0421 pEncoding1 = gOptions->mEncodingPP; 0422 convertFileEncoding(fileNameIn1, encoding, fileNameInPP, pEncoding1); 0423 } 0424 0425 QString ppCmd = gOptions->m_PreProcessorCmd; 0426 FileAccess::createTempFile(fileOut1); 0427 fileNameOut1 = fileOut1.fileName(); 0428 0429 QProcess ppProcess; 0430 ppProcess.setStandardInputFile(fileNameInPP); 0431 ppProcess.setStandardOutputFile(fileNameOut1); 0432 QString program; 0433 QStringList args; 0434 QString errorReason = Utils::getArguments(ppCmd, program, args); 0435 if(errorReason.isEmpty()) 0436 { 0437 ppProcess.start(program, args); 0438 ppProcess.waitForFinished(-1); 0439 } 0440 else 0441 errorReason = "\n(" + errorReason + ')'; 0442 0443 bool bSuccess = errorReason.isEmpty() && m_normalData.readFile(fileNameOut1); 0444 if(fileInSize > 0 && (!bSuccess || m_normalData.byteCount() == 0)) 0445 { 0446 mErrors.append( 0447 i18n("Preprocessing possibly failed. Check this command:\n\n %1" 0448 "\n\nThe preprocessing command will be disabled now.", 0449 ppCmd) + 0450 errorReason); 0451 gOptions->m_PreProcessorCmd = ""; 0452 0453 pEncoding1 = getEncoding(); 0454 } 0455 } 0456 } 0457 catch(const std::bad_alloc&) 0458 { 0459 m_normalData.reset(); 0460 mErrors.append(overSizedFile); 0461 return; 0462 } 0463 0464 if(!m_normalData.preprocess(pEncoding1, false)) 0465 { 0466 mErrors.append(overSizedFile); 0467 return; 0468 } 0469 //exit early for non text data further processing assumes a text file as input 0470 if(!m_normalData.isText()) 0471 return; 0472 0473 // LineMatching Preprocessor 0474 if(!gOptions->m_LineMatchingPreProcessorCmd.isEmpty()) 0475 { 0476 QTemporaryFile tempOut2, fileInPP; 0477 fileNameIn2 = fileNameOut1.isEmpty() ? 
fileNameIn1 : fileNameOut1; 0478 QString fileNameInPP = fileNameIn2; 0479 pEncoding2 = pEncoding1; 0480 if(pEncoding2 != gOptions->mEncodingPP) 0481 { 0482 // Before running the preprocessor convert to the format that the preprocessor expects. 0483 FileAccess::createTempFile(fileInPP); 0484 fileNameInPP = fileInPP.fileName(); 0485 pEncoding2 = gOptions->mEncodingPP; 0486 convertFileEncoding(fileNameIn2, pEncoding1, fileNameInPP, pEncoding2); 0487 } 0488 0489 QString ppCmd = gOptions->m_LineMatchingPreProcessorCmd; 0490 FileAccess::createTempFile(tempOut2); 0491 fileNameOut2 = tempOut2.fileName(); 0492 QProcess ppProcess; 0493 ppProcess.setStandardInputFile(fileNameInPP); 0494 ppProcess.setStandardOutputFile(fileNameOut2); 0495 QString program; 0496 QStringList args; 0497 QString errorReason = Utils::getArguments(ppCmd, program, args); 0498 if(errorReason.isEmpty()) 0499 { 0500 ppProcess.start(program, args); 0501 ppProcess.waitForFinished(-1); 0502 } 0503 else 0504 errorReason = "\n(" + errorReason + ')'; 0505 0506 bool bSuccess = errorReason.isEmpty() && m_lmppData.readFile(fileNameOut2); 0507 if(FileAccess(fileNameIn2).size() > 0 && (!bSuccess || m_lmppData.byteCount() == 0)) 0508 { 0509 mErrors.append( 0510 i18n("The line-matching-preprocessing possibly failed. Check this command:\n\n %1" 0511 "\n\nThe line-matching-preprocessing command will be disabled now.", ppCmd) + 0512 errorReason); 0513 gOptions->m_LineMatchingPreProcessorCmd = ""; 0514 if(!m_lmppData.readFile(fileNameIn2)) 0515 { 0516 mErrors.append(i18nc("Read error message. %1 = filepath", "Failed to read file: %1", fileNameIn2)); 0517 return; 0518 } 0519 } 0520 } 0521 else if(gOptions->ignoreComments() || gOptions->m_bIgnoreCase) 0522 { 0523 // We need a copy of the normal data. 
0524 m_lmppData.copyBufFrom(m_normalData); 0525 } 0526 } 0527 else 0528 { 0529 //exit early for nonexistent files 0530 return; 0531 } 0532 0533 if(!m_lmppData.preprocess(pEncoding2, true)) 0534 { 0535 mErrors.append(overSizedFile); 0536 return; 0537 } 0538 0539 assert(m_lmppData.isText()); 0540 //TODO: Needed? 0541 if(m_lmppData.lineCount() < m_normalData.lineCount()) 0542 { 0543 // Preprocessing command may result in smaller data buffer so adjust size 0544 for(qint64 i = m_lmppData.lineCount(); i < m_normalData.lineCount(); ++i) 0545 { // Set all empty lines to point to the end of the buffer. 0546 m_lmppData.m_v->push_back(LineData(m_lmppData.m_unicodeBuf, m_lmppData.m_unicodeBuf->length())); 0547 } 0548 0549 m_lmppData.mLineCount = m_normalData.lineCount(); 0550 } 0551 0552 // Ignore comments 0553 if(gOptions->ignoreComments() && hasData()) 0554 { 0555 qint64 vSize = std::min(m_normalData.lineCount(), m_lmppData.lineCount()); 0556 0557 for(qint64 i = 0; i < vSize; ++i) 0558 { 0559 //TODO: Phase this out. We should not be messing with these flags outside the parser. 
0560 (*m_normalData.m_v)[i].setPureComment((*m_lmppData.m_v)[i].isPureComment()); 0561 (*m_normalData.m_v)[i].setSkipable((*m_lmppData.m_v)[i].isSkipable()); 0562 } 0563 } 0564 } 0565 0566 /** Prepare the linedata vector for every input line.*/ 0567 bool SourceData::FileData::preprocess(const QByteArray& encoding, bool removeComments) 0568 { 0569 if(m_pBuf == nullptr) 0570 return true; 0571 0572 QString line; 0573 QChar curChar, prevChar = '\0'; 0574 LineType lines = 0; 0575 QtSizeType lastOffset = 0; 0576 FileOffset skipBytes = 0; 0577 QScopedPointer<CommentParser> parser(new DefaultCommentParser()); 0578 0579 // detect line end style 0580 QVector<e_LineEndStyle> vOrigDataLineEndStyle; 0581 m_eLineEndStyle = eLineEndStyleUndefined; 0582 0583 QByteArray pCodec = detectEncoding(m_pBuf.get(), mDataSize, skipBytes).value_or(encoding); 0584 if(pCodec != encoding) 0585 skipBytes = 0; 0586 0587 if(mDataSize - skipBytes > limits<qint32>::max()) 0588 { 0589 reset(); 0590 return false; 0591 } 0592 0593 try 0594 { 0595 const QByteArray ba = QByteArray::fromRawData(m_pBuf.get() + skipBytes, (QtSizeType)(mDataSize - skipBytes)); 0596 EncodedDataStream ds(ba); 0597 0598 mHasBOM = skipBytes != 0; 0599 ds.setEncoding(encoding); 0600 ds.setGenerateByteOrderMark(skipBytes != 0); 0601 0602 m_bIncompleteConversion = false; 0603 m_unicodeBuf->clear(); 0604 0605 assert(m_unicodeBuf->length() == 0); 0606 0607 mHasEOLTermination = false; 0608 bool skipNextRead = false; 0609 while(!ds.atEnd()) 0610 { 0611 line.clear(); 0612 if(lines >= limits<LineType>::max() - 5) 0613 { 0614 reset(); 0615 return false; 0616 } 0617 0618 if(!skipNextRead) 0619 { 0620 prevChar = curChar; 0621 ds.readChar(curChar); 0622 } 0623 else 0624 skipNextRead = false; 0625 0626 QtSizeType firstNonwhite = 0; 0627 bool foundNonWhite = false; 0628 0629 while(curChar != '\n' && curChar != '\r') 0630 { 0631 if(curChar.isNull() || curChar.isNonCharacter()) 0632 { 0633 m_v->clear(); 0634 return true; 0635 } 0636 0637 
if(curChar == QChar::ReplacementCharacter) 0638 m_bIncompleteConversion = true; 0639 0640 line.append(curChar); 0641 if(!curChar.isSpace() && !foundNonWhite) 0642 { 0643 firstNonwhite = line.length(); 0644 foundNonWhite = true; 0645 } 0646 0647 if(ds.atEnd()) 0648 break; 0649 0650 prevChar = curChar; 0651 ds.readChar(curChar); 0652 } 0653 0654 switch(curChar.unicode()) 0655 { 0656 case '\n': 0657 vOrigDataLineEndStyle.push_back(eLineEndStyleUnix); 0658 break; 0659 case '\r': 0660 if((FileOffset)lastOffset < mDataSize) 0661 { 0662 prevChar = curChar; 0663 ds.readChar(curChar); 0664 0665 if(curChar == '\n') 0666 { 0667 vOrigDataLineEndStyle.push_back(eLineEndStyleDos); 0668 break; 0669 } 0670 //work around for lack of seek API in QDataStream 0671 skipNextRead = true; 0672 } 0673 0674 //old mac style ending. 0675 vOrigDataLineEndStyle.push_back(eLineEndStyleOldMac); 0676 break; 0677 } 0678 parser->processLine(line); 0679 if(removeComments) 0680 parser->removeComment(line); 0681 //Qt6 intrudes 64bit sizes 0682 if(line.size() >= limits<LineType>::max()) 0683 { 0684 reset(); 0685 return false; 0686 } 0687 0688 ++lines; 0689 m_v->push_back(LineData(m_unicodeBuf, lastOffset, line.length(), firstNonwhite, parser->isSkipable(), parser->isPureComment())); 0690 //The last line may not have an EOL mark. In that case don't add one to our buffer. 0691 m_unicodeBuf->append(line); 0692 if(curChar == '\n' || curChar == '\r' || prevChar == '\r') 0693 { 0694 //kdiff3 internally uses only unix style endings for simplicity. 0695 m_unicodeBuf->append('\n'); 0696 } 0697 0698 assert(m_unicodeBuf->length() != lastOffset); 0699 lastOffset = m_unicodeBuf->length(); 0700 } 0701 0702 /* 0703 Process trailing new line as if there were a blank non-terminated line after it. 0704 But do nothing to the data buffer since this is a phantom line needed for internal purposes. 
0705 */ 0706 if(curChar == '\n' || curChar == '\r') 0707 { 0708 mHasEOLTermination = true; 0709 ++lines; 0710 0711 parser->processLine(""); 0712 m_v->push_back(LineData(m_unicodeBuf, lastOffset, 0, 0, parser->isSkipable(), parser->isPureComment())); 0713 } 0714 0715 m_v->push_back(LineData(m_unicodeBuf, lastOffset)); 0716 0717 m_bIsText = true; 0718 0719 if(!vOrigDataLineEndStyle.isEmpty()) 0720 m_eLineEndStyle = vOrigDataLineEndStyle[0]; 0721 0722 mLineCount = lines; 0723 return true; 0724 } 0725 catch(const std::bad_alloc&) 0726 { 0727 reset(); 0728 return false; 0729 } 0730 } 0731 0732 // Convert the input file from input encoding to output encoding and write it to the output file. 0733 bool SourceData::convertFileEncoding(const QString& fileNameIn, const QByteArray& pCodecIn, 0734 const QString& fileNameOut, const QByteArray& pCodecOut) 0735 { 0736 QFile in(fileNameIn); 0737 if(!in.open(QIODevice::ReadOnly)) 0738 return false; 0739 EncodedDataStream inStream(&in); 0740 inStream.setEncoding(pCodecIn); 0741 0742 QFile out(fileNameOut); 0743 if(!out.open(QIODevice::WriteOnly)) 0744 return false; 0745 EncodedDataStream outStream(&out); 0746 outStream.setEncoding(pCodecOut); 0747 0748 QString data; 0749 while(!inStream.atEnd()) 0750 { 0751 QChar c; 0752 inStream.readChar(c); 0753 data += c; 0754 } 0755 outStream << data; 0756 0757 return true; 0758 } 0759 0760 std::optional<const char*> SourceData::getEncodingFromTag(const QByteArray& s, const QByteArray& encodingTag) 0761 { 0762 QtSizeType encodingPos = s.indexOf(encodingTag); 0763 if(encodingPos >= 0) 0764 { 0765 QtSizeType apostrophPos = s.indexOf('"', encodingPos + encodingTag.length()); 0766 QtSizeType apostroph2Pos = s.indexOf('\'', encodingPos + encodingTag.length()); 0767 char apostroph = '"'; 0768 if(apostroph2Pos >= 0 && (apostrophPos < 0 || apostroph2Pos < apostrophPos)) 0769 { 0770 apostroph = '\''; 0771 apostrophPos = apostroph2Pos; 0772 } 0773 0774 QtSizeType encodingEnd = s.indexOf(apostroph, 
apostrophPos + 1); 0775 if(encodingEnd >= 0) // e.g.: <meta charset="utf-8"> or <?xml version="1.0" encoding="ISO-8859-1"?> 0776 { 0777 QByteArray encoding = s.mid(apostrophPos + 1, encodingEnd - (apostrophPos + 1)); 0778 if(QTextCodec::codecForName(encoding)) 0779 return encoding; 0780 } 0781 else // e.g.: <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> 0782 { 0783 QByteArray encoding = s.mid(encodingPos + encodingTag.length(), apostrophPos - (encodingPos + encodingTag.length())); 0784 if(QTextCodec::codecForName(encoding)) 0785 return encoding; 0786 } 0787 } 0788 return {}; 0789 } 0790 0791 std::optional<const char*> SourceData::detectEncoding(const char* buf, qint64 size, FileOffset& skipBytes) 0792 { 0793 if(size >= 2) 0794 { 0795 if(buf[0] == '\xFF' && buf[1] == '\xFE') 0796 { 0797 skipBytes = 2; 0798 return "UTF-16LE"; 0799 } 0800 0801 if(buf[0] == '\xFE' && buf[1] == '\xFF') 0802 { 0803 skipBytes = 2; 0804 return "UTF-16BE"; 0805 } 0806 } 0807 if(size >= 3) 0808 { 0809 if(buf[0] == '\xEF' && buf[1] == '\xBB' && buf[2] == '\xBF') 0810 { 0811 skipBytes = 3; 0812 return "UTF-8-BOM"; 0813 } 0814 } 0815 skipBytes = 0; 0816 QByteArray s; 0817 /* 0818 We don't need the whole file here just the header. 
0819 */ 0820 if(size <= 5000) 0821 s = QByteArray(buf, (QtSizeType)size); 0822 else 0823 s = QByteArray(buf, 5000); 0824 0825 QtSizeType xmlHeaderPos = s.indexOf("<?xml"); 0826 if(xmlHeaderPos >= 0) 0827 { 0828 QtSizeType xmlHeaderEnd = s.indexOf("?>", xmlHeaderPos); 0829 if(xmlHeaderEnd >= 0) 0830 { 0831 std::optional<const char*> encoding = getEncodingFromTag(s.mid(xmlHeaderPos, xmlHeaderEnd - xmlHeaderPos), "encoding="); 0832 if(encoding.has_value()) 0833 return encoding; 0834 } 0835 } 0836 else // HTML 0837 { 0838 QtSizeType metaHeaderPos = s.indexOf("<meta"); 0839 while(metaHeaderPos >= 0) 0840 { 0841 QtSizeType metaHeaderEnd = s.indexOf(">", metaHeaderPos); 0842 if(metaHeaderEnd >= 0) 0843 { 0844 std::optional<const char*> encoding = getEncodingFromTag(s.mid(metaHeaderPos, metaHeaderEnd - metaHeaderPos), "charset="); 0845 if(encoding.has_value()) 0846 return encoding; 0847 0848 metaHeaderPos = s.indexOf("<meta", metaHeaderEnd); 0849 } 0850 else 0851 break; 0852 } 0853 } 0854 //Attempt to detect non-bom UTF8. This is a very common encoding. 0855 return detectUTF8(s); 0856 } 0857 0858 std::optional<const char*> SourceData::detectUTF8(const QByteArray& data) 0859 { 0860 QTextCodec* utf8 = QTextCodec::codecForName("UTF-8"); 0861 0862 QTextCodec::ConverterState state; 0863 utf8->toUnicode(data.constData(), SafeInt<qint32>(data.size()), &state); 0864 0865 if(state.invalidChars == 0) 0866 for (qint32 i = 0; i < data.size()-state.remainingChars; i++) 0867 if ((unsigned)data.at(i) > 127) 0868 return "UTF-8"; 0869 0870 return {}; 0871 }