File indexing completed on 2024-09-08 05:08:15
0001 // clang-format off 0002 /* 0003 * KDiff3 - Text Diff And Merge Tool 0004 * 0005 * SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de 0006 * SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com 0007 * SPDX-License-Identifier: GPL-2.0-or-later 0008 */ 0009 // clang-format on 0010 0011 /* Features of class SourceData: 0012 - Read a file (from the given URL) or accept data via a string. 0013 - Allocate and free buffers as necessary. 0014 - Run a preprocessor, when specified. 0015 - Run the line-matching preprocessor, when specified. 0016 - Run other preprocessing steps: Uppercase, ignore comments, 0017 remove carriage return, ignore numbers. 0018 0019 Order of operation: 0020 1. If data was given via a string then save it to a temp file. (see setData()) 0021 2. If the specified file is nonlocal (URL) copy it to a temp file. (TODO revisit this) 0022 3. If a preprocessor was specified, run the input file through it. 0023 4. Read the output of the preprocessor. 0024 5. If Uppercase was specified: Turn the read data to uppercase. 0025 6. Write the result to a temp file. 0026 7. If a line-matching preprocessor was specified, run the temp file through it. 0027 8. Read the output of the line-matching preprocessor. 0028 9. If ignore numbers was specified, strip the LMPP-output of all numbers. 0029 10. If ignore comments was specified, strip the LMPP-output of comments. 0030 0031 Optimizations: Skip unneeded steps. 0032 */ 0033 #include "SourceData.h" 0034 0035 #include "CommentParser.h" 0036 #include "compat.h" 0037 #include "diff.h" 0038 #include "EncodedDataStream.h" 0039 #include "LineRef.h" 0040 #include "Logging.h" 0041 #include "Utils.h" 0042 0043 #include <algorithm> // for min 0044 #include <memory> 0045 #include <optional> 0046 #include <vector> // for vector 0047 0048 #include <QtGlobal> 0049 0050 #include <QByteArray> 0051 #include <QProcess> 0052 #include <QString> 0053 #include <QTemporaryFile> 0054 #include <QTextCodec> 0055 #include <QVector> 0056 0057 extern std::unique_ptr<Options> gOptions; 0058 0059 void SourceData::reset() 0060 { 0061 mFromClipBoard = false; 0062 mEncoding = u8"UTF-8"; 0063 m_fileAccess = FileAccess(); 0064 m_normalData.reset(); 0065 m_lmppData.reset(); 0066 if(!m_tempInputFileName.isEmpty()) 0067 { 0068 m_tempFile.remove(); 0069 m_tempInputFileName = ""; 0070 } 0071 0072 mErrors.clear(); 0073 } 0074 0075 void SourceData::setFilename(const QString& filename) 0076 { 0077 if(filename.isEmpty()) 0078 { 0079 reset(); 0080 } 0081 else 0082 { 0083 setFileAccess(FileAccess(filename)); 0084 } 0085 } 0086 0087 bool SourceData::isEmpty() const 0088 { 0089 return getFilename().isEmpty(); 0090 } 0091 0092 bool SourceData::hasData() const 0093 { 0094 return m_normalData.m_pBuf != nullptr; 0095 } 0096 0097 bool SourceData::isValid() const 0098 { 0099 return isEmpty() || hasData(); 0100 } 0101 0102 QString SourceData::getFilename() const 0103 { 0104 return m_fileAccess.absoluteFilePath(); 0105 } 0106 0107 QString SourceData::getAliasName() const 0108 { 0109 return m_aliasName.isEmpty() ? m_fileAccess.prettyAbsPath() : m_aliasName; 0110 } 0111 0112 void SourceData::setAliasName(const QString& name) 0113 { 0114 m_aliasName = name; 0115 } 0116 0117 void SourceData::setFileAccess(const FileAccess& fileAccess) 0118 { 0119 mFromClipBoard = false; 0120 0121 m_fileAccess = fileAccess; 0122 m_aliasName = QString(); 0123 if(!m_tempInputFileName.isEmpty()) 0124 { 0125 m_tempFile.remove(); 0126 m_tempInputFileName = ""; 0127 } 0128 0129 mErrors.clear(); 0130 } 0131 0132 void SourceData::setEncoding(const char* encoding) 0133 { 0134 mEncoding = encoding; 0135 } 0136 0137 void SourceData::setData(const QString& data) 0138 { 0139 mErrors.clear(); 0140 // Create a temp file for preprocessing: 0141 if(m_tempInputFileName.isEmpty()) 0142 { 0143 FileAccess::createTempFile(m_tempFile); 0144 m_tempInputFileName = m_tempFile.fileName(); 0145 } 0146 m_fileAccess = FileAccess(m_tempInputFileName); 0147 QByteArray ba = QTextCodec::codecForName("UTF-8")->fromUnicode(data); 0148 bool bSuccess = m_fileAccess.writeFile(ba.constData(), ba.length()); 0149 if(!bSuccess) 0150 { 0151 mErrors.append(i18n("Writing clipboard data to temp file failed.")); 0152 return; 0153 } 0154 else 0155 { 0156 m_aliasName = i18n("From Clipboard"); 0157 mFromClipBoard = true; 0158 } 0159 } 0160 0161 const std::shared_ptr<LineDataVector>& SourceData::getLineDataForDiff() const 0162 { 0163 if(m_lmppData.m_pBuf == nullptr) 0164 { 0165 return m_normalData.m_v; 0166 } 0167 else 0168 { 0169 return m_lmppData.m_v; 0170 } 0171 } 0172 0173 const std::shared_ptr<LineDataVector>& SourceData::getLineDataForDisplay() const 0174 { 0175 return m_normalData.m_v; 0176 } 0177 0178 LineType SourceData::getSizeLines() const 0179 { 0180 return SafeInt<LineType>(m_normalData.lineCount()); 0181 } 0182 0183 qint64 SourceData::getSizeBytes() const 0184 { 0185 return m_normalData.byteCount(); 0186 } 0187 0188 const char* SourceData::getBuf() const 0189 { 0190 return m_normalData.m_pBuf.get(); 0191 } 0192 0193 const QString& SourceData::getText() const 0194 { 0195 return *m_normalData.m_unicodeBuf; 0196 } 0197 0198 bool SourceData::isText() const 0199 { 0200 return m_normalData.isText() || m_normalData.isEmpty(); 0201 } 0202 0203 bool SourceData::isIncompleteConversion() const 0204 { 0205 return m_normalData.m_bIncompleteConversion; 0206 } 0207 0208 bool SourceData::isFromBuffer() const 0209 { 0210 return mFromClipBoard; 0211 } 0212 0213 bool SourceData::isBinaryEqualWith(const QSharedPointer<SourceData>& other) const 0214 { 0215 return m_fileAccess.exists() && other->m_fileAccess.exists() && 0216 getSizeBytes() == other->getSizeBytes() && 0217 (getSizeBytes() == 0 || memcmp(getBuf(), other->getBuf(), getSizeBytes()) == 0); 0218 } 0219 0220 /* 0221 Warning: Do not call this function without re-running the comparison or 0222 otherwise resetting the DiffTextWindows as these store a pointer to the file 0223 data stored here. 0224 */ 0225 void SourceData::FileData::reset() 0226 { 0227 m_pBuf.reset(); 0228 m_v->clear(); 0229 mDataSize = 0; 0230 mLineCount = 0; 0231 m_bIsText = false; 0232 m_bIncompleteConversion = false; 0233 m_eLineEndStyle = eLineEndStyleUndefined; 0234 } 0235 0236 bool SourceData::FileData::readFile(FileAccess& file) 0237 { 0238 reset(); 0239 if(file.fileName().isEmpty()) 0240 { 0241 return true; 0242 } 0243 0244 if(!file.isNormal()) 0245 return true; 0246 0247 mDataSize = file.sizeForReading(); 0248 /* 0249 If the extra bytes are removed an unknown heap currption issue is triggered in the 0250 diff code. I don't have time to track this down to its true root cause. 0251 */ 0252 m_pBuf = std::make_unique<char[]>(mDataSize + 100); // Alloc 100 byte extra: Safety hack, not nice but does no harm. 0253 // Some extra bytes at the end of the buffer are needed by 0254 // the diff algorithm. See also GnuDiff::diff_2_files(). 0255 bool bSuccess = file.readFile(m_pBuf.get(), mDataSize); 0256 if(!bSuccess) 0257 { 0258 m_pBuf = nullptr; 0259 mDataSize = 0; 0260 } 0261 else 0262 { 0263 //null terminate buffer 0264 m_pBuf[mDataSize + 1] = 0; 0265 m_pBuf[mDataSize + 2] = 0; 0266 m_pBuf[mDataSize + 3] = 0; 0267 m_pBuf[mDataSize + 4] = 0; 0268 } 0269 return bSuccess; 0270 } 0271 0272 bool SourceData::FileData::readFile(const QString& filename) 0273 { 0274 reset(); 0275 if(filename.isEmpty()) 0276 { 0277 return true; 0278 } 0279 0280 FileAccess fa(filename); 0281 return readFile(fa); 0282 } 0283 0284 bool SourceData::saveNormalDataAs(const QString& fileName) 0285 { 0286 return m_normalData.writeFile(fileName); 0287 } 0288 0289 bool SourceData::FileData::writeFile(const QString& filename) 0290 { 0291 if(filename.isEmpty()) 0292 { 0293 return true; 0294 } 0295 0296 FileAccess fa(filename); 0297 bool bSuccess = fa.writeFile(m_pBuf.get(), mDataSize); 0298 return bSuccess; 0299 } 0300 0301 //Deprecated 0302 void SourceData::FileData::copyBufFrom(const FileData& src) //TODO: Remove me. 0303 { 0304 reset(); 0305 mDataSize = src.mDataSize; 0306 m_pBuf = std::make_unique<char[]>(mDataSize + 100); 0307 assert(src.m_pBuf != nullptr); 0308 memcpy(m_pBuf.get(), src.m_pBuf.get(), mDataSize); 0309 } 0310 0311 std::optional<const char*> SourceData::detectEncoding(const QString& fileName) 0312 { 0313 QFile f(fileName); 0314 if(f.open(QIODevice::ReadOnly)) 0315 { 0316 char buf[400]; 0317 0318 qint64 size = f.read(buf, sizeof(buf)); 0319 FileOffset skipBytes = 0; 0320 return detectEncoding(buf, size, skipBytes); 0321 } 0322 return {}; 0323 } 0324 0325 void SourceData::readAndPreprocess(const char* encoding, bool bAutoDetect) 0326 { 0327 QTemporaryFile fileIn1, fileOut1; 0328 QString fileNameIn1; 0329 QString fileNameOut1; 0330 QString fileNameIn2; 0331 QString fileNameOut2; 0332 0333 mEncoding = encoding; 0334 0335 // Detect the input for the preprocessing operations 0336 if(!mFromClipBoard) 0337 { 0338 //Routine result of directory compare finding a file that isn't in all locations. 0339 if(!m_fileAccess.isValid()) return; 0340 0341 assert(!m_fileAccess.exists() || !m_fileAccess.isDir()); 0342 if(!m_fileAccess.isNormal()) 0343 { 0344 mErrors.append(i18n("%1 is not a normal file.", m_fileAccess.prettyAbsPath())); 0345 return; 0346 } 0347 0348 if(m_fileAccess.isLocal()) 0349 { 0350 fileNameIn1 = m_fileAccess.absoluteFilePath(); 0351 } 0352 else // File is not local: create a temporary local copy: 0353 { 0354 if(m_tempInputFileName.isEmpty()) 0355 { 0356 m_fileAccess.createLocalCopy(); 0357 m_tempInputFileName = m_fileAccess.getTempName(); 0358 } 0359 0360 fileNameIn1 = m_tempInputFileName; 0361 } 0362 if(bAutoDetect) 0363 { 0364 mEncoding = detectEncoding(fileNameIn1).value_or(encoding); 0365 } 0366 } 0367 else // The input was set via setData(), probably from clipboard. 0368 { 0369 /* 0370 Used to happen during early startup this is now a bug. 0371 */ 0372 assert(!m_tempInputFileName.isEmpty()); 0373 0374 fileNameIn1 = m_tempInputFileName; 0375 mEncoding = "UTF-8"; 0376 } 0377 const char* pEncoding1 = getEncoding(); 0378 const char* pEncoding2 = getEncoding(); 0379 const QString overSizedFile = i18nc("Error message. %1 = filepath", "File %1 too large to process. Skipping.", fileNameIn1); 0380 0381 m_normalData.reset(); 0382 m_lmppData.reset(); 0383 0384 FileAccess faIn(fileNameIn1); 0385 qint64 fileInSize = faIn.size(); 0386 0387 if(faIn.exists() && !faIn.isBrokenLink()) 0388 { 0389 try 0390 { 0391 // Run the first preprocessor 0392 if(gOptions->m_PreProcessorCmd.isEmpty()) 0393 { 0394 // No preprocessing: Read the file directly: 0395 if(!m_normalData.readFile(faIn)) 0396 { 0397 mErrors.append(faIn.getStatusText()); 0398 return; 0399 } 0400 } 0401 else 0402 { 0403 unsigned char b; 0404 //Don't fail the preprocessor command if the file can't be read. 0405 if(!faIn.readFile(&b, 1)) 0406 { 0407 mErrors.append(faIn.getStatusText()); 0408 mErrors.append(i18n(" Temp file is: %1", fileNameIn1)); 0409 return; 0410 } 0411 0412 QTemporaryFile tmpInPPFile; 0413 QString fileNameInPP = fileNameIn1; 0414 0415 if(pEncoding1 != gOptions->mEncodingPP) 0416 { 0417 // Before running the preprocessor convert to the format that the preprocessor expects. 0418 FileAccess::createTempFile(tmpInPPFile); 0419 fileNameInPP = tmpInPPFile.fileName(); 0420 pEncoding1 = gOptions->mEncodingPP; 0421 convertFileEncoding(fileNameIn1, encoding, fileNameInPP, pEncoding1); 0422 } 0423 0424 QString ppCmd = gOptions->m_PreProcessorCmd; 0425 FileAccess::createTempFile(fileOut1); 0426 fileNameOut1 = fileOut1.fileName(); 0427 0428 QProcess ppProcess; 0429 ppProcess.setStandardInputFile(fileNameInPP); 0430 ppProcess.setStandardOutputFile(fileNameOut1); 0431 QString program; 0432 QStringList args; 0433 QString errorReason = Utils::getArguments(ppCmd, program, args); 0434 if(errorReason.isEmpty()) 0435 { 0436 ppProcess.start(program, args); 0437 ppProcess.waitForFinished(-1); 0438 } 0439 else 0440 errorReason = "\n(" + errorReason + ')'; 0441 0442 bool bSuccess = errorReason.isEmpty() && m_normalData.readFile(fileNameOut1); 0443 if(fileInSize > 0 && (!bSuccess || m_normalData.byteCount() == 0)) 0444 { 0445 mErrors.append( 0446 i18n("Preprocessing possibly failed. Check this command:\n\n %1" 0447 "\n\nThe preprocessing command will be disabled now.", 0448 ppCmd) + 0449 errorReason); 0450 gOptions->m_PreProcessorCmd = ""; 0451 0452 pEncoding1 = getEncoding(); 0453 } 0454 } 0455 } 0456 catch(const std::bad_alloc&) 0457 { 0458 m_normalData.reset(); 0459 mErrors.append(overSizedFile); 0460 return; 0461 } 0462 0463 if(!m_normalData.preprocess(pEncoding1, false)) 0464 { 0465 mErrors.append(overSizedFile); 0466 return; 0467 } 0468 //exit early for non text data further processing assumes a text file as input 0469 if(!m_normalData.isText()) 0470 return; 0471 0472 // LineMatching Preprocessor 0473 if(!gOptions->m_LineMatchingPreProcessorCmd.isEmpty()) 0474 { 0475 QTemporaryFile tempOut2, fileInPP; 0476 fileNameIn2 = fileNameOut1.isEmpty() ? fileNameIn1 : fileNameOut1; 0477 QString fileNameInPP = fileNameIn2; 0478 pEncoding2 = pEncoding1; 0479 if(pEncoding2 != gOptions->mEncodingPP) 0480 { 0481 // Before running the preprocessor convert to the format that the preprocessor expects. 0482 FileAccess::createTempFile(fileInPP); 0483 fileNameInPP = fileInPP.fileName(); 0484 pEncoding2 = gOptions->mEncodingPP; 0485 convertFileEncoding(fileNameIn2, pEncoding1, fileNameInPP, pEncoding2); 0486 } 0487 0488 QString ppCmd = gOptions->m_LineMatchingPreProcessorCmd; 0489 FileAccess::createTempFile(tempOut2); 0490 fileNameOut2 = tempOut2.fileName(); 0491 QProcess ppProcess; 0492 ppProcess.setStandardInputFile(fileNameInPP); 0493 ppProcess.setStandardOutputFile(fileNameOut2); 0494 QString program; 0495 QStringList args; 0496 QString errorReason = Utils::getArguments(ppCmd, program, args); 0497 if(errorReason.isEmpty()) 0498 { 0499 ppProcess.start(program, args); 0500 ppProcess.waitForFinished(-1); 0501 } 0502 else 0503 errorReason = "\n(" + errorReason + ')'; 0504 0505 bool bSuccess = errorReason.isEmpty() && m_lmppData.readFile(fileNameOut2); 0506 if(FileAccess(fileNameIn2).size() > 0 && (!bSuccess || m_lmppData.byteCount() == 0)) 0507 { 0508 mErrors.append( 0509 i18n("The line-matching-preprocessing possibly failed. Check this command:\n\n %1" 0510 "\n\nThe line-matching-preprocessing command will be disabled now.", ppCmd) + 0511 errorReason); 0512 gOptions->m_LineMatchingPreProcessorCmd = ""; 0513 if(!m_lmppData.readFile(fileNameIn2)) 0514 { 0515 mErrors.append(i18nc("Read error message. %1 = filepath", "Failed to read file: %1", fileNameIn2)); 0516 return; 0517 } 0518 } 0519 } 0520 else if(gOptions->ignoreComments() || gOptions->m_bIgnoreCase) 0521 { 0522 // We need a copy of the normal data. 0523 m_lmppData.copyBufFrom(m_normalData); 0524 } 0525 } 0526 else 0527 { 0528 //exit early for nonexistent files 0529 return; 0530 } 0531 0532 if(!m_lmppData.preprocess(pEncoding2, true)) 0533 { 0534 mErrors.append(overSizedFile); 0535 return; 0536 } 0537 0538 assert(m_lmppData.isText()); 0539 //TODO: Needed? 0540 if(m_lmppData.lineCount() < m_normalData.lineCount()) 0541 { 0542 // Preprocessing command may result in smaller data buffer so adjust size 0543 for(qint64 i = m_lmppData.lineCount(); i < m_normalData.lineCount(); ++i) 0544 { // Set all empty lines to point to the end of the buffer. 0545 m_lmppData.m_v->push_back(LineData(m_lmppData.m_unicodeBuf, m_lmppData.m_unicodeBuf->length())); 0546 } 0547 0548 m_lmppData.mLineCount = m_normalData.lineCount(); 0549 } 0550 0551 // Ignore comments 0552 if(gOptions->ignoreComments() && hasData()) 0553 { 0554 qint64 vSize = std::min(m_normalData.lineCount(), m_lmppData.lineCount()); 0555 0556 for(qint64 i = 0; i < vSize; ++i) 0557 { 0558 //TODO: Phase this out. We should not be messing with these flags outside the parser. 0559 (*m_normalData.m_v)[i].setPureComment((*m_lmppData.m_v)[i].isPureComment()); 0560 (*m_normalData.m_v)[i].setSkipable((*m_lmppData.m_v)[i].isSkipable()); 0561 } 0562 } 0563 } 0564 0565 /** Prepare the linedata vector for every input line.*/ 0566 bool SourceData::FileData::preprocess(const QByteArray& encoding, bool removeComments) 0567 { 0568 if(m_pBuf == nullptr) 0569 return true; 0570 0571 QString line; 0572 QChar curChar, prevChar = '\0'; 0573 LineType lines = 0; 0574 QtSizeType lastOffset = 0; 0575 FileOffset skipBytes = 0; 0576 std::unique_ptr<CommentParser> parser(new DefaultCommentParser()); 0577 0578 // detect line end style 0579 QVector<e_LineEndStyle> vOrigDataLineEndStyle; 0580 m_eLineEndStyle = eLineEndStyleUndefined; 0581 0582 QByteArray pCodec = detectEncoding(m_pBuf.get(), mDataSize, skipBytes).value_or(encoding); 0583 if(pCodec != encoding) 0584 skipBytes = 0; 0585 0586 if(mDataSize - skipBytes > limits<qint32>::max()) 0587 { 0588 reset(); 0589 return false; 0590 } 0591 0592 try 0593 { 0594 const QByteArray ba = QByteArray::fromRawData(m_pBuf.get() + skipBytes, (QtSizeType)(mDataSize - skipBytes)); 0595 EncodedDataStream ds(ba); 0596 0597 mHasBOM = skipBytes != 0; 0598 ds.setEncoding(encoding); 0599 ds.setGenerateByteOrderMark(skipBytes != 0); 0600 0601 m_bIncompleteConversion = false; 0602 m_unicodeBuf->clear(); 0603 0604 assert(m_unicodeBuf->length() == 0); 0605 0606 mHasEOLTermination = false; 0607 bool skipNextRead = false; 0608 while(!ds.atEnd()) 0609 { 0610 line.clear(); 0611 if(lines >= limits<LineType>::max() - 5) 0612 { 0613 reset(); 0614 return false; 0615 } 0616 0617 if(!skipNextRead) 0618 { 0619 prevChar = curChar; 0620 ds.readChar(curChar); 0621 } 0622 else 0623 skipNextRead = false; 0624 0625 QtSizeType firstNonwhite = 0; 0626 bool foundNonWhite = false; 0627 0628 while(curChar != '\n' && curChar != '\r') 0629 { 0630 if(curChar.isNull() || curChar.isNonCharacter()) 0631 { 0632 m_v->clear(); 0633 return true; 0634 } 0635 0636 if(curChar == QChar::ReplacementCharacter) 0637 m_bIncompleteConversion = true; 0638 0639 line.append(curChar); 0640 if(!curChar.isSpace() && !foundNonWhite) 0641 { 0642 firstNonwhite = line.length(); 0643 foundNonWhite = true; 0644 } 0645 0646 if(ds.atEnd()) 0647 break; 0648 0649 prevChar = curChar; 0650 ds.readChar(curChar); 0651 } 0652 0653 switch(curChar.unicode()) 0654 { 0655 case '\n': 0656 vOrigDataLineEndStyle.push_back(eLineEndStyleUnix); 0657 break; 0658 case '\r': 0659 if((FileOffset)lastOffset < mDataSize) 0660 { 0661 prevChar = curChar; 0662 ds.readChar(curChar); 0663 0664 if(curChar == '\n') 0665 { 0666 vOrigDataLineEndStyle.push_back(eLineEndStyleDos); 0667 break; 0668 } 0669 //work around for lack of seek API in QDataStream 0670 skipNextRead = true; 0671 } 0672 0673 //old mac style ending. 0674 vOrigDataLineEndStyle.push_back(eLineEndStyleOldMac); 0675 break; 0676 } 0677 parser->processLine(line); 0678 if(removeComments) 0679 parser->removeComment(line); 0680 //Qt6 intrudes 64bit sizes 0681 if(line.size() >= limits<LineType>::max()) 0682 { 0683 reset(); 0684 return false; 0685 } 0686 0687 ++lines; 0688 m_v->push_back(LineData(m_unicodeBuf, lastOffset, line.length(), firstNonwhite, parser->isSkipable(), parser->isPureComment())); 0689 //The last line may not have an EOL mark. In that case don't add one to our buffer. 0690 m_unicodeBuf->append(line); 0691 if(curChar == '\n' || curChar == '\r' || prevChar == '\r') 0692 { 0693 //kdiff3 internally uses only unix style endings for simplicity. 0694 m_unicodeBuf->append('\n'); 0695 } 0696 0697 assert(m_unicodeBuf->length() != lastOffset); 0698 lastOffset = m_unicodeBuf->length(); 0699 } 0700 0701 /* 0702 Process trailing new line as if there were a blank non-terminated line after it. 0703 But do nothing to the data buffer since this is a phantom line needed for internal purposes. 0704 */ 0705 if(curChar == '\n' || curChar == '\r') 0706 { 0707 mHasEOLTermination = true; 0708 ++lines; 0709 0710 parser->processLine(""); 0711 m_v->push_back(LineData(m_unicodeBuf, lastOffset, 0, 0, parser->isSkipable(), parser->isPureComment())); 0712 } 0713 0714 m_v->push_back(LineData(m_unicodeBuf, lastOffset)); 0715 0716 m_bIsText = true; 0717 0718 if(!vOrigDataLineEndStyle.isEmpty()) 0719 m_eLineEndStyle = vOrigDataLineEndStyle[0]; 0720 0721 mLineCount = lines; 0722 return true; 0723 } 0724 catch(const std::bad_alloc&) 0725 { 0726 reset(); 0727 return false; 0728 } 0729 } 0730 0731 // Convert the input file from input encoding to output encoding and write it to the output file. 0732 bool SourceData::convertFileEncoding(const QString& fileNameIn, const QByteArray& pCodecIn, 0733 const QString& fileNameOut, const QByteArray& pCodecOut) 0734 { 0735 QFile in(fileNameIn); 0736 if(!in.open(QIODevice::ReadOnly)) 0737 return false; 0738 EncodedDataStream inStream(&in); 0739 inStream.setEncoding(pCodecIn); 0740 0741 QFile out(fileNameOut); 0742 if(!out.open(QIODevice::WriteOnly)) 0743 return false; 0744 EncodedDataStream outStream(&out); 0745 outStream.setEncoding(pCodecOut); 0746 0747 QString data; 0748 while(!inStream.atEnd()) 0749 { 0750 QChar c; 0751 inStream.readChar(c); 0752 data += c; 0753 } 0754 outStream << data; 0755 0756 return true; 0757 } 0758 0759 std::optional<const char*> SourceData::getEncodingFromTag(const QByteArray& s, const QByteArray& encodingTag) 0760 { 0761 QtSizeType encodingPos = s.indexOf(encodingTag); 0762 if(encodingPos >= 0) 0763 { 0764 QtSizeType apostrophPos = s.indexOf('"', encodingPos + encodingTag.length()); 0765 QtSizeType apostroph2Pos = s.indexOf('\'', encodingPos + encodingTag.length()); 0766 char apostroph = '"'; 0767 if(apostroph2Pos >= 0 && (apostrophPos < 0 || apostroph2Pos < apostrophPos)) 0768 { 0769 apostroph = '\''; 0770 apostrophPos = apostroph2Pos; 0771 } 0772 0773 QtSizeType encodingEnd = s.indexOf(apostroph, apostrophPos + 1); 0774 if(encodingEnd >= 0) // e.g.: <meta charset="utf-8"> or <?xml version="1.0" encoding="ISO-8859-1"?> 0775 { 0776 QByteArray encoding = s.mid(apostrophPos + 1, encodingEnd - (apostrophPos + 1)); 0777 if(QTextCodec::codecForName(encoding)) 0778 return encoding; 0779 } 0780 else // e.g.: <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> 0781 { 0782 QByteArray encoding = s.mid(encodingPos + encodingTag.length(), apostrophPos - (encodingPos + encodingTag.length())); 0783 if(QTextCodec::codecForName(encoding)) 0784 return encoding; 0785 } 0786 } 0787 return {}; 0788 } 0789 0790 std::optional<const char*> SourceData::detectEncoding(const char* buf, qint64 size, FileOffset& skipBytes) 0791 { 0792 if(size >= 2) 0793 { 0794 if(buf[0] == '\xFF' && buf[1] == '\xFE') 0795 { 0796 skipBytes = 2; 0797 return "UTF-16LE"; 0798 } 0799 0800 if(buf[0] == '\xFE' && buf[1] == '\xFF') 0801 { 0802 skipBytes = 2; 0803 return "UTF-16BE"; 0804 } 0805 } 0806 if(size >= 3) 0807 { 0808 if(buf[0] == '\xEF' && buf[1] == '\xBB' && buf[2] == '\xBF') 0809 { 0810 skipBytes = 3; 0811 return "UTF-8-BOM"; 0812 } 0813 } 0814 skipBytes = 0; 0815 QByteArray s; 0816 /* 0817 We don't need the whole file here just the header. 0818 */ 0819 if(size <= 5000) 0820 s = QByteArray(buf, (QtSizeType)size); 0821 else 0822 s = QByteArray(buf, 5000); 0823 0824 QtSizeType xmlHeaderPos = s.indexOf("<?xml"); 0825 if(xmlHeaderPos >= 0) 0826 { 0827 QtSizeType xmlHeaderEnd = s.indexOf("?>", xmlHeaderPos); 0828 if(xmlHeaderEnd >= 0) 0829 { 0830 std::optional<const char*> encoding = getEncodingFromTag(s.mid(xmlHeaderPos, xmlHeaderEnd - xmlHeaderPos), "encoding="); 0831 if(encoding.has_value()) 0832 return encoding; 0833 } 0834 } 0835 else // HTML 0836 { 0837 QtSizeType metaHeaderPos = s.indexOf("<meta"); 0838 while(metaHeaderPos >= 0) 0839 { 0840 QtSizeType metaHeaderEnd = s.indexOf(">", metaHeaderPos); 0841 if(metaHeaderEnd >= 0) 0842 { 0843 std::optional<const char*> encoding = getEncodingFromTag(s.mid(metaHeaderPos, metaHeaderEnd - metaHeaderPos), "charset="); 0844 if(encoding.has_value()) 0845 return encoding; 0846 0847 metaHeaderPos = s.indexOf("<meta", metaHeaderEnd); 0848 } 0849 else 0850 break; 0851 } 0852 } 0853 //Attempt to detect non-bom UTF8. This is a very common encoding. 0854 return detectUTF8(s); 0855 } 0856 0857 std::optional<const char*> SourceData::detectUTF8(const QByteArray& data) 0858 { 0859 QTextCodec* utf8 = QTextCodec::codecForName("UTF-8"); 0860 0861 QTextCodec::ConverterState state; 0862 utf8->toUnicode(data.constData(), SafeInt<qint32>(data.size()), &state); 0863 0864 if(state.invalidChars == 0) 0865 for (qint32 i = 0; i < data.size()-state.remainingChars; i++) 0866 if ((unsigned)data.at(i) > 127) 0867 return "UTF-8"; 0868 0869 return {}; 0870 }