File indexing completed on 2024-04-21 03:57:11

0001 /*
0002     This file is part of the Kate project.
0003     SPDX-FileCopyrightText: 2021 Jan Paul Batrina <jpmbatrina01@gmail.com>
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #include "encodingtest.h"
0008 #include "katetextbuffer.h"
0009 
0010 QTEST_MAIN(KateEncodingTest)
0011 
0012 void KateEncodingTest::utfBomTest()
0013 {
0014     // setup stuff
0015     Kate::TextBuffer buffer(nullptr);
0016     buffer.setFallbackTextCodec(QStringLiteral("UTF-8"));
0017     bool encodingErrors;
0018     bool tooLongLinesWrapped;
0019     bool success;
0020     int longestLineLoaded;
0021     QString prefixText;
0022 
0023     // utf8 tests
0024     buffer.setTextCodec(QStringLiteral("UTF-8"));
0025     success = buffer.load(QLatin1String(TEST_DATA_DIR "encoding/utf8.txt"), encodingErrors, tooLongLinesWrapped, longestLineLoaded, true);
0026     QVERIFY(success && !encodingErrors);
0027     QVERIFY(!buffer.generateByteOrderMark()); // file has no bom
0028     // since the utf8 bom is 3 bytes, the first 3 chars should not be the bom
0029     prefixText = buffer.text().left(3);
0030     QCOMPARE(prefixText, QLatin1String("Tes"));
0031 
0032     buffer.setTextCodec(QStringLiteral("UTF-8"));
0033     success = buffer.load(QLatin1String(TEST_DATA_DIR "encoding/utf8-bom-only.txt"), encodingErrors, tooLongLinesWrapped, longestLineLoaded, true);
0034     QVERIFY(success && !encodingErrors);
0035     QVERIFY(buffer.generateByteOrderMark());
0036     // aside from bom, file is empty so there should be no text
0037     prefixText = buffer.text();
0038     QVERIFY(prefixText.isEmpty());
0039 
0040     // utf16 tests, we need to allow encoding detection as if the byteorder is not ok, QStringDecoder will not show errors
0041     buffer.setTextCodec(QStringLiteral("UTF-16"));
0042     success = buffer.load(QLatin1String(TEST_DATA_DIR "encoding/utf16.txt"), encodingErrors, tooLongLinesWrapped, longestLineLoaded, false);
0043     QVERIFY(success && !encodingErrors);
0044     QVERIFY(buffer.generateByteOrderMark());
0045     // since the utf16 bom is 2 bytes, the first 2 chars should not be the bom
0046     prefixText = buffer.text().left(2);
0047     QCOMPARE(prefixText, QLatin1String("Te"));
0048 
0049     buffer.setTextCodec(QStringLiteral("UTF-16"));
0050     success = buffer.load(QLatin1String(TEST_DATA_DIR "encoding/utf16be.txt"), encodingErrors, tooLongLinesWrapped, longestLineLoaded, false);
0051     QVERIFY(success && !encodingErrors);
0052     QVERIFY(buffer.generateByteOrderMark());
0053     // since the utf16 bom is 2 bytes, the first 2 chars should not be the bom
0054     prefixText = buffer.text().left(2);
0055     QCOMPARE(prefixText, QLatin1String("Te"));
0056 
0057     // utf32 tests
0058     buffer.setTextCodec(QStringLiteral("UTF-32"));
0059     success = buffer.load(QLatin1String(TEST_DATA_DIR "encoding/utf32.txt"), encodingErrors, tooLongLinesWrapped, longestLineLoaded, true);
0060     QVERIFY(success && !encodingErrors);
0061     QVERIFY(buffer.generateByteOrderMark());
0062     // since the utf16 bom is 4 bytes, the first 4 chars should not be the bom
0063     prefixText = buffer.text().left(4);
0064     QCOMPARE(prefixText, QLatin1String("Test"));
0065 
0066     buffer.setTextCodec(QStringLiteral("UTF-32"));
0067     success = buffer.load(QLatin1String(TEST_DATA_DIR "encoding/utf32be.txt"), encodingErrors, tooLongLinesWrapped, longestLineLoaded, true);
0068     QVERIFY(success && !encodingErrors);
0069     QVERIFY(buffer.generateByteOrderMark());
0070     // since the utf16 bom is 4 bytes, the first 4 chars should not be the bom
0071     prefixText = buffer.text().left(4);
0072     QCOMPARE(prefixText, QLatin1String("Test"));
0073 
0074     // Ensure that a mismatching bom is not processed (e.g. utf8 bom should not be used for utf16)
0075     buffer.setTextCodec(QStringLiteral("UTF-16"));
0076     success = buffer.load(QLatin1String(TEST_DATA_DIR "encoding/utf8-bom-only.txt"), encodingErrors, tooLongLinesWrapped, longestLineLoaded, true);
0077     QVERIFY(success && !encodingErrors);
0078     // even though the file does not have a bom, Kate::TextBuffer::setTextCodec always enables bom generation
0079     // for utf16 and utf32 since the byte order is useful and relevant for reading the file
0080     QVERIFY(buffer.generateByteOrderMark());
0081     prefixText = buffer.text();
0082     // 0xFFBBEF is processed as a single char 0xBBEF, which is a hangul character
0083     QCOMPARE(prefixText.front(), QChar(0xBBEF));
0084 }
0085 
0086 void KateEncodingTest::nonUtfNoBomTest()
0087 {
0088     // setup stuff
0089     Kate::TextBuffer buffer(nullptr);
0090     buffer.setFallbackTextCodec(QStringLiteral("UTF-8"));
0091     bool encodingErrors;
0092     bool tooLongLinesWrapped;
0093     bool success;
0094     int longestLineLoaded;
0095     QString prefixText;
0096 
0097     // latin15, should not contain any bom
0098     buffer.setTextCodec(QString::fromUtf8(QStringConverter::nameForEncoding(QStringConverter::Latin1)));
0099     success = buffer.load(QLatin1String(TEST_DATA_DIR "encoding/latin15.txt"), encodingErrors, tooLongLinesWrapped, longestLineLoaded, true);
0100     QVERIFY(success && !encodingErrors);
0101     QVERIFY(!buffer.generateByteOrderMark());
0102     prefixText = buffer.text().left(4);
0103     QCOMPARE(prefixText, QLatin1String("Test"));
0104 
0105     // Even if a bom is somehow found, it should be processed normally as text for non-UTF char sets
0106     buffer.setTextCodec(QString::fromUtf8(QStringConverter::nameForEncoding(QStringConverter::Latin1)));
0107     success = buffer.load(QLatin1String(TEST_DATA_DIR "encoding/latin15-with-utf8-bom.txt"), encodingErrors, tooLongLinesWrapped, longestLineLoaded, true);
0108     QVERIFY(success && !encodingErrors);
0109     QVERIFY(!buffer.generateByteOrderMark()); // utf8 bom shouldn't be processed
0110     // the utf8 bom is 0xEFBBBF, which is "" in Latin15
0111     prefixText = buffer.text().left(3);
0112     QCOMPARE(prefixText, QStringLiteral(""));
0113 }
0114 
0115 #include "moc_encodingtest.cpp"