File indexing completed on 2024-05-19 07:40:32
0001 /* 0002 This file is part of the KDE Baloo project. 0003 SPDX-FileCopyrightText: 2014-2015 Vishesh Handa <vhanda@kde.org> 0004 0005 SPDX-License-Identifier: LGPL-2.1-or-later 0006 */ 0007 0008 #include "termgenerator.h" 0009 #include "document.h" 0010 0011 #include <QTest> 0012 0013 using namespace Baloo; 0014 0015 #include <QObject> 0016 0017 class Baloo::TermGeneratorTest : public QObject 0018 { 0019 Q_OBJECT 0020 0021 private Q_SLOTS: 0022 void testWordBoundaries(); 0023 void testWordBoundariesCJK(); 0024 void testWordBoundariesCJKMixed(); 0025 void testUnderscoreWord(); 0026 void testUnderscore_splitting(); 0027 void testAccentCharacters(); 0028 void testUnicodeCompatibleComposition(); 0029 void testUnicodeLowering(); 0030 void testEmails(); 0031 void testWordPositions(); 0032 void testWordPositionsCJK(); 0033 void testNumbers(); 0034 void testControlCharacter(); 0035 void testFilePaths(); 0036 void testFilePaths_data(); 0037 void testApostroph(); 0038 void testApostroph_data(); 0039 0040 QList<QByteArray> allWords(const QString& str) 0041 { 0042 Document doc; 0043 TermGenerator termGen(doc); 0044 termGen.indexText(str); 0045 0046 return doc.m_terms.keys(); 0047 } 0048 }; 0049 0050 void TermGeneratorTest::testWordBoundaries() 0051 { 0052 QString str = QString::fromLatin1("The quick (\"brown\") 'fox' can't jump 32.3 feet, right? No-Wrong;xx.txt"); 0053 0054 QList<QByteArray> words = allWords(str); 0055 0056 QList<QByteArray> expectedWords; 0057 expectedWords << QByteArray("32.3") << QByteArray("brown") << QByteArray("can't") << QByteArray("feet") << QByteArray("fox") << QByteArray("jump") 0058 << QByteArray("no") << QByteArray("quick") << QByteArray("right") << QByteArray("the") << QByteArray("txt") << QByteArray("wrong") 0059 << QByteArray("xx"); 0060 0061 QCOMPARE(words, expectedWords); 0062 } 0063 0064 void TermGeneratorTest::testWordBoundariesCJK() 0065 { 0066 QString str = QString::fromUtf8("你"); 0067 0068 QList<QByteArray> words = allWords(str); 0069 QList<QByteArray> expectedWords; 0070 expectedWords << QByteArray("你"); 0071 0072 QCOMPARE(words, expectedWords); 0073 } 0074 0075 void TermGeneratorTest::testWordBoundariesCJKMixed() 0076 { 0077 // This is a English and CJK mixed string. 0078 QString str = QString::fromUtf8("hello world!你好世界貴方元気켐ㅇㄹ?☺"); 0079 0080 QList<QByteArray> words = allWords(str); 0081 QList<QByteArray> expectedWords; 0082 expectedWords << QByteArray("hello") << QByteArray("world") << QByteArray("☺") << QByteArray("你好世界貴方元気") << QByteArray("켐ᄋᄅ"); 0083 0084 QCOMPARE(words, expectedWords); 0085 } 0086 0087 void TermGeneratorTest::testUnderscoreWord() 0088 { 0089 QString str = QString::fromLatin1("_plant"); 0090 0091 QList<QByteArray> words = allWords(str); 0092 0093 QList<QByteArray> expectedWords; 0094 expectedWords << QByteArray("plant"); 0095 0096 QCOMPARE(words, expectedWords); 0097 } 0098 0099 void TermGeneratorTest::testUnderscore_splitting() 0100 { 0101 QString str = QString::fromLatin1("Hello_Howdy"); 0102 0103 QList<QByteArray> words = allWords(str); 0104 0105 QList<QByteArray> expectedWords; 0106 expectedWords << QByteArray("hello") << QByteArray("howdy"); 0107 0108 QCOMPARE(words, expectedWords); 0109 } 0110 0111 void TermGeneratorTest::testAccentCharacters() 0112 { 0113 QString str = QString::fromUtf8("Como est\xC3\xA1 K\xC3\xBBg"); // "Como está Kûg" 0114 0115 QList<QByteArray> words = allWords(str); 0116 0117 QList<QByteArray> expectedWords; 0118 expectedWords << QByteArray("como") << QByteArray("esta") << QByteArray("kug"); 0119 0120 QCOMPARE(words, expectedWords); 0121 } 0122 0123 void TermGeneratorTest::testUnicodeCompatibleComposition() 0124 { 0125 // The 0xfb00 corresponds to U+FB00 which is a 'ff' ligature 0126 QString expected = QLatin1String("maffab"); 0127 QString str = QLatin1String("ma") + QChar(0xfb00) + QStringLiteral("ab"); 0128 0129 QList<QByteArray> words = allWords(str); 0130 QCOMPARE(words.size(), 1); 0131 0132 QByteArray output = words.first(); 0133 QCOMPARE(expected.toUtf8(), output); 0134 } 0135 0136 void TermGeneratorTest::testUnicodeLowering() 0137 { 0138 // This string is unicode mathematical italic "Hedge" 0139 QString str = QString::fromUtf8("\xF0\x9D\x90\xBB\xF0\x9D\x91\x92\xF0\x9D\x91\x91\xF0\x9D\x91\x94\xF0\x9D\x91\x92"); 0140 0141 QList<QByteArray> words = allWords(str); 0142 0143 QCOMPARE(words, {QByteArray("hedge")}); 0144 } 0145 0146 void TermGeneratorTest::testEmails() 0147 { 0148 QString str = QString::fromLatin1("me@vhanda.in"); 0149 0150 QList<QByteArray> words = allWords(str); 0151 0152 QList<QByteArray> expectedWords; 0153 expectedWords << QByteArray("in") << QByteArray("me") << QByteArray("vhanda"); 0154 0155 QCOMPARE(words, expectedWords); 0156 } 0157 0158 void TermGeneratorTest::testWordPositions() 0159 { 0160 Document doc; 0161 TermGenerator termGen(doc); 0162 0163 QString str = QString::fromLatin1("Hello hi how hi"); 0164 termGen.indexText(str); 0165 0166 QList<QByteArray> words = doc.m_terms.keys(); 0167 0168 QList<QByteArray> expectedWords; 0169 expectedWords << QByteArray("hello") << QByteArray("hi") << QByteArray("how"); 0170 QCOMPARE(words, expectedWords); 0171 0172 QVector<uint> posInfo1 = doc.m_terms.value("hello").positions; 0173 QCOMPARE(posInfo1, QVector<uint>() << 1); 0174 0175 QVector<uint> posInfo2 = doc.m_terms.value("hi").positions; 0176 QCOMPARE(posInfo2, QVector<uint>() << 2 << 4); 0177 0178 QVector<uint> posInfo3 = doc.m_terms.value("how").positions; 0179 QCOMPARE(posInfo3, QVector<uint>() << 3); 0180 } 0181 0182 void TermGeneratorTest::testWordPositionsCJK() 0183 { 0184 Document doc; 0185 TermGenerator termGen(doc); 0186 0187 // This is a Chinese sentence: Hello! I know about you. 0188 QString str = QString::fromUtf8("你好你好!我认识你。"); 0189 termGen.indexText(str); 0190 0191 QList<QByteArray> words = doc.m_terms.keys(); 0192 QList<QByteArray> expectedWords; 0193 // Full width question mark is split point, and the fullwidth period is trimmed. 0194 expectedWords << QByteArray("你好你好") << QByteArray("我认识你"); 0195 QCOMPARE(words, expectedWords); 0196 0197 QVector<uint> posInfo1 = doc.m_terms.value("你好你好").positions; 0198 QCOMPARE(posInfo1, QVector<uint>() << 1); 0199 0200 QVector<uint> posInfo2 = doc.m_terms.value("我认识你").positions; 0201 QCOMPARE(posInfo2, QVector<uint>() << 2); 0202 } 0203 0204 void TermGeneratorTest::testNumbers() 0205 { 0206 QString str = QString::fromLatin1("1 5 10 -3 -12, 5.6, -13.4 -7e3"); 0207 0208 QList<QByteArray> words = allWords(str); 0209 0210 QList<QByteArray> expectedWords; 0211 // TODO: Signs are dropped by the TermGenerator 0212 expectedWords = { "1", "10", "12", "13.4", "3", "5", "5.6", "7e3"}; 0213 QCOMPARE(words, expectedWords); 0214 0215 expectedWords = { "1", "10", "12", "-13.4", "-3", "5", "5.6", "-7e3"}; 0216 QEXPECT_FAIL("", "signs not handled correctly", Continue); 0217 QCOMPARE(words, expectedWords); 0218 } 0219 0220 void TermGeneratorTest::testControlCharacter() 0221 { 0222 QString str = QString::fromUtf8("word1\u0001word2"); 0223 0224 QList<QByteArray> words = allWords(str); 0225 QList<QByteArray> expectedWords = { "word1", "word2" }; 0226 0227 QCOMPARE(words, expectedWords); 0228 } 0229 0230 void TermGeneratorTest::testFilePaths() 0231 { 0232 QFETCH(QString, input); 0233 QFETCH(QList<QByteArray>, expectedWords); 0234 0235 auto words = allWords(input); 0236 QCOMPARE(words, expectedWords); 0237 } 0238 0239 void TermGeneratorTest::testFilePaths_data() 0240 { 0241 QTest::addColumn<QString>("input"); 0242 QTest::addColumn<QList<QByteArray>>("expectedWords"); 0243 0244 QTest::addRow("filename with suffix") << QStringLiteral("file.png") 0245 << QList<QByteArray>({"file", "png"}); 0246 QTest::addRow("filename") << QStringLiteral("foo_bar.png") 0247 << QList<QByteArray>({"bar", "foo", "png"}); 0248 QTest::addRow("filepath") << QStringLiteral("/foo/bar") 0249 << QList<QByteArray>({"bar", "foo"}); 0250 } 0251 0252 void TermGeneratorTest::testApostroph() 0253 { 0254 QFETCH(QString, input); 0255 QFETCH(QList<QByteArray>, expectedWords); 0256 0257 auto words = allWords(input); 0258 QCOMPARE(words, expectedWords); 0259 } 0260 0261 void TermGeneratorTest::testApostroph_data() 0262 { 0263 QTest::addColumn<QString>("input"); 0264 QTest::addColumn<QList<QByteArray>>("expectedWords"); 0265 0266 QTest::addRow("Leading") << QStringLiteral("'one two") 0267 << QList<QByteArray>({"one", "two"}); 0268 QTest::addRow("Middle") << QStringLiteral("one'two three") 0269 << QList<QByteArray>({"one'two", "three"}); 0270 QTest::addRow("End") << QStringLiteral("one' two") 0271 << QList<QByteArray>({"one", "two"}); 0272 } 0273 0274 QTEST_MAIN(TermGeneratorTest) 0275 0276 #include "termgeneratortest.moc"