File indexing completed on 2024-04-28 03:51:39

0001 /*
0002     This file is part of the KDE Baloo project.
0003     SPDX-FileCopyrightText: 2014-2015 Vishesh Handa <vhanda@kde.org>
0004 
0005     SPDX-License-Identifier: LGPL-2.1-or-later
0006 */
0007 
0008 #include "termgenerator.h"
0009 #include "document.h"
0010 
0011 #include <QTest>
0012 
0013 using namespace Baloo;
0014 
0015 #include <QObject>
0016 
0017 class Baloo::TermGeneratorTest : public QObject
0018 {
0019     Q_OBJECT
0020 
0021 private Q_SLOTS:
0022     void testWordBoundaries();
0023     void testWordBoundariesCJK();
0024     void testWordBoundariesCJKMixed();
0025     void testUnderscoreWord();
0026     void testUnderscore_splitting();
0027     void testAccentCharacters();
0028     void testUnicodeCompatibleComposition();
0029     void testUnicodeLowering();
0030     void testEmails();
0031     void testWordPositions();
0032     void testWordPositionsCJK();
0033     void testNumbers();
0034     void testControlCharacter();
0035     void testFilePaths();
0036     void testFilePaths_data();
0037     void testApostroph();
0038     void testApostroph_data();
0039 
0040     QList<QByteArray> allWords(const QString& str)
0041     {
0042         Document doc;
0043         TermGenerator termGen(doc);
0044         termGen.indexText(str);
0045 
0046         return doc.m_terms.keys();
0047     }
0048 };
0049 
0050 void TermGeneratorTest::testWordBoundaries()
0051 {
0052     QString str = QString::fromLatin1("The quick (\"brown\") 'fox' can't jump 32.3 feet, right? No-Wrong;xx.txt");
0053 
0054     QList<QByteArray> words = allWords(str);
0055 
0056     QList<QByteArray> expectedWords;
0057     expectedWords << QByteArray("32.3") << QByteArray("brown") << QByteArray("can't") << QByteArray("feet") << QByteArray("fox") << QByteArray("jump")
0058                   << QByteArray("no") << QByteArray("quick") << QByteArray("right") << QByteArray("the") << QByteArray("txt") << QByteArray("wrong")
0059                   << QByteArray("xx");
0060 
0061     QCOMPARE(words, expectedWords);
0062 }
0063 
0064 void TermGeneratorTest::testWordBoundariesCJK()
0065 {
0066     QString str = QString::fromUtf8("你");
0067 
0068     QList<QByteArray> words = allWords(str);
0069     QList<QByteArray> expectedWords;
0070     expectedWords << QByteArray("你");
0071 
0072     QCOMPARE(words, expectedWords);
0073 }
0074 
0075 void TermGeneratorTest::testWordBoundariesCJKMixed()
0076 {
0077     // This is a English and CJK mixed string.
0078     QString str = QString::fromUtf8("hello world!你好世界貴方元気켐ㅇㄹ?☺");
0079 
0080     QList<QByteArray> words = allWords(str);
0081     QList<QByteArray> expectedWords;
0082     expectedWords << QByteArray("hello") << QByteArray("world") << QByteArray("☺") << QByteArray("你好世界貴方元気") << QByteArray("켐ᄋᄅ");
0083 
0084     QCOMPARE(words, expectedWords);
0085 }
0086 
0087 void TermGeneratorTest::testUnderscoreWord()
0088 {
0089     QString str = QString::fromLatin1("_plant");
0090 
0091     QList<QByteArray> words = allWords(str);
0092 
0093     QList<QByteArray> expectedWords;
0094     expectedWords << QByteArray("plant");
0095 
0096     QCOMPARE(words, expectedWords);
0097 }
0098 
0099 void TermGeneratorTest::testUnderscore_splitting()
0100 {
0101     QString str = QString::fromLatin1("Hello_Howdy");
0102 
0103     QList<QByteArray> words = allWords(str);
0104 
0105     QList<QByteArray> expectedWords;
0106     expectedWords << QByteArray("hello") << QByteArray("howdy");
0107 
0108     QCOMPARE(words, expectedWords);
0109 }
0110 
0111 void TermGeneratorTest::testAccentCharacters()
0112 {
0113     QString str = QString::fromUtf8("Como est\xC3\xA1 K\xC3\xBBg"); // "Como está Kûg"
0114 
0115     QList<QByteArray> words = allWords(str);
0116 
0117     QList<QByteArray> expectedWords;
0118     expectedWords << QByteArray("como") << QByteArray("esta") << QByteArray("kug");
0119 
0120     QCOMPARE(words, expectedWords);
0121 }
0122 
0123 void TermGeneratorTest::testUnicodeCompatibleComposition()
0124 {
0125     // The 0xfb00 corresponds to U+FB00 which is a 'ff' ligature
0126     QString expected = QLatin1String("maffab");
0127     QString str = QLatin1String("ma") + QChar(0xfb00) + QStringLiteral("ab");
0128 
0129     QList<QByteArray> words = allWords(str);
0130     QCOMPARE(words.size(), 1);
0131 
0132     QByteArray output = words.first();
0133     QCOMPARE(expected.toUtf8(), output);
0134 }
0135 
0136 void TermGeneratorTest::testUnicodeLowering()
0137 {
0138     // This string is unicode mathematical italic "Hedge"
0139     QString str = QString::fromUtf8("\xF0\x9D\x90\xBB\xF0\x9D\x91\x92\xF0\x9D\x91\x91\xF0\x9D\x91\x94\xF0\x9D\x91\x92");
0140 
0141     QList<QByteArray> words = allWords(str);
0142 
0143     QCOMPARE(words, {QByteArray("hedge")});
0144 }
0145 
0146 void TermGeneratorTest::testEmails()
0147 {
0148     QString str = QString::fromLatin1("me@vhanda.in");
0149 
0150     QList<QByteArray> words = allWords(str);
0151 
0152     QList<QByteArray> expectedWords;
0153     expectedWords << QByteArray("in") << QByteArray("me") << QByteArray("vhanda");
0154 
0155     QCOMPARE(words, expectedWords);
0156 }
0157 
0158 void TermGeneratorTest::testWordPositions()
0159 {
0160     Document doc;
0161     TermGenerator termGen(doc);
0162 
0163     QString str = QString::fromLatin1("Hello hi how hi");
0164     termGen.indexText(str);
0165 
0166     QList<QByteArray> words = doc.m_terms.keys();
0167 
0168     QList<QByteArray> expectedWords;
0169     expectedWords << QByteArray("hello") << QByteArray("hi") << QByteArray("how");
0170     QCOMPARE(words, expectedWords);
0171 
0172     QVector<uint> posInfo1 = doc.m_terms.value("hello").positions;
0173     QCOMPARE(posInfo1, QVector<uint>() << 1);
0174 
0175     QVector<uint> posInfo2 = doc.m_terms.value("hi").positions;
0176     QCOMPARE(posInfo2, QVector<uint>() << 2 << 4);
0177 
0178     QVector<uint> posInfo3 = doc.m_terms.value("how").positions;
0179     QCOMPARE(posInfo3, QVector<uint>() << 3);
0180 }
0181 
0182 void TermGeneratorTest::testWordPositionsCJK()
0183 {
0184     Document doc;
0185     TermGenerator termGen(doc);
0186 
0187     // This is a Chinese sentence: Hello! I know about you.
0188     QString str = QString::fromUtf8("你好你好!我认识你。");
0189     termGen.indexText(str);
0190 
0191     QList<QByteArray> words = doc.m_terms.keys();
0192     QList<QByteArray> expectedWords;
0193     // Full width question mark is split point, and the fullwidth period is trimmed.
0194     expectedWords << QByteArray("你好你好") << QByteArray("我认识你");
0195     QCOMPARE(words, expectedWords);
0196 
0197     QVector<uint> posInfo1 = doc.m_terms.value("你好你好").positions;
0198     QCOMPARE(posInfo1, QVector<uint>() << 1);
0199 
0200     QVector<uint> posInfo2 = doc.m_terms.value("我认识你").positions;
0201     QCOMPARE(posInfo2, QVector<uint>() << 2);
0202 }
0203 
0204 void TermGeneratorTest::testNumbers()
0205 {
0206     QString str = QString::fromLatin1("1 5 10 -3 -12, 5.6, -13.4 -7e3");
0207 
0208     QList<QByteArray> words = allWords(str);
0209 
0210     QList<QByteArray> expectedWords;
0211     // TODO: Signs are dropped by the TermGenerator
0212     expectedWords = { "1", "10", "12", "13.4", "3", "5", "5.6", "7e3"};
0213     QCOMPARE(words, expectedWords);
0214 
0215     expectedWords = { "1", "10", "12", "-13.4", "-3", "5", "5.6", "-7e3"};
0216     QEXPECT_FAIL("", "signs not handled correctly", Continue);
0217     QCOMPARE(words, expectedWords);
0218 }
0219 
0220 void TermGeneratorTest::testControlCharacter()
0221 {
0222     QString str = QString::fromUtf8("word1\u0001word2");
0223 
0224     QList<QByteArray> words = allWords(str);
0225     QList<QByteArray> expectedWords = { "word1", "word2" };
0226 
0227     QCOMPARE(words, expectedWords);
0228 }
0229 
0230 void TermGeneratorTest::testFilePaths()
0231 {
0232     QFETCH(QString, input);
0233     QFETCH(QList<QByteArray>, expectedWords);
0234 
0235     auto words = allWords(input);
0236     QCOMPARE(words, expectedWords);
0237 }
0238 
0239 void TermGeneratorTest::testFilePaths_data()
0240 {
0241     QTest::addColumn<QString>("input");
0242     QTest::addColumn<QList<QByteArray>>("expectedWords");
0243 
0244     QTest::addRow("filename with suffix") << QStringLiteral("file.png")
0245         << QList<QByteArray>({"file", "png"});
0246     QTest::addRow("filename") << QStringLiteral("foo_bar.png")
0247         << QList<QByteArray>({"bar", "foo", "png"});
0248     QTest::addRow("filepath") << QStringLiteral("/foo/bar")
0249         << QList<QByteArray>({"bar", "foo"});
0250 }
0251 
0252 void TermGeneratorTest::testApostroph()
0253 {
0254     QFETCH(QString, input);
0255     QFETCH(QList<QByteArray>, expectedWords);
0256 
0257     auto words = allWords(input);
0258     QCOMPARE(words, expectedWords);
0259 }
0260 
0261 void TermGeneratorTest::testApostroph_data()
0262 {
0263     QTest::addColumn<QString>("input");
0264     QTest::addColumn<QList<QByteArray>>("expectedWords");
0265 
0266     QTest::addRow("Leading") << QStringLiteral("'one two")
0267         << QList<QByteArray>({"one", "two"});
0268     QTest::addRow("Middle") << QStringLiteral("one'two three")
0269         << QList<QByteArray>({"one'two", "three"});
0270     QTest::addRow("End") << QStringLiteral("one' two")
0271         << QList<QByteArray>({"one", "two"});
0272 }
0273 
0274 QTEST_MAIN(TermGeneratorTest)
0275 
0276 #include "termgeneratortest.moc"