File indexing completed on 2024-09-15 12:05:28

0001 /**
0002  * test_filter.cpp
0003  *
0004  * SPDX-FileCopyrightText: 2004 Zack Rusin <zack@kde.org>
0005  *
0006  * SPDX-License-Identifier: LGPL-2.1-or-later
0007  */
0008 
0009 #include "test_filter.h"
0010 #include "tokenizer_p.h"
0011 
0012 #include <QTest>
0013 #include <QVector>
0014 
0015 QTEST_GUILESS_MAIN(SonnetFilterTest)
0016 
0017 using namespace Sonnet;
0018 
0019 struct Hit {
0020     Hit(const QString &w, int s)
0021         : word(w)
0022         , start(s)
0023     {
0024     }
0025     QString word;
0026     int start;
0027 };
0028 
0029 void SonnetFilterTest::testLatin()
0030 {
0031     QString buffer(QStringLiteral("  This is     a sample thing. Please test me ...     He's don't Le'Clerk."));
0032     QList<Hit> hits;
0033     hits.append(Hit(QStringLiteral("This"), 2));
0034     hits.append(Hit(QStringLiteral("is"), 7));
0035     hits.append(Hit(QStringLiteral("a"), 14));
0036     hits.append(Hit(QStringLiteral("sample"), 16));
0037     hits.append(Hit(QStringLiteral("thing"), 23));
0038     hits.append(Hit(QStringLiteral("Please"), 30));
0039     hits.append(Hit(QStringLiteral("test"), 37));
0040     hits.append(Hit(QStringLiteral("me"), 42));
0041     hits.append(Hit(QStringLiteral("He's"), 53));
0042     hits.append(Hit(QStringLiteral("don't"), 58));
0043     hits.append(Hit(QStringLiteral("Le'Clerk"), 64));
0044 
0045     WordTokenizer tokenizer;
0046     tokenizer.setBuffer(buffer);
0047 
0048     Token w;
0049     int hitNumber = 0;
0050     while (tokenizer.hasNext()) {
0051         w = tokenizer.next();
0052         QCOMPARE(w.toString(), hits[hitNumber].word);
0053         QCOMPARE(w.position(), hits[hitNumber].start);
0054         ++hitNumber;
0055     }
0056     QCOMPARE(hitNumber, hits.count());
0057 }
0058 
0059 static QVector<ushort> convertToUnicode(const QString &str)
0060 {
0061     QVector<ushort> unicode;
0062     for (int i = 0; i < str.length(); ++i) {
0063         unicode += str[i].unicode();
0064     }
0065     return unicode;
0066 }
0067 
0068 void SonnetFilterTest::testIndic()
0069 {
0070     QString buffer;
0071     QList<Hit> hits;
0072     hits.append(Hit(QString::fromUtf8("मराठी"), 0));
0073     hits.append(Hit(QString::fromUtf8("भाषा"), 6));
0074     hits.append(Hit(QString::fromUtf8("महाराष्ट्र"), 11));
0075     hits.append(Hit(QString::fromUtf8("व"), 22));
0076     hits.append(Hit(QString::fromUtf8("गोवा"), 24));
0077     hits.append(Hit(QString::fromUtf8("राज्याची"), 29));
0078     hits.append(Hit(QString::fromUtf8("राजभाषा"), 38));
0079     hits.append(Hit(QString::fromUtf8("असून"), 46));
0080     hits.append(Hit(QString::fromUtf8("सुमारे"), 51));
0081     hits.append(Hit(QString::fromUtf8("९"), 58)); // This is the number 9, so we don't spell-check it
0082     hits.append(Hit(QString::fromUtf8("कोटी"), 60));
0083     hits.append(Hit(QString::fromUtf8("लोकांची"), 65));
0084     hits.append(Hit(QString::fromUtf8("मातृभाषा"), 73));
0085     hits.append(Hit(QString::fromUtf8("आहे"), 82));
0086     hits.append(Hit(QString::fromUtf8("मराठी"), 87));
0087     hits.append(Hit(QString::fromUtf8("भाषा"), 93));
0088     hits.append(Hit(QString::fromUtf8("कमीत"), 98));
0089     hits.append(Hit(QString::fromUtf8("कमी"), 103));
0090     hits.append(Hit(QString::fromUtf8("१०००"), 107)); // just a number
0091     hits.append(Hit(QString::fromUtf8("वर्षापासून"), 112));
0092     hits.append(Hit(QString::fromUtf8("अस्तित्वात"), 123));
0093     hits.append(Hit(QString::fromUtf8("आहे"), 134));
0094 
0095     buffer = QString::fromUtf8("मराठी भाषा महाराष्ट्र व गोवा राज्याची राजभाषा असून सुमारे ९ कोटी लोकांची मातृभाषा आहे. मराठी भाषा कमीत कमी १००० वर्षापासून अस्तित्वात आहे.");
0096 
0097     WordTokenizer tokenizer;
0098     tokenizer.setBuffer(buffer);
0099     Token w;
0100     int hitNumber = 0;
0101     while (tokenizer.hasNext()) {
0102         w = tokenizer.next();
0103         QVector<ushort> unicode = convertToUnicode(w.toString());
0104         QCOMPARE(w.toString(), hits[hitNumber].word);
0105         QCOMPARE(w.position(), hits[hitNumber].start);
0106         ++hitNumber;
0107     }
0108     QCOMPARE(hitNumber, hits.count());
0109 }
0110 
0111 void SonnetFilterTest::testSentence()
0112 {
0113     QString buffer(QStringLiteral("This is     a sample thing. Please test me ...     He's don't Le'Clerk."));
0114     QList<Hit> hits;
0115     hits.append(Hit(QStringLiteral("This is     a sample thing. "), 0));
0116     hits.append(Hit(QStringLiteral("Please test me ...     "), 28));
0117     hits.append(Hit(QStringLiteral("He's don't Le'Clerk."), 51));
0118 
0119     SentenceTokenizer tokenizer;
0120     tokenizer.setBuffer(buffer);
0121 
0122     Token w;
0123     int hitNumber = 0;
0124     while (tokenizer.hasNext()) {
0125         w = tokenizer.next();
0126         QCOMPARE(w.toString(), hits[hitNumber].word);
0127         QCOMPARE(w.position(), hits[hitNumber].start);
0128         ++hitNumber;
0129     }
0130     QCOMPARE(hitNumber, hits.count());
0131 }
0132 
0133 #include "moc_test_filter.cpp"