/*  -*- c++ -*-
    tests/lexertest.cpp

    This file is part of the testsuite of KSieve,
    the KDE internet mail/usenet news message filtering library.
    SPDX-FileCopyrightText: 2003 Marc Mutz <mutz@kde.org>

    SPDX-License-Identifier: GPL-2.0-only
*/

#include "lexer.h"
using KSieve::Lexer;

#include "error.h"
using KSieve::Error;

#include <QString>

#include <cstdlib>
#include <iostream>

using std::cerr;
using std::cout;
using std::endl;

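// Return the name of a Lexer::Token value for use in diagnostic output.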
static const char *token2string(Lexer::Token t)
{
    switch (t) {
#define CASE(x) \
    case Lexer::x: \
        return #x
        CASE(None);
        CASE(HashComment);
        CASE(BracketComment);
        CASE(Identifier);
        CASE(Tag);
        CASE(Number);
        CASE(MultiLineString);
        CASE(QuotedString);
        CASE(Special);
        CASE(LineFeeds);
    }
    return "";
#undef CASE
}

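// A single lexer test case: an input script, the sequence of tokens (with
// their string results) the lexer is expected to produce, terminated by a
// { None, nullptr } sentinel, and the expected error with its reported
// (line, column) position (Error::None if lexing should succeed).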
struct TestCase {
    const char *name;
    const char *string;
    struct {
        Lexer::Token token;
        const char *result;
    } expected[16]; // end with { None, nullptr }
    Error::Type expectedError;
    int errorLine, errorCol;
};

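// The test cases, grouped by token kind; the later groups exercise the error
// paths and check the reported error type and position.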
static const TestCase testcases[] = {
    //
    // Whitespace:
    //

    {"Null script", nullptr, {{Lexer::None, nullptr}}, Error::None, 0, 0},

    {"Empty script", "", {{Lexer::None, nullptr}}, Error::None, 0, 0},

    {"Whitespace-only script", " \t\n\t \n", {{Lexer::LineFeeds, "2"}, {Lexer::None, nullptr}}, Error::None, 0, 0},

    {"Lone CR", "\r", {{Lexer::None, nullptr}}, Error::CRWithoutLF, 0, 1},

    {"CR+Space", "\r ", {{Lexer::None, nullptr}}, Error::CRWithoutLF, 0, 1},

    {"CRLF alone", "\r\n", {{Lexer::LineFeeds, "1"}, {Lexer::None, nullptr}}, Error::None, 0, 0},

    //
    // hash comments:
    //

    {"Basic hash comment (no newline)", "#comment", {{Lexer::HashComment, "comment"}, {Lexer::None, nullptr}}, Error::None, 0, 0},

    {"Basic hash comment (LF)", "#comment\n", {{Lexer::HashComment, "comment"}, {Lexer::None, nullptr}}, Error::None, 0, 0},

    {"Basic hash comment (CRLF)", "#comment\r\n", {{Lexer::HashComment, "comment"}, {Lexer::None, nullptr}}, Error::None, 0, 0},

    {"Basic hash comment (CR)", "#comment\r", {{Lexer::HashComment, nullptr}}, Error::CRWithoutLF, 0, 9},

    {"Non-UTF-8 in hash comment", "#\xA9 copyright", {{Lexer::HashComment, nullptr}}, Error::InvalidUTF8, 0, 12},

    //
    // bracket comments:
    //

    {"Basic bracket comment", "/* comment */", {{Lexer::BracketComment, " comment "}, {Lexer::None, nullptr}}, Error::None, 0, 0},

    {"Basic bracket comment - missing trailing slash", "/* comment *", {{Lexer::BracketComment, nullptr}}, Error::UnfinishedBracketComment, 0, 0},

    {"Basic bracket comment - missing trailing asterisk + slash", "/* comment ", {{Lexer::BracketComment, nullptr}}, Error::UnfinishedBracketComment, 0, 0},

    {"Basic bracket comment - missing leading slash", "* comment */", {{Lexer::None, nullptr}}, Error::IllegalCharacter, 0, 0},

    {"Basic bracket comment - missing leading asterisk + slash",
     "comment */",
     {{Lexer::Identifier, "comment"}, {Lexer::None, nullptr}},
     Error::IllegalCharacter,
     0,
     8},

    {"Basic multiline bracket comment (LF)",
     "/* comment\ncomment */",
     {{Lexer::BracketComment, " comment\ncomment "}, {Lexer::None, nullptr}},
     Error::None,
     0,
     0},

    {"Basic multiline bracket comment (CRLF)",
     "/* comment\r\ncomment */",
     {{Lexer::BracketComment, " comment\ncomment "}, {Lexer::None, nullptr}},
     Error::None,
     0,
     0},

    {"Basic multiline bracket comment (CR)", "/* comment\rcomment */", {{Lexer::BracketComment, nullptr}}, Error::CRWithoutLF, 0, 11},

    {"Non-UTF-8 in bracket comment", "/*\xA9 copyright*/", {{Lexer::BracketComment, nullptr}}, Error::InvalidUTF8, 0, 14},

    //
    // numbers:
    //
    {"Basic number 1", "1", {{Lexer::Number, "1"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Basic number 01", "01", {{Lexer::Number, "01"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Qualified number 1k", "1k", {{Lexer::Number, "1k"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Qualified number 1M", "1M", {{Lexer::Number, "1M"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Qualified number 1G", "1G", {{Lexer::Number, "1G"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    //
    // identifiers:
    //
    {"Basic identifier \"id\"", "id", {{Lexer::Identifier, "id"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Basic identifier \"_id\"", "_id", {{Lexer::Identifier, "_id"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    //
    // tags:
    //
    {"Basic tag \":tag\"", ":tag", {{Lexer::Tag, "tag"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Basic tag \":_tag\"", ":_tag", {{Lexer::Tag, "_tag"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    //
    // specials:
    //
    {"Basic special \"{}[]();,\"",
     "{}[]();,",
     {{Lexer::Special, "{"},
      {Lexer::Special, "}"},
      {Lexer::Special, "["},
      {Lexer::Special, "]"},
      {Lexer::Special, "("},
      {Lexer::Special, ")"},
      {Lexer::Special, ";"},
      {Lexer::Special, ","},
      {Lexer::None, nullptr}},
     Error::None,
     0,
     0},
    //
    // quoted-string:
    //
    {"Basic quoted string \"foo\"", "\"foo\"", {{Lexer::QuotedString, "foo"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Basic quoted string, UTF-8",
     "\"foo\xC3\xB1"
     "foo\"", // fooñfoo
     {{Lexer::QuotedString,
       "foo\xC3\xB1"
       "foo"},
      {Lexer::None, nullptr}},
     Error::None,
     0,
     0},
    {"Quoted string, escaped '\"'", R"("foo\"bar")", {{Lexer::QuotedString, "foo\"bar"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Quoted string, escaped '\\'", R"("foo\\bar")", {{Lexer::QuotedString, "foo\\bar"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Quoted string, excessive escapes", R"("\fo\o")", {{Lexer::QuotedString, "foo"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Quoted string across lines (LF)", "\"foo\nbar\"", {{Lexer::QuotedString, "foo\nbar"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Quoted string across lines (CRLF)", "\"foo\r\nbar\"", {{Lexer::QuotedString, "foo\nbar"}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    //
    // multiline strings:
    //
    {"Basic multiline string I (LF)", "text:\nfoo\n.", {{Lexer::MultiLineString, "foo" /* "foo\n" ? */}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Basic multiline string I (CRLF)", "text:\r\nfoo\r\n.", {{Lexer::MultiLineString, "foo" /* "foo\n" ? */}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Basic multiline string II (LF)", "text:\nfoo\n.\n", {{Lexer::MultiLineString, "foo" /* "foo\n" ? */}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Basic multiline string II (CRLF)", "text:\r\nfoo\r\n.\r\n", {{Lexer::MultiLineString, "foo" /* "foo\n" ? */}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Dotstuffed multiline string (LF)", "text:\n..foo\n.", {{Lexer::MultiLineString, ".foo" /* ".foo\n" ? */}, {Lexer::None, nullptr}}, Error::None, 0, 0},
    {"Dotstuffed multiline string (CRLF)",
     "text:\r\n..foo\r\n.",
     {{Lexer::MultiLineString, ".foo" /* ".foo\n" ? */}, {Lexer::None, nullptr}},
     Error::None,
     0,
     0},
    {"Incompletely dotstuffed multiline string (LF)",
     "text:\n.foo\n.",
     {{Lexer::MultiLineString, ".foo" /* ".foo\n" ? */}, {Lexer::None, nullptr}},
     Error::None,
     0,
     0},
    {"Incompletely dotstuffed multiline string (CRLF)",
     "text:\r\n.foo\r\n.",
     {{Lexer::MultiLineString, ".foo" /* ".foo\n" ? */}, {Lexer::None, nullptr}},
     Error::None,
     0,
     0},
    {"Multiline with a line with only one '.'",
     "text:\r\nfoo\r\n..\r\nbar\r\n.",
     {{Lexer::MultiLineString, "foo\n.\nbar"}, {Lexer::None, nullptr}},
     Error::None,
     0,
     0},

    //
    // Errors in single tokens:
    //

    //
    // numbers:
    //
    {"Number, unknown qualifier", "100f", {{Lexer::Number, "100"}}, Error::UnexpectedCharacter, 0, 3},
    {"Negative number", "-100", {{Lexer::None, nullptr}}, Error::IllegalCharacter, 0, 0},
    //
    // identifiers:
    //
    {"Identifier, leading digits", "0id", {{Lexer::Number, "0"}}, Error::UnexpectedCharacter, 0, 1},
    {"Identifier, embedded umlaut", "idäid", {{Lexer::Identifier, "id"}}, Error::IllegalCharacter, 0, 2},
    //
    // tags:
    //
    {"Lone ':' (at end)", ":", {{Lexer::Tag, nullptr}}, Error::UnexpectedCharacter, 0, 0},
    {"Lone ':' (in stream)", ": ", {{Lexer::Tag, nullptr}}, Error::UnexpectedCharacter, 0, 1},
    {"Tag, leading digits", ":0tag", {{Lexer::Tag, nullptr}}, Error::NoLeadingDigits, 0, 1},
    {"Tag, embedded umlaut", ":tagätag", {{Lexer::Tag, "tag"}}, Error::IllegalCharacter, 0, 4},
    //
    // specials: (none)
    // quoted string:
    //
    {"Premature end of quoted string", "\"foo", {{Lexer::QuotedString, "foo"}}, Error::PrematureEndOfQuotedString, 0, 0},
    {"Invalid UTF-8 in quoted string",
     "\"foo\xC0\xA0"
     "foo\"",
     {{Lexer::QuotedString, "foo"}},
     Error::InvalidUTF8,
     0,
     4},

    //
    // Whitespace / token separation: valid
    //

    {"Two identifiers with linebreaks",
     "foo\nbar\n",
     {{Lexer::Identifier, "foo"}, {Lexer::LineFeeds, "1"}, {Lexer::Identifier, "bar"}, {Lexer::LineFeeds, "1"}, {Lexer::None, nullptr}},
     Error::None,
     0,
     0},

    //
    // Whitespace / token separation: invalid
    //
};

static const int numTestCases = sizeof testcases / sizeof *testcases;

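// With no arguments, run the table-driven test cases above; with exactly one
// argument, tokenize that string and print the tokens (manual mode).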
int main(int argc, char *argv[])
{
    if (argc == 2) { // manual test
        const char *scursor = argv[1];
        const char *const send = argv[1] + qstrlen(argv[1]);

        Lexer lexer(scursor, send);

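        // Tokenize the whole input, printing each token and the lexer's
        // reported position, and stop at the first error.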
        cout << "Begin" << endl;
        while (!lexer.atEnd()) {
            QString result;
            Lexer::Token token = lexer.nextToken(result);
            if (lexer.error()) {
                cout << "Error " << token2string(token) << ": \"" << lexer.error().asString().toLatin1().constData() << "\" at (" << lexer.error().line() << ","
                     << lexer.error().column() << ")" << endl;
                break;
            } else {
                cout << "Got " << token2string(token) << ": \"" << result.toUtf8().data() << "\" at (" << lexer.line() << "," << lexer.column() << ")" << endl;
            }
        }
        cout << "End" << endl;
    } else if (argc == 1) { // automated test
        bool success = true;
        for (int i = 0; i < numTestCases; ++i) {
            bool ok = true;
            const TestCase &t = testcases[i];
            const char *const send = t.string + qstrlen(t.string);
            Lexer lexer(t.string, send, Lexer::IncludeComments);
            cerr << t.name << ":";
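            // Compare every token the lexer produces against the expected
            // list, which is terminated by the { None, nullptr } sentinel.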
            for (int j = 0; !lexer.atEnd(); ++j) {
                QString result;
                Lexer::Token token = lexer.nextToken(result);
                Error error = lexer.error();
                if (t.expected[j].token != token) {
                    ok = false;
                    cerr << " expected token " << token2string(t.expected[j].token) << ", got " << token2string(token);
                }
                if (QString::fromUtf8(t.expected[j].result) != result) {
                    ok = false;
                    if (t.expected[j].result) {
                        cerr << " expected string \"" << t.expected[j].result << "\"";
                    } else {
                        cerr << " expected null string";
                    }
                    if (!result.toUtf8().isNull()) {
                        cerr << ", got \"" << result.toUtf8().data() << "\"";
                    } else {
                        cerr << ", got null string";
                    }
                }
                if (error && error.type() != t.expectedError) {
                    ok = false;
                    cerr << " expected error #" << (int)t.expectedError << ", got #" << (int)error.type();
                }
                if (error && (error.line() != t.errorLine || error.column() != t.errorCol)) {
                    ok = false;
                    cerr << " expected position (" << t.errorLine << "," << t.errorCol << "), got (" << error.line() << "," << error.column() << ")";
                }
                if (error) {
                    goto ErrorOut;
                }
                if (t.expected[j].token == Lexer::None && t.expected[j].result == nullptr) {
                    break;
                }
            }
            if (!lexer.atEnd()) {
                ok = false;
                cerr << " premature end of expected token list";
            }
        ErrorOut:
            if (ok) {
                cerr << " ok";
            }
            cerr << endl;
            if (!ok) {
                success = false;
            }
        }
        if (!success) {
            return 1;
        }
    } else { // usage error
        cerr << "usage: lexertest [ <string> ]" << endl;
        exit(1);
    }

    return 0;
}