src/ksieve/lexer.cpp

0001 /*  -*- c++ -*-
0002     parser/lexer.cpp
0003
0004     This file is part of KSieve,
0005     the KDE internet mail/usenet news message filtering library.
0006     SPDX-FileCopyrightText: 2002-2003 Marc Mutz <mutz@kde.org>
0007
0008     SPDX-License-Identifier: GPL-2.0-only
0009 */
0010
0011 #include "lexer_p.h"
0012
0013 #include "error.h"
0014 #include "utf8validator.h"
0015
0016 #include <QString>
0017 #include <QStringList>
0018
0019 #include <memory> // std::unique_ptr
0020
0021 #include <QStringDecoder>
0022 #include <cassert>
0023 #include <cctype> // isdigit
0024
// STR_DIM(x): size of the array/literal x minus one, i.e. the length of a
// string literal without its terminating NUL.
// Undefining a macro that is not defined is a no-op, so no guard is needed.
#undef STR_DIM
#define STR_DIM(x) (sizeof(x) - 1)
0029
0030 namespace KSieve
0031 {
0032 //
0033 //
0034 // Lexer Bridge implementation
0035 //
0036 //
0037
0038 Lexer::Lexer(const char *scursor, const char *send, int options)
0039     : i(new Impl(scursor, send, options))
0040 {
0041 }
0042
0043 Lexer::~Lexer()
0044 {
0045     delete i;
0046     i = nullptr;
0047 }
0048
0049 bool Lexer::ignoreComments() const
0050 {
0051     assert(i);
0052     return i->ignoreComments();
0053 }
0054
0055 const Error &Lexer::error() const
0056 {
0057     assert(i);
0058     return i->error();
0059 }
0060
0061 bool Lexer::atEnd() const
0062 {
0063     assert(i);
0064     return i->atEnd();
0065 }
0066
0067 int Lexer::column() const
0068 {
0069     assert(i);
0070     return i->column();
0071 }
0072
0073 int Lexer::line() const
0074 {
0075     assert(i);
0076     return i->line();
0077 }
0078
0079 void Lexer::save()
0080 {
0081     assert(i);
0082     i->save();
0083 }
0084
0085 void Lexer::restore()
0086 {
0087     assert(i);
0088     i->restore();
0089 }
0090
0091 Lexer::Token Lexer::nextToken(QString &result)
0092 {
0093     assert(i);
0094     return i->nextToken(result);
0095 }
0096 } // namespace KSieve
0097
// Identifier characters: nothing except a-zA-Z0-9_
// One bit per 7-bit character; byte index is ch / 8 and the most significant
// bit of each byte stands for the lowest character of that group of eight.
static const unsigned char iTextMap[16] = {
    0x00, 0x00, 0x00, 0x00, // CTLs:        none
    0x00, 0x00, 0xFF, 0xC0, // SP ... '?':  0-9
    0x7F, 0xFF, 0xFF, 0xE1, // '@' ... '_': A-Z_
    0x7F, 0xFF, 0xFF, 0xE0 // '`' ... DEL: a-z
};
0117
// Delimiters: SP, HT, CR, LF, {}[]();,#/
// ### exclude '['? Why would one want to write identifier["foo"]?
// Same bit layout as the other maps (byte ch / 8, most significant bit first).
// NOTE(review): 0x16 in the last two rows also sets the bit for '^' and for
// '~', neither of which is named in the list above. isDelim() rejects
// everything above '}', so only '^' is affected - confirm whether that is
// intended.
static const unsigned char delimMap[16] = {
    0x00, 0x64, 0x00, 0x00, // CTLs:        CR, HT, LF
    0x90, 0xC9, 0x00, 0x10, // SP ... '?':  SP, #(),;
    0x00, 0x00, 0x00, 0x16, // '@' ... '_': []
    0x00, 0x00, 0x00, 0x16 // '`' ... DEL: {}
};
0138
// Illegal characters: all except iText, delim, "*:
// Same bit layout as the other maps (byte ch / 8, most significant bit first).
static const unsigned char illegalMap[16] = {
    0xFF, 0x9B, 0xFF, 0xFF, // CTLs
    0x4F, 0x16, 0x00, 0x0F, // SP ... '?'
    0x80, 0x00, 0x00, 0x0A, // '@' ... '_'
    0x80, 0x00, 0x00, 0x0A // '`' ... DEL
};
0141
// Tests whether the bit for 7-bit character `ch` is set in the 128-bit `map`.
// The byte is selected by ch / 8; within it the most significant bit belongs
// to the lowest character.
static inline bool isOfSet(const unsigned char map[16], unsigned char ch)
{
    assert(ch < 128);
    const int bitMask = 0x80 >> (ch % 8);
    const int group = map[ch / 8];
    return (group & bitMask) != 0;
}
0147
0148 static inline bool isIText(unsigned char ch)
0149 {
0150     return ch <= 'z' && isOfSet(iTextMap, ch);
0151 }
0152
0153 static inline bool isDelim(unsigned char ch)
0154 {
0155     return ch <= '}' && isOfSet(delimMap, ch);
0156 }
0157
0158 static inline bool isIllegal(unsigned char ch)
0159 {
0160     return ch >= '~' || isOfSet(illegalMap, ch);
0161 }
0162
// True if the high bit of `ch` is set, i.e. the byte is outside 7-bit ASCII.
static inline bool is8Bit(signed char ch)
{
    return static_cast<unsigned char>(ch) > 0x7F;
}
0167
0168 static QString removeCRLF(const QString &s)
0169 {
0170     const bool CRLF = s.endsWith(QLatin1StringView("\r\n"));
0171     const bool LF = !CRLF && s.endsWith(QLatin1Char('\n'));
0172
0173     const int e = CRLF ? 2 : LF ? 1 : 0; // what to chop off at the end
0174
0175     return s.left(s.length() - e);
0176 }
0177
0178 static QString removeDotStuff(const QString &s)
0179 {
0180     return s.startsWith(QLatin1StringView("..")) ? s.mid(1) : s;
0181 }
0182
0183 namespace KSieve
0184 {
0185 //
0186 //
0187 // Lexer Implementation
0188 //
0189 //
0190
0191 Lexer::Impl::Impl(const char *scursor, const char *send, int options)
0192     : mState(scursor ? scursor : send)
0193     , mEnd(send ? send : scursor)
0194     , mIgnoreComments(options & IgnoreComments)
0195     , mIgnoreLF(options & IgnoreLineFeeds)
0196 {
0197     if (!scursor || !send) {
0198         assert(atEnd());
0199     }
0200 }
0201
0202 Lexer::Token Lexer::Impl::nextToken(QString &result)
0203 {
0204     assert(!atEnd());
0205     result.clear();
0206     // clearErrors();
0207
0208     const int oldLine = line();
0209
0210     const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS();
0211
0212     if (!ignoreLineFeeds() && oldLine != line()) {
0213         result.setNum(line() - oldLine); // return number of linefeeds encountered
0214         return LineFeeds;
0215     }
0216
0217     if (!eatingWSSucceeded) {
0218         return None;
0219     }
0220
0221     if (atEnd()) {
0222         return None;
0223     }
0224
0225     switch (*mState.cursor) {
0226     case '#': // HashComment
0227         assert(!ignoreComments());
0228         ++mState.cursor;
0229         if (!atEnd()) {
0230             parseHashComment(result, true);
0231         }
0232         return HashComment;
0233     case '/': // BracketComment
0234         assert(!ignoreComments());
0235         ++mState.cursor; // eat slash
0236         if (atEnd() || *mState.cursor != '*') {
0237             makeError(Error::SlashWithoutAsterisk);
0238             return BracketComment;
0239         }
0240         ++mState.cursor; // eat asterisk
0241         if (atEnd()) {
0242             makeError(Error::UnfinishedBracketComment);
0243             return BracketComment;
0244         }
0245         parseBracketComment(result, true);
0246         return BracketComment;
0247     case ':': // Tag
0248         ++mState.cursor;
0249         if (atEnd()) {
0250             makeError(Error::UnexpectedCharacter, line(), column() - 1);
0251             return Tag;
0252         }
0253         if (!isIText(*mState.cursor)) {
0254             makeIllegalCharError(*mState.cursor);
0255             return Tag;
0256         }
0257         parseTag(result);
0258         return Tag;
0259     case '"': // QuotedString
0260         ++mState.cursor;
0261         parseQuotedString(result);
0262         return QuotedString;
0263     case '{':
0264     case '}':
0265     case '[':
0266     case ']':
0267     case '(':
0268     case ')':
0269     case ';':
0270     case ',': // Special
0271         result = QLatin1Char(*mState.cursor++);
0272         return Special;
0273     case '0':
0274     case '1':
0275     case '2':
0276     case '3':
0277     case '4':
0278     case '5':
0279     case '6':
0280     case '7':
0281     case '8':
0282     case '9': // Number
0283         parseNumber(result);
0284         return Number;
0285     case 't': // maybe MultiLineString, else Identifier
0286         if (_strnicmp(mState.cursor, "text:", STR_DIM("text:")) == 0) {
0287             // MultiLineString
0288             mState.cursor += STR_DIM("text:");
0289             parseMultiLine(result);
0290             // ### FIXME: There can be a hash-comment between "text:"
0291             // and CRLF! That should be preserved somehow...
0292             return MultiLineString;
0293         }
0294         [[fallthrough]];
0295     default: // Identifier (first must not be 0-9, and can't (caught by Number above))
0296         if (!isIText(*mState.cursor)) {
0297             makeError(Error::IllegalCharacter);
0298             return None;
0299         }
0300         parseIdentifier(result);
0301         return Identifier;
0302     }
0303 }
0304
0305 bool Lexer::Impl::eatWS()
0306 {
0307     while (!atEnd()) {
0308         switch (*mState.cursor) {
0309         case '\r':
0310         case '\n':
0311             if (!eatCRLF()) {
0312                 return false;
0313             }
0314             break;
0315         case ' ':
0316         case '\t':
0317             ++mState.cursor;
0318             break;
0319         default:
0320             return true;
0321         }
0322     }
0323
0324     // at end:
0325     return true;
0326 }
0327
0328 bool Lexer::Impl::eatCRLF()
0329 {
0330     assert(!atEnd());
0331     assert(*mState.cursor == '\n' || *mState.cursor == '\r');
0332
0333     if (*mState.cursor == '\r') {
0334         ++mState.cursor;
0335         if (atEnd() || *mState.cursor != '\n') {
0336             // CR w/o LF -> error
0337             makeError(Error::CRWithoutLF);
0338             return false;
0339         } else {
0340             // good CRLF
0341             newLine();
0342             return true;
0343         }
0344     } else { /* *mState.cursor == '\n' */
0345         // good, LF only
0346         newLine();
0347         return true;
0348     }
0349 }
0350
0351 bool Lexer::Impl::parseHashComment(QString &result, bool reallySave)
0352 {
0353     // hash-comment := "#" *CHAR-NOT-CRLF CRLF
0354
0355     // check that the caller plays by the rules:
0356     assert(*(mState.cursor - 1) == '#');
0357
0358     const char *const commentStart = mState.cursor;
0359
0360     // find next CRLF:
0361     while (!atEnd()) {
0362         if (*mState.cursor == '\n' || *mState.cursor == '\r') {
0363             break;
0364         }
0365         ++mState.cursor;
0366     }
0367     const char *const commentEnd = mState.cursor - 1;
0368
0369     // Laurent it creates a problem when we have just "#F" => it doesn't see it as a comment
0370     //    if (commentEnd == commentStart) {
0371     //        return true;    // # was last char in script...
0372     //    }
0373
0374     if (atEnd() || eatCRLF()) {
0375         const int commentLength = commentEnd - commentStart + 1;
0376         if (commentLength > 0) {
0377             if (!isValidUtf8(commentStart, commentLength)) {
0378                 makeError(Error::InvalidUTF8);
0379                 return false;
0380             }
0381             if (reallySave) {
0382                 result += QString::fromUtf8(commentStart, commentLength);
0383                 // In comment < or > breaks parsing => convert them to double quote
0384                 // See src/ksieveui/scriptsparsing/tests/failed/script1.siv
0385                 result.replace(QLatin1Char('<'), QLatin1Char('"'));
0386                 result.replace(QLatin1Char('>'), QLatin1Char('"'));
0387             }
0388         }
0389         return true;
0390     }
0391
0392     return false;
0393 }
0394
0395 bool Lexer::Impl::parseBracketComment(QString &result, bool reallySave)
0396 {
0397     // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
0398
0399     // check that caller plays by the rules:
0400     assert(*(mState.cursor - 2) == '/');
0401     assert(*(mState.cursor - 1) == '*');
0402
0403     const char *const commentStart = mState.cursor;
0404     const int commentCol = column() - 2;
0405     const int commentLine = line();
0406
0407     // find next asterisk:
0408     do {
0409         if (!skipTo('*')) {
0410             if (!error()) {
0411                 makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
0412             }
0413             return false;
0414         }
0415     } while (!atEnd() && *++mState.cursor != '/');
0416
0417     if (atEnd()) {
0418         makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
0419         return false;
0420     }
0421
0422     assert(*mState.cursor == '/');
0423
0424     const int commentLength = mState.cursor - commentStart - 1;
0425     if (commentLength > 0) {
0426         if (!isValidUtf8(commentStart, commentLength)) {
0427             makeError(Error::InvalidUTF8);
0428             return false;
0429         }
0430         if (reallySave) {
0431             QString tmp = QString::fromUtf8(commentStart, commentLength);
0432             result += tmp.remove(QLatin1Char('\r')); // get rid of CR in CRLF pairs
0433         }
0434     }
0435
0436     ++mState.cursor; // eat '/'
0437     return true;
0438 }
0439
0440 bool Lexer::Impl::parseComment(QString &result, bool reallySave)
0441 {
0442     // comment := hash-comment / bracket-comment
0443
0444     switch (*mState.cursor) {
0445     case '#':
0446         ++mState.cursor;
0447         return parseHashComment(result, reallySave);
0448     case '/':
0449         if (charsLeft() < 2 || mState.cursor[1] != '*') {
0450             makeError(Error::IllegalCharacter);
0451             return false;
0452         } else {
0453             mState.cursor += 2; // eat "/*"
0454             return parseBracketComment(result, reallySave);
0455         }
0456     default:
0457         return false; // don't set an error here - there was no comment
0458     }
0459 }
0460
0461 bool Lexer::Impl::eatCWS()
0462 {
0463     // white-space := 1*(SP / CRLF / HTAB / comment )
0464
0465     while (!atEnd()) {
0466         switch (*mState.cursor) {
0467         case ' ':
0468         case '\t': // SP / HTAB
0469             ++mState.cursor;
0470             break;
0471         case '\n':
0472         case '\r': // CRLF
0473             if (!eatCRLF()) {
0474                 return false;
0475             }
0476             break;
0477         case '#':
0478         case '/': { // comments
0479             QString dummy;
0480             if (!parseComment(dummy)) {
0481                 return false;
0482             }
0483             break;
0484         }
0485         default:
0486             return true;
0487         }
0488     }
0489     return true;
0490 }
0491
0492 bool Lexer::Impl::parseIdentifier(QString &result)
0493 {
0494     // identifier := (ALPHA / "_") *(ALPHA DIGIT "_")
0495
0496     assert(isIText(*mState.cursor));
0497
0498     const char *const identifierStart = mState.cursor;
0499
0500     // first char:
0501     if (isdigit(*mState.cursor)) { // no digits for the first
0502         makeError(Error::NoLeadingDigits);
0503         return false;
0504     }
0505
0506     // rest of identifier chars ( now digits are allowed ):
0507     for (++mState.cursor; !atEnd() && isIText(*mState.cursor); ++mState.cursor) { }
0508
0509     const int identifierLength = mState.cursor - identifierStart;
0510
0511     // Can use the fast fromLatin1 here, since identifiers are always
0512     // in the us-ascii subset:
0513     result += QString::fromLatin1(identifierStart, identifierLength);
0514
0515     if (atEnd() || isDelim(*mState.cursor)) {
0516         return true;
0517     }
0518
0519     makeIllegalCharError(*mState.cursor);
0520     return false;
0521 }
0522
0523 bool Lexer::Impl::parseTag(QString &result)
0524 {
0525     // tag := ":" identifier
0526
0527     // check that the caller plays by the rules:
0528     assert(*(mState.cursor - 1) == ':');
0529     assert(!atEnd());
0530     assert(isIText(*mState.cursor));
0531
0532     return parseIdentifier(result);
0533 }
0534
0535 bool Lexer::Impl::parseNumber(QString &result)
0536 {
0537     // number     := 1*DIGIT [QUANTIFIER]
0538     // QUANTIFIER := "K" / "M" / "G"
0539
0540     assert(isdigit(*mState.cursor));
0541
0542     while (!atEnd() && isdigit(*mState.cursor)) {
0543         result += QLatin1Char(*mState.cursor++);
0544     }
0545
0546     if (atEnd() || isDelim(*mState.cursor)) {
0547         return true;
0548     }
0549
0550     switch (*mState.cursor) {
0551     case 'G':
0552     case 'g':
0553     case 'M':
0554     case 'm':
0555     case 'K':
0556     case 'k':
0557         result += QLatin1Char(*mState.cursor++);
0558         break;
0559     default:
0560         makeIllegalCharError();
0561         return false;
0562     }
0563
0564     // quantifier found. Check for delimiter:
0565     if (atEnd() || isDelim(*mState.cursor)) {
0566         return true;
0567     }
0568     makeIllegalCharError();
0569     return false;
0570 }
0571
0572 bool Lexer::Impl::parseMultiLine(QString &result)
0573 {
0574     // multi-line          := "text:" *(SP / HTAB) (hash-comment / CRLF)
0575     //                        *(multi-line-literal / multi-line-dotstuff)
0576     //                        "." CRLF
0577     // multi-line-literal  := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
0578     // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
0579     //         ;; A line containing only "." ends the multi-line.
0580     //         ;; Remove a leading '.' if followed by another '.'.
0581
0582     assert(_strnicmp(mState.cursor - 5, "text:", STR_DIM("text:")) == 0);
0583
0584     const int mlBeginLine = line();
0585     const int mlBeginCol = column() - 5;
0586
0587     while (!atEnd()) {
0588         switch (*mState.cursor) {
0589         case ' ':
0590         case '\t':
0591             ++mState.cursor;
0592             break;
0593         case '#': {
0594             ++mState.cursor;
0595             QString dummy;
0596             if (!parseHashComment(dummy)) {
0597                 return false;
0598             }
0599             goto MultiLineStart; // break from switch _and_ while
0600         }
0601         case '\n':
0602         case '\r':
0603             if (!eatCRLF()) {
0604                 return false;
0605             }
0606             goto MultiLineStart; // break from switch _and_ while
0607         default:
0608             makeError(Error::NonCWSAfterTextColon);
0609             return false;
0610         }
0611     }
0612
0613 MultiLineStart:
0614     if (atEnd()) {
0615         makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
0616         return false;
0617     }
0618
0619     // Now, collect the single lines until one with only a single dot is found:
0620     QStringList lines;
0621     while (!atEnd()) {
0622         const char *const oldBeginOfLine = beginOfLine();
0623         if (!skipToCRLF()) {
0624             return false;
0625         }
0626         const int lineLength = mState.cursor - oldBeginOfLine;
0627         if (lineLength > 0) {
0628             if (!isValidUtf8(oldBeginOfLine, lineLength)) {
0629                 makeError(Error::InvalidUTF8);
0630                 return false;
0631             }
0632             const QString line = removeCRLF(QString::fromUtf8(oldBeginOfLine, lineLength));
0633             lines.push_back(removeDotStuff(line));
0634             if (line == QLatin1Char('.')) {
0635                 break;
0636             }
0637         } else {
0638             lines.push_back(QString());
0639         }
0640     }
0641
0642     if (lines.back() != QLatin1StringView(".")) {
0643         makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
0644         return false;
0645     }
0646
0647     assert(!lines.empty());
0648     lines.erase(--lines.end()); // don't include the lone dot.
0649     result = lines.join(QLatin1Char('\n'));
0650     return true;
0651 }
0652
0653 bool Lexer::Impl::parseQuotedString(QString &result)
0654 {
0655     // quoted-string := DQUOTE *CHAR DQUOTE
0656
0657     // check that caller plays by the rules:
0658     assert(*(mState.cursor - 1) == '"');
0659
0660     const int qsBeginCol = column() - 1;
0661     const int qsBeginLine = line();
0662
0663     QStringDecoder dec(QStringDecoder::Utf8);
0664     while (!atEnd()) {
0665         switch (*mState.cursor) {
0666         case '"':
0667             ++mState.cursor;
0668             return true;
0669         case '\r':
0670         case '\n':
0671             if (!eatCRLF()) {
0672                 return false;
0673             }
0674             result += QLatin1Char('\n');
0675             break;
0676         case '\\':
0677             ++mState.cursor;
0678             if (atEnd()) {
0679                 break;
0680             }
0681             [[fallthrough]];
0682         default:
0683             if (!is8Bit(*mState.cursor)) {
0684                 result += QLatin1Char(*mState.cursor++);
0685             } else { // probably UTF-8
0686                 const char *const eightBitBegin = mState.cursor;
0687                 skipTo8BitEnd();
0688                 const int eightBitLen = mState.cursor - eightBitBegin;
0689                 assert(eightBitLen > 0);
0690                 if (isValidUtf8(eightBitBegin, eightBitLen)) {
0691                     result += dec.decode(QByteArrayView(eightBitBegin, eightBitLen));
0692                 } else {
0693                     assert(column() >= eightBitLen);
0694                     makeError(Error::InvalidUTF8, line(), column() - eightBitLen);
0695                     return false;
0696                 }
0697             }
0698         }
0699     }
0700
0701     makeError(Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol);
0702     return false;
0703 }
0704
0705 void Lexer::Impl::makeIllegalCharError(char ch)
0706 {
0707     makeError(isIllegal(ch) ? Error::IllegalCharacter : Error::UnexpectedCharacter);
0708 }
0709 } // namespace KSieve