File indexing completed on 2024-11-17 04:50:40
0001 /* -*- c++ -*- 0002 parser/lexer.cpp 0003 0004 This file is part of KSieve, 0005 the KDE internet mail/usenet news message filtering library. 0006 SPDX-FileCopyrightText: 2002-2003 Marc Mutz <mutz@kde.org> 0007 0008 SPDX-License-Identifier: GPL-2.0-only 0009 */ 0010 0011 #include "lexer_p.h" 0012 0013 #include "error.h" 0014 #include "utf8validator.h" 0015 0016 #include <QString> 0017 #include <QStringList> 0018 0019 #include <memory> // std::unique_ptr 0020 0021 #include <QStringDecoder> 0022 #include <cassert> 0023 #include <cctype> // isdigit 0024 0025 #ifdef STR_DIM 0026 #undef STR_DIM 0027 #endif 0028 #define STR_DIM(x) (sizeof(x) - 1) 0029 0030 namespace KSieve 0031 { 0032 // 0033 // 0034 // Lexer Bridge implementation 0035 // 0036 // 0037 0038 Lexer::Lexer(const char *scursor, const char *send, int options) 0039 : i(new Impl(scursor, send, options)) 0040 { 0041 } 0042 0043 Lexer::~Lexer() 0044 { 0045 delete i; 0046 i = nullptr; 0047 } 0048 0049 bool Lexer::ignoreComments() const 0050 { 0051 assert(i); 0052 return i->ignoreComments(); 0053 } 0054 0055 const Error &Lexer::error() const 0056 { 0057 assert(i); 0058 return i->error(); 0059 } 0060 0061 bool Lexer::atEnd() const 0062 { 0063 assert(i); 0064 return i->atEnd(); 0065 } 0066 0067 int Lexer::column() const 0068 { 0069 assert(i); 0070 return i->column(); 0071 } 0072 0073 int Lexer::line() const 0074 { 0075 assert(i); 0076 return i->line(); 0077 } 0078 0079 void Lexer::save() 0080 { 0081 assert(i); 0082 i->save(); 0083 } 0084 0085 void Lexer::restore() 0086 { 0087 assert(i); 0088 i->restore(); 0089 } 0090 0091 Lexer::Token Lexer::nextToken(QString &result) 0092 { 0093 assert(i); 0094 return i->nextToken(result); 0095 } 0096 } // namespace KSieve 0097 0098 // none except a-zA-Z0-9_ 0099 static const unsigned char iTextMap[16] = { 0100 0x00, 0101 0x00, 0102 0x00, 0103 0x00, // CTLs: none 0104 0x00, 0105 0x00, 0106 0xFF, 0107 0xC0, // SP ... '?': 0-9 0108 0x7F, 0109 0xFF, 0110 0xFF, 0111 0xE1, // '@' ... '_': A-Z_ 0112 0x7F, 0113 0xFF, 0114 0xFF, 0115 0xE0 // '`' ... DEL: a-z 0116 }; 0117 0118 // SP, HT, CR, LF, {}[]();,#/ 0119 // ### exclude '['? Why would one want to write identifier["foo"]? 0120 static const unsigned char delimMap[16] = { 0121 0x00, 0122 0x64, 0123 0x00, 0124 0x00, // CTLs: CR, HT, LF 0125 0x90, 0126 0xC9, 0127 0x00, 0128 0x10, // SP ... '?': SP, #(),; 0129 0x00, 0130 0x00, 0131 0x00, 0132 0x16, // '@' ... '_': [] 0133 0x00, 0134 0x00, 0135 0x00, 0136 0x16 // '`' ... DEL: {} 0137 }; 0138 0139 // All except iText, delim, "*: 0140 static const unsigned char illegalMap[16] = {0xFF, 0x9B, 0xFF, 0xFF, 0x4F, 0x16, 0x00, 0x0F, 0x80, 0x00, 0x00, 0x0A, 0x80, 0x00, 0x00, 0x0A}; 0141 0142 static inline bool isOfSet(const unsigned char map[16], unsigned char ch) 0143 { 0144 assert(ch < 128); 0145 return map[ch / 8] & 0x80 >> ch % 8; 0146 } 0147 0148 static inline bool isIText(unsigned char ch) 0149 { 0150 return ch <= 'z' && isOfSet(iTextMap, ch); 0151 } 0152 0153 static inline bool isDelim(unsigned char ch) 0154 { 0155 return ch <= '}' && isOfSet(delimMap, ch); 0156 } 0157 0158 static inline bool isIllegal(unsigned char ch) 0159 { 0160 return ch >= '~' || isOfSet(illegalMap, ch); 0161 } 0162 0163 static inline bool is8Bit(signed char ch) 0164 { 0165 return ch < 0; 0166 } 0167 0168 static QString removeCRLF(const QString &s) 0169 { 0170 const bool CRLF = s.endsWith(QLatin1StringView("\r\n")); 0171 const bool LF = !CRLF && s.endsWith(QLatin1Char('\n')); 0172 0173 const int e = CRLF ? 2 : LF ? 1 : 0; // what to chop off at the end 0174 0175 return s.left(s.length() - e); 0176 } 0177 0178 static QString removeDotStuff(const QString &s) 0179 { 0180 return s.startsWith(QLatin1StringView("..")) ? s.mid(1) : s; 0181 } 0182 0183 namespace KSieve 0184 { 0185 // 0186 // 0187 // Lexer Implementation 0188 // 0189 // 0190 0191 Lexer::Impl::Impl(const char *scursor, const char *send, int options) 0192 : mState(scursor ? scursor : send) 0193 , mEnd(send ? send : scursor) 0194 , mIgnoreComments(options & IgnoreComments) 0195 , mIgnoreLF(options & IgnoreLineFeeds) 0196 { 0197 if (!scursor || !send) { 0198 assert(atEnd()); 0199 } 0200 } 0201 0202 Lexer::Token Lexer::Impl::nextToken(QString &result) 0203 { 0204 assert(!atEnd()); 0205 result.clear(); 0206 // clearErrors(); 0207 0208 const int oldLine = line(); 0209 0210 const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS(); 0211 0212 if (!ignoreLineFeeds() && oldLine != line()) { 0213 result.setNum(line() - oldLine); // return number of linefeeds encountered 0214 return LineFeeds; 0215 } 0216 0217 if (!eatingWSSucceeded) { 0218 return None; 0219 } 0220 0221 if (atEnd()) { 0222 return None; 0223 } 0224 0225 switch (*mState.cursor) { 0226 case '#': // HashComment 0227 assert(!ignoreComments()); 0228 ++mState.cursor; 0229 if (!atEnd()) { 0230 parseHashComment(result, true); 0231 } 0232 return HashComment; 0233 case '/': // BracketComment 0234 assert(!ignoreComments()); 0235 ++mState.cursor; // eat slash 0236 if (atEnd() || *mState.cursor != '*') { 0237 makeError(Error::SlashWithoutAsterisk); 0238 return BracketComment; 0239 } 0240 ++mState.cursor; // eat asterisk 0241 if (atEnd()) { 0242 makeError(Error::UnfinishedBracketComment); 0243 return BracketComment; 0244 } 0245 parseBracketComment(result, true); 0246 return BracketComment; 0247 case ':': // Tag 0248 ++mState.cursor; 0249 if (atEnd()) { 0250 makeError(Error::UnexpectedCharacter, line(), column() - 1); 0251 return Tag; 0252 } 0253 if (!isIText(*mState.cursor)) { 0254 makeIllegalCharError(*mState.cursor); 0255 return Tag; 0256 } 0257 parseTag(result); 0258 return Tag; 0259 case '"': // QuotedString 0260 ++mState.cursor; 0261 parseQuotedString(result); 0262 return QuotedString; 0263 case '{': 0264 case '}': 0265 case '[': 0266 case ']': 0267 case '(': 0268 case ')': 0269 case ';': 0270 case ',': // Special 0271 result = QLatin1Char(*mState.cursor++); 0272 return Special; 0273 case '0': 0274 case '1': 0275 case '2': 0276 case '3': 0277 case '4': 0278 case '5': 0279 case '6': 0280 case '7': 0281 case '8': 0282 case '9': // Number 0283 parseNumber(result); 0284 return Number; 0285 case 't': // maybe MultiLineString, else Identifier 0286 if (_strnicmp(mState.cursor, "text:", STR_DIM("text:")) == 0) { 0287 // MultiLineString 0288 mState.cursor += STR_DIM("text:"); 0289 parseMultiLine(result); 0290 // ### FIXME: There can be a hash-comment between "text:" 0291 // and CRLF! That should be preserved somehow... 0292 return MultiLineString; 0293 } 0294 [[fallthrough]]; 0295 default: // Identifier (first must not be 0-9, and can't (caught by Number above)) 0296 if (!isIText(*mState.cursor)) { 0297 makeError(Error::IllegalCharacter); 0298 return None; 0299 } 0300 parseIdentifier(result); 0301 return Identifier; 0302 } 0303 } 0304 0305 bool Lexer::Impl::eatWS() 0306 { 0307 while (!atEnd()) { 0308 switch (*mState.cursor) { 0309 case '\r': 0310 case '\n': 0311 if (!eatCRLF()) { 0312 return false; 0313 } 0314 break; 0315 case ' ': 0316 case '\t': 0317 ++mState.cursor; 0318 break; 0319 default: 0320 return true; 0321 } 0322 } 0323 0324 // at end: 0325 return true; 0326 } 0327 0328 bool Lexer::Impl::eatCRLF() 0329 { 0330 assert(!atEnd()); 0331 assert(*mState.cursor == '\n' || *mState.cursor == '\r'); 0332 0333 if (*mState.cursor == '\r') { 0334 ++mState.cursor; 0335 if (atEnd() || *mState.cursor != '\n') { 0336 // CR w/o LF -> error 0337 makeError(Error::CRWithoutLF); 0338 return false; 0339 } else { 0340 // good CRLF 0341 newLine(); 0342 return true; 0343 } 0344 } else { /* *mState.cursor == '\n' */ 0345 // good, LF only 0346 newLine(); 0347 return true; 0348 } 0349 } 0350 0351 bool Lexer::Impl::parseHashComment(QString &result, bool reallySave) 0352 { 0353 // hash-comment := "#" *CHAR-NOT-CRLF CRLF 0354 0355 // check that the caller plays by the rules: 0356 assert(*(mState.cursor - 1) == '#'); 0357 0358 const char *const commentStart = mState.cursor; 0359 0360 // find next CRLF: 0361 while (!atEnd()) { 0362 if (*mState.cursor == '\n' || *mState.cursor == '\r') { 0363 break; 0364 } 0365 ++mState.cursor; 0366 } 0367 const char *const commentEnd = mState.cursor - 1; 0368 0369 // Laurent it creates a problem when we have just "#F" => it doesn't see it as a comment 0370 // if (commentEnd == commentStart) { 0371 // return true; // # was last char in script... 0372 // } 0373 0374 if (atEnd() || eatCRLF()) { 0375 const int commentLength = commentEnd - commentStart + 1; 0376 if (commentLength > 0) { 0377 if (!isValidUtf8(commentStart, commentLength)) { 0378 makeError(Error::InvalidUTF8); 0379 return false; 0380 } 0381 if (reallySave) { 0382 result += QString::fromUtf8(commentStart, commentLength); 0383 // In comment < or > breaks parsing => convert them to double quote 0384 // See src/ksieveui/scriptsparsing/tests/failed/script1.siv 0385 result.replace(QLatin1Char('<'), QLatin1Char('"')); 0386 result.replace(QLatin1Char('>'), QLatin1Char('"')); 0387 } 0388 } 0389 return true; 0390 } 0391 0392 return false; 0393 } 0394 0395 bool Lexer::Impl::parseBracketComment(QString &result, bool reallySave) 0396 { 0397 // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/" 0398 0399 // check that caller plays by the rules: 0400 assert(*(mState.cursor - 2) == '/'); 0401 assert(*(mState.cursor - 1) == '*'); 0402 0403 const char *const commentStart = mState.cursor; 0404 const int commentCol = column() - 2; 0405 const int commentLine = line(); 0406 0407 // find next asterisk: 0408 do { 0409 if (!skipTo('*')) { 0410 if (!error()) { 0411 makeError(Error::UnfinishedBracketComment, commentLine, commentCol); 0412 } 0413 return false; 0414 } 0415 } while (!atEnd() && *++mState.cursor != '/'); 0416 0417 if (atEnd()) { 0418 makeError(Error::UnfinishedBracketComment, commentLine, commentCol); 0419 return false; 0420 } 0421 0422 assert(*mState.cursor == '/'); 0423 0424 const int commentLength = mState.cursor - commentStart - 1; 0425 if (commentLength > 0) { 0426 if (!isValidUtf8(commentStart, commentLength)) { 0427 makeError(Error::InvalidUTF8); 0428 return false; 0429 } 0430 if (reallySave) { 0431 QString tmp = QString::fromUtf8(commentStart, commentLength); 0432 result += tmp.remove(QLatin1Char('\r')); // get rid of CR in CRLF pairs 0433 } 0434 } 0435 0436 ++mState.cursor; // eat '/' 0437 return true; 0438 } 0439 0440 bool Lexer::Impl::parseComment(QString &result, bool reallySave) 0441 { 0442 // comment := hash-comment / bracket-comment 0443 0444 switch (*mState.cursor) { 0445 case '#': 0446 ++mState.cursor; 0447 return parseHashComment(result, reallySave); 0448 case '/': 0449 if (charsLeft() < 2 || mState.cursor[1] != '*') { 0450 makeError(Error::IllegalCharacter); 0451 return false; 0452 } else { 0453 mState.cursor += 2; // eat "/*" 0454 return parseBracketComment(result, reallySave); 0455 } 0456 default: 0457 return false; // don't set an error here - there was no comment 0458 } 0459 } 0460 0461 bool Lexer::Impl::eatCWS() 0462 { 0463 // white-space := 1*(SP / CRLF / HTAB / comment ) 0464 0465 while (!atEnd()) { 0466 switch (*mState.cursor) { 0467 case ' ': 0468 case '\t': // SP / HTAB 0469 ++mState.cursor; 0470 break; 0471 case '\n': 0472 case '\r': // CRLF 0473 if (!eatCRLF()) { 0474 return false; 0475 } 0476 break; 0477 case '#': 0478 case '/': { // comments 0479 QString dummy; 0480 if (!parseComment(dummy)) { 0481 return false; 0482 } 0483 break; 0484 } 0485 default: 0486 return true; 0487 } 0488 } 0489 return true; 0490 } 0491 0492 bool Lexer::Impl::parseIdentifier(QString &result) 0493 { 0494 // identifier := (ALPHA / "_") *(ALPHA DIGIT "_") 0495 0496 assert(isIText(*mState.cursor)); 0497 0498 const char *const identifierStart = mState.cursor; 0499 0500 // first char: 0501 if (isdigit(*mState.cursor)) { // no digits for the first 0502 makeError(Error::NoLeadingDigits); 0503 return false; 0504 } 0505 0506 // rest of identifier chars ( now digits are allowed ): 0507 for (++mState.cursor; !atEnd() && isIText(*mState.cursor); ++mState.cursor) { } 0508 0509 const int identifierLength = mState.cursor - identifierStart; 0510 0511 // Can use the fast fromLatin1 here, since identifiers are always 0512 // in the us-ascii subset: 0513 result += QString::fromLatin1(identifierStart, identifierLength); 0514 0515 if (atEnd() || isDelim(*mState.cursor)) { 0516 return true; 0517 } 0518 0519 makeIllegalCharError(*mState.cursor); 0520 return false; 0521 } 0522 0523 bool Lexer::Impl::parseTag(QString &result) 0524 { 0525 // tag := ":" identifier 0526 0527 // check that the caller plays by the rules: 0528 assert(*(mState.cursor - 1) == ':'); 0529 assert(!atEnd()); 0530 assert(isIText(*mState.cursor)); 0531 0532 return parseIdentifier(result); 0533 } 0534 0535 bool Lexer::Impl::parseNumber(QString &result) 0536 { 0537 // number := 1*DIGIT [QUANTIFIER] 0538 // QUANTIFIER := "K" / "M" / "G" 0539 0540 assert(isdigit(*mState.cursor)); 0541 0542 while (!atEnd() && isdigit(*mState.cursor)) { 0543 result += QLatin1Char(*mState.cursor++); 0544 } 0545 0546 if (atEnd() || isDelim(*mState.cursor)) { 0547 return true; 0548 } 0549 0550 switch (*mState.cursor) { 0551 case 'G': 0552 case 'g': 0553 case 'M': 0554 case 'm': 0555 case 'K': 0556 case 'k': 0557 result += QLatin1Char(*mState.cursor++); 0558 break; 0559 default: 0560 makeIllegalCharError(); 0561 return false; 0562 } 0563 0564 // quantifier found. Check for delimiter: 0565 if (atEnd() || isDelim(*mState.cursor)) { 0566 return true; 0567 } 0568 makeIllegalCharError(); 0569 return false; 0570 } 0571 0572 bool Lexer::Impl::parseMultiLine(QString &result) 0573 { 0574 // multi-line := "text:" *(SP / HTAB) (hash-comment / CRLF) 0575 // *(multi-line-literal / multi-line-dotstuff) 0576 // "." CRLF 0577 // multi-line-literal := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF 0578 // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF 0579 // ;; A line containing only "." ends the multi-line. 0580 // ;; Remove a leading '.' if followed by another '.'. 0581 0582 assert(_strnicmp(mState.cursor - 5, "text:", STR_DIM("text:")) == 0); 0583 0584 const int mlBeginLine = line(); 0585 const int mlBeginCol = column() - 5; 0586 0587 while (!atEnd()) { 0588 switch (*mState.cursor) { 0589 case ' ': 0590 case '\t': 0591 ++mState.cursor; 0592 break; 0593 case '#': { 0594 ++mState.cursor; 0595 QString dummy; 0596 if (!parseHashComment(dummy)) { 0597 return false; 0598 } 0599 goto MultiLineStart; // break from switch _and_ while 0600 } 0601 case '\n': 0602 case '\r': 0603 if (!eatCRLF()) { 0604 return false; 0605 } 0606 goto MultiLineStart; // break from switch _and_ while 0607 default: 0608 makeError(Error::NonCWSAfterTextColon); 0609 return false; 0610 } 0611 } 0612 0613 MultiLineStart: 0614 if (atEnd()) { 0615 makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol); 0616 return false; 0617 } 0618 0619 // Now, collect the single lines until one with only a single dot is found: 0620 QStringList lines; 0621 while (!atEnd()) { 0622 const char *const oldBeginOfLine = beginOfLine(); 0623 if (!skipToCRLF()) { 0624 return false; 0625 } 0626 const int lineLength = mState.cursor - oldBeginOfLine; 0627 if (lineLength > 0) { 0628 if (!isValidUtf8(oldBeginOfLine, lineLength)) { 0629 makeError(Error::InvalidUTF8); 0630 return false; 0631 } 0632 const QString line = removeCRLF(QString::fromUtf8(oldBeginOfLine, lineLength)); 0633 lines.push_back(removeDotStuff(line)); 0634 if (line == QLatin1Char('.')) { 0635 break; 0636 } 0637 } else { 0638 lines.push_back(QString()); 0639 } 0640 } 0641 0642 if (lines.back() != QLatin1StringView(".")) { 0643 makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol); 0644 return false; 0645 } 0646 0647 assert(!lines.empty()); 0648 lines.erase(--lines.end()); // don't include the lone dot. 0649 result = lines.join(QLatin1Char('\n')); 0650 return true; 0651 } 0652 0653 bool Lexer::Impl::parseQuotedString(QString &result) 0654 { 0655 // quoted-string := DQUOTE *CHAR DQUOTE 0656 0657 // check that caller plays by the rules: 0658 assert(*(mState.cursor - 1) == '"'); 0659 0660 const int qsBeginCol = column() - 1; 0661 const int qsBeginLine = line(); 0662 0663 QStringDecoder dec(QStringDecoder::Utf8); 0664 while (!atEnd()) { 0665 switch (*mState.cursor) { 0666 case '"': 0667 ++mState.cursor; 0668 return true; 0669 case '\r': 0670 case '\n': 0671 if (!eatCRLF()) { 0672 return false; 0673 } 0674 result += QLatin1Char('\n'); 0675 break; 0676 case '\\': 0677 ++mState.cursor; 0678 if (atEnd()) { 0679 break; 0680 } 0681 [[fallthrough]]; 0682 default: 0683 if (!is8Bit(*mState.cursor)) { 0684 result += QLatin1Char(*mState.cursor++); 0685 } else { // probably UTF-8 0686 const char *const eightBitBegin = mState.cursor; 0687 skipTo8BitEnd(); 0688 const int eightBitLen = mState.cursor - eightBitBegin; 0689 assert(eightBitLen > 0); 0690 if (isValidUtf8(eightBitBegin, eightBitLen)) { 0691 result += dec.decode(QByteArrayView(eightBitBegin, eightBitLen)); 0692 } else { 0693 assert(column() >= eightBitLen); 0694 makeError(Error::InvalidUTF8, line(), column() - eightBitLen); 0695 return false; 0696 } 0697 } 0698 } 0699 } 0700 0701 makeError(Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol); 0702 return false; 0703 } 0704 0705 void Lexer::Impl::makeIllegalCharError(char ch) 0706 { 0707 makeError(isIllegal(ch) ? Error::IllegalCharacter : Error::UnexpectedCharacter); 0708 } 0709 } // namespace KSieve