src/interpreter/tokenizer.cpp

0001 /*
0002     SPDX-FileCopyrightText: 2003-2009 Cies Breijs <cies AT kde DOT nl>
0003
0004     SPDX-License-Identifier: GPL-2.0-or-later
0005 */
0006
0007 #include "tokenizer.h"
0008
0009 #include <QDebug>
0010
0011 void Tokenizer::initialize(const QString& inString)
0012 {
0013     translator  = Translator::instance();
0014     inputString = inString + QLatin1Char('\n');  // the certainty of a hard break at the end makes parsing much easier
0015     at  = 0;
0016     row = 1;
0017     col = 1;
0018     prevCol = 1;
0019     atEnd = false;
0020 }
0021
0022
0023 Token* Tokenizer::getToken()
0024 {
0025     int startRow = row;
0026     int startCol = col;
0027
0028     QChar c = getChar();  // get and store the next character from the string
0029
0030     // catch the end of the input string
0031     if (atEnd)
0032         return new Token(Token::EndOfInput, QStringLiteral("END"), row, col, row, col);
0033
0034     int cType = translator->look2type(c);  // since we need to know it often we store it
0035
0036     // catch spaces
0037     if (isSpace(c)) {
0038         QString look;
0039         do {
0040             look += (isTab(c) ? QStringLiteral("  ") : QStringLiteral(" "));
0041             c = getChar();
0042         } while (isSpace(c) && !atEnd);
0043         ungetChar();
0044         return new Token(Token::WhiteSpace, look, startRow, startCol, row, col);
0045     }
0046
0047     // catch EndOfLine's
0048     if (isBreak(c)) {
0049         return new Token(Token::EndOfLine, QStringLiteral("\\n"), startRow, startCol, startRow+1, 1);
0050     }
0051
0052     // catch comments
0053     if (cType == Token::Comment) {
0054         QString look;
0055         do {
0056             look += c;
0057             c = getChar();
0058         } while (!isBreak(c) && !atEnd);
0059         ungetChar();
0060         return new Token(Token::Comment, look, startRow, startCol, row, col);
0061     }
0062
0063     // catch strings
0064     if (cType == Token::StringDelimiter) {
0065         QString look = QString(c);
0066         do {
0067             c = getChar();
0068             look += c;
0069         } while (!(translator->look2type(c) == Token::StringDelimiter && look.right(2) != QLatin1String("\\\"")) &&
0070                  !isBreak(c) && !atEnd);
0071         return new Token(Token::String, look, startRow, startCol, row, col);
0072     }
0073
0074     // catch variables
0075     if (cType == Token::VariablePrefix) {
0076         QString look;
0077         do {
0078             look += c;
0079             c = getChar();
0080         } while (isWordChar(c) || c.category() == QChar::Number_DecimalDigit || c == QLatin1Char('_'));
0081         ungetChar();
0082         return new Token(Token::Variable, look, startRow, startCol, row, col);
0083     }
0084
0085     // catch words (known commands or function calls)
0086     if (isWordChar(c)) {  // first char has to be a letter
0087         QString look;
0088         do {
0089             look += c;
0090             c = getChar();
0091         } while (isWordChar(c) || c.isDigit() || c == QLatin1Char('_'));  // next chars
0092         ungetChar();
0093         int type = translator->look2type(look);
0094         if (type == Token::Unknown)
0095             type = Token::FunctionCall;
0096         return new Token(type, look, startRow, startCol, row, col);
0097     }
0098
0099     // catch numbers
0100     if (c.isDigit() || cType == Token::DecimalSeparator) {
0101         bool hasDot = false;
0102
0103         int localType = cType;
0104         QString look;
0105         do {
0106             if (localType == Token::DecimalSeparator) hasDot = true;
0107             look += c;
0108             c = getChar();
0109             localType = translator->look2type(c);
0110         } while (c.isDigit() || (localType == Token::DecimalSeparator && !hasDot));
0111         ungetChar();
0112
0113         // if all we got is a dot then this is not a number, so return an Error token here
0114         if (translator->look2type(look) == Token::DecimalSeparator)
0115             return new Token(Token::Error, look, startRow, startCol, row, col);
0116
0117         return new Token(Token::Number, look, startRow, startCol, row, col);
0118     }
0119
0120     // catch previously uncaught 'double character tokens' (tokens that ar not in letters, like: == != >= <=)
0121     {
0122         QString look = QString(c).append(getChar());
0123         int type = translator->look2type(look);
0124         if (type != Token::Unknown)
0125             return new Token(type, look, startRow, startCol, row, col);
0126         ungetChar();
0127     }
0128
0129     // catch known tokens of a single character (as last...)
0130     if (cType != Token::Unknown)
0131         return new Token(cType, static_cast<QString>(c), startRow, startCol, row, col);
0132
0133     // this does not neglect calls to functions with a name of length one (checked it)
0134     return new Token(Token::Error, static_cast<QString>(c), startRow, startCol, row, col);
0135 }
0136
0137
0138 QChar Tokenizer::getChar()
0139 {
0140     if (at >= inputString.size()) {
0141         atEnd = true;
0142 //      //qDebug() << "Tokenizer::getChar() returns: a ZERO CHAR " << " @ " << at - 1;
0143         return QChar();
0144     }
0145     QChar c(inputString.at(at));
0146     at++;
0147     if (isBreak(c)) {
0148         row++;
0149         prevCol = col;
0150         col = 1;
0151     } else {
0152         col++;
0153     }
0154 //  //qDebug() << "Tokenizer::getChar() returns: " << c << " (" << c.category() << ") " << " @ " << at - 1;
0155     return c;
0156 }
0157
0158
0159 void Tokenizer::ungetChar()
0160 {
0161     if (at <= 0) return;  // do nothing when trying to go before the first character
0162
0163     at--;
0164     if (atEnd) atEnd = false;
0165
0166     QChar c(inputString.at(at));
0167     if (isBreak(c)) {
0168         row--;
0169         col = prevCol;
0170     } else {
0171         col--;
0172     }
0173 }
0174
0175
0176 bool Tokenizer::isWordChar(const QChar& c)
0177 {
0178     // this method exists because some languages have non-letter category characters
0179     // mixed with their letter character to make words (like hindi)
0180     // NOTE: this has to be extended then languages give problems,
0181     //       just add a category in the following test
0182     return (c.isLetter() || c.isMark());
0183 }
0184
0185 bool Tokenizer::isBreak(const QChar& c)
0186 {
0187     return (c == QLatin1Char('\x0a') || c == QLatin1Char('\n'));
0188 //    c.category() == QChar::Other_Control  // one of these also contains the tab (\t)
0189 //    c.category() == QChar::Separator_Line
0190 //    c.category() == QChar::Separator_Paragraph
0191 }
0192
0193
0194 bool Tokenizer::isSpace(const QChar& c)
0195 {
0196     return (c.category() == QChar::Separator_Space || c == QLatin1Char(' ') || isTab(c));
0197 }
0198
0199 bool Tokenizer::isTab(const QChar& c)
0200 {
0201     return (c == QLatin1Char('\x09') || c == QLatin1Char('\t'));
0202 }