// File indexing completed on 2024-09-15 03:31:00
0001 /* 0002 SPDX-FileCopyrightText: 2003-2009 Cies Breijs <cies AT kde DOT nl> 0003 0004 SPDX-License-Identifier: GPL-2.0-or-later 0005 */ 0006 0007 #include "tokenizer.h" 0008 0009 #include <QDebug> 0010 0011 void Tokenizer::initialize(const QString& inString) 0012 { 0013 translator = Translator::instance(); 0014 inputString = inString + QLatin1Char('\n'); // the certainty of a hard break at the end makes parsing much easier 0015 at = 0; 0016 row = 1; 0017 col = 1; 0018 prevCol = 1; 0019 atEnd = false; 0020 } 0021 0022 0023 Token* Tokenizer::getToken() 0024 { 0025 int startRow = row; 0026 int startCol = col; 0027 0028 QChar c = getChar(); // get and store the next character from the string 0029 0030 // catch the end of the input string 0031 if (atEnd) 0032 return new Token(Token::EndOfInput, QStringLiteral("END"), row, col, row, col); 0033 0034 int cType = translator->look2type(c); // since we need to know it often we store it 0035 0036 // catch spaces 0037 if (isSpace(c)) { 0038 QString look; 0039 do { 0040 look += (isTab(c) ? 
QStringLiteral(" ") : QStringLiteral(" ")); 0041 c = getChar(); 0042 } while (isSpace(c) && !atEnd); 0043 ungetChar(); 0044 return new Token(Token::WhiteSpace, look, startRow, startCol, row, col); 0045 } 0046 0047 // catch EndOfLine's 0048 if (isBreak(c)) { 0049 return new Token(Token::EndOfLine, QStringLiteral("\\n"), startRow, startCol, startRow+1, 1); 0050 } 0051 0052 // catch comments 0053 if (cType == Token::Comment) { 0054 QString look; 0055 do { 0056 look += c; 0057 c = getChar(); 0058 } while (!isBreak(c) && !atEnd); 0059 ungetChar(); 0060 return new Token(Token::Comment, look, startRow, startCol, row, col); 0061 } 0062 0063 // catch strings 0064 if (cType == Token::StringDelimiter) { 0065 QString look = QString(c); 0066 do { 0067 c = getChar(); 0068 look += c; 0069 } while (!(translator->look2type(c) == Token::StringDelimiter && look.right(2) != QLatin1String("\\\"")) && 0070 !isBreak(c) && !atEnd); 0071 return new Token(Token::String, look, startRow, startCol, row, col); 0072 } 0073 0074 // catch variables 0075 if (cType == Token::VariablePrefix) { 0076 QString look; 0077 do { 0078 look += c; 0079 c = getChar(); 0080 } while (isWordChar(c) || c.category() == QChar::Number_DecimalDigit || c == QLatin1Char('_')); 0081 ungetChar(); 0082 return new Token(Token::Variable, look, startRow, startCol, row, col); 0083 } 0084 0085 // catch words (known commands or function calls) 0086 if (isWordChar(c)) { // first char has to be a letter 0087 QString look; 0088 do { 0089 look += c; 0090 c = getChar(); 0091 } while (isWordChar(c) || c.isDigit() || c == QLatin1Char('_')); // next chars 0092 ungetChar(); 0093 int type = translator->look2type(look); 0094 if (type == Token::Unknown) 0095 type = Token::FunctionCall; 0096 return new Token(type, look, startRow, startCol, row, col); 0097 } 0098 0099 // catch numbers 0100 if (c.isDigit() || cType == Token::DecimalSeparator) { 0101 bool hasDot = false; 0102 0103 int localType = cType; 0104 QString look; 0105 do { 0106 if 
(localType == Token::DecimalSeparator) hasDot = true; 0107 look += c; 0108 c = getChar(); 0109 localType = translator->look2type(c); 0110 } while (c.isDigit() || (localType == Token::DecimalSeparator && !hasDot)); 0111 ungetChar(); 0112 0113 // if all we got is a dot then this is not a number, so return an Error token here 0114 if (translator->look2type(look) == Token::DecimalSeparator) 0115 return new Token(Token::Error, look, startRow, startCol, row, col); 0116 0117 return new Token(Token::Number, look, startRow, startCol, row, col); 0118 } 0119 0120 // catch previously uncaught 'double character tokens' (tokens that ar not in letters, like: == != >= <=) 0121 { 0122 QString look = QString(c).append(getChar()); 0123 int type = translator->look2type(look); 0124 if (type != Token::Unknown) 0125 return new Token(type, look, startRow, startCol, row, col); 0126 ungetChar(); 0127 } 0128 0129 // catch known tokens of a single character (as last...) 0130 if (cType != Token::Unknown) 0131 return new Token(cType, static_cast<QString>(c), startRow, startCol, row, col); 0132 0133 // this does not neglect calls to functions with a name of length one (checked it) 0134 return new Token(Token::Error, static_cast<QString>(c), startRow, startCol, row, col); 0135 } 0136 0137 0138 QChar Tokenizer::getChar() 0139 { 0140 if (at >= inputString.size()) { 0141 atEnd = true; 0142 // //qDebug() << "Tokenizer::getChar() returns: a ZERO CHAR " << " @ " << at - 1; 0143 return QChar(); 0144 } 0145 QChar c(inputString.at(at)); 0146 at++; 0147 if (isBreak(c)) { 0148 row++; 0149 prevCol = col; 0150 col = 1; 0151 } else { 0152 col++; 0153 } 0154 // //qDebug() << "Tokenizer::getChar() returns: " << c << " (" << c.category() << ") " << " @ " << at - 1; 0155 return c; 0156 } 0157 0158 0159 void Tokenizer::ungetChar() 0160 { 0161 if (at <= 0) return; // do nothing when trying to go before the first character 0162 0163 at--; 0164 if (atEnd) atEnd = false; 0165 0166 QChar c(inputString.at(at)); 0167 if 
(isBreak(c)) { 0168 row--; 0169 col = prevCol; 0170 } else { 0171 col--; 0172 } 0173 } 0174 0175 0176 bool Tokenizer::isWordChar(const QChar& c) 0177 { 0178 // this method exists because some languages have non-letter category characters 0179 // mixed with their letter character to make words (like hindi) 0180 // NOTE: this has to be extended then languages give problems, 0181 // just add a category in the following test 0182 return (c.isLetter() || c.isMark()); 0183 } 0184 0185 bool Tokenizer::isBreak(const QChar& c) 0186 { 0187 return (c == QLatin1Char('\x0a') || c == QLatin1Char('\n')); 0188 // c.category() == QChar::Other_Control // one of these also contains the tab (\t) 0189 // c.category() == QChar::Separator_Line 0190 // c.category() == QChar::Separator_Paragraph 0191 } 0192 0193 0194 bool Tokenizer::isSpace(const QChar& c) 0195 { 0196 return (c.category() == QChar::Separator_Space || c == QLatin1Char(' ') || isTab(c)); 0197 } 0198 0199 bool Tokenizer::isTab(const QChar& c) 0200 { 0201 return (c == QLatin1Char('\x09') || c == QLatin1Char('\t')); 0202 }