Warning, file /frameworks/khtml/src/xpath/tokenizer.cpp was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 /* 0002 * tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org> 0003 * 0004 * Redistribution and use in source and binary forms, with or without 0005 * modification, are permitted provided that the following conditions 0006 * are met: 0007 * 0008 * 1. Redistributions of source code must retain the above copyright 0009 * notice, this list of conditions and the following disclaimer. 0010 * 2. Redistributions in binary form must reproduce the above copyright 0011 * notice, this list of conditions and the following disclaimer in the 0012 * documentation and/or other materials provided with the distribution. 0013 * 0014 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 0015 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 0016 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 0017 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 0018 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 0019 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 0020 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 0021 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 0022 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 0023 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 0024 */ 0025 #include "tokenizer.h" 0026 0027 #include "xml/dom_stringimpl.h" 0028 #include "xml/dom3_xpathimpl.h" 0029 #include "dom/dom3_xpath.h" 0030 0031 #include <cstdio> 0032 0033 using namespace std; 0034 0035 using namespace DOM; 0036 using namespace DOM::XPath; 0037 using namespace khtml; 0038 using namespace khtml::XPath; 0039 0040 namespace khtml 0041 { 0042 namespace XPath 0043 { 0044 0045 struct AxisNameMapping { 0046 const char *name; 0047 Step::AxisType type; 0048 }; 0049 0050 static AxisNameMapping axisNames[] = { 0051 { "ancestor", Step::AncestorAxis }, 0052 { "ancestor-or-self", Step::AncestorOrSelfAxis }, 0053 { "attribute", Step::AttributeAxis }, 0054 { "child", Step::ChildAxis }, 0055 { "descendant", Step::DescendantAxis }, 0056 { "descendant-or-self", Step::DescendantOrSelfAxis }, 0057 { "following", Step::FollowingAxis }, 0058 { "following-sibling", Step::FollowingSiblingAxis }, 0059 { "namespace", Step::NamespaceAxis }, 0060 { "parent", Step::ParentAxis }, 0061 { "preceding", Step::PrecedingAxis }, 0062 { "preceding-sibling", Step::PrecedingSiblingAxis }, 0063 { "self", Step::SelfAxis } 0064 }; 0065 static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]); 0066 0067 static const char *const nodeTypeNames[] = { 0068 "comment", 0069 "text", 0070 "processing-instruction", 0071 "node", 0072 nullptr 0073 }; 0074 0075 QHash<QString, Step::AxisType> *Tokenizer::s_axisNamesDict = nullptr; 0076 QSet<QString> *Tokenizer::s_nodeTypeNamesDict = nullptr; 0077 0078 Tokenizer &Tokenizer::self() 0079 { 0080 static Tokenizer instance; 0081 return instance; 0082 } 0083 0084 Tokenizer::XMLCat Tokenizer::charCat(QChar aChar) 0085 { 0086 //### might need to add some special cases from the XML spec. 0087 0088 if (aChar.unicode() == '_') { 0089 return NameStart; 0090 } 0091 0092 if (aChar.unicode() == '.' || aChar.unicode() == '-') { 0093 return NameCont; 0094 } 0095 0096 switch (aChar.category()) { 0097 case QChar::Letter_Lowercase: //Ll 0098 case QChar::Letter_Uppercase: //Lu 0099 case QChar::Letter_Other: //Lo 0100 case QChar::Letter_Titlecase: //Lt 0101 case QChar::Number_Letter: //Nl 0102 return NameStart; 0103 0104 case QChar::Mark_SpacingCombining: //Mc 0105 case QChar::Mark_Enclosing: //Me 0106 case QChar::Mark_NonSpacing: //Mn 0107 case QChar::Letter_Modifier: //Lm 0108 case QChar::Number_DecimalDigit: //Nd 0109 return NameCont; 0110 0111 default: 0112 return NotPartOfName; 0113 } 0114 } 0115 0116 bool Tokenizer::isAxisName(QString name, Step::AxisType *type) 0117 { 0118 if (!s_axisNamesDict) { 0119 s_axisNamesDict = new QHash<QString, Step::AxisType>; 0120 for (unsigned int p = 0; p < axisNamesCount; ++p) 0121 s_axisNamesDict->insert(QLatin1String(axisNames[p].name), 0122 axisNames[p].type); 0123 } 0124 0125 QHash<QString, Step::AxisType>::ConstIterator it = s_axisNamesDict->constFind(name); 0126 if (it != s_axisNamesDict->constEnd()) { 0127 *type = *it; 0128 } 0129 return it != s_axisNamesDict->constEnd(); 0130 } 0131 0132 bool Tokenizer::isNodeTypeName(QString name) 0133 { 0134 if (!s_nodeTypeNamesDict) { 0135 s_nodeTypeNamesDict = new QSet<QString>; 0136 for (int p = 0; nodeTypeNames[p]; ++p) { 0137 s_nodeTypeNamesDict->insert(QLatin1String(nodeTypeNames[p])); 0138 } 0139 } 0140 return s_nodeTypeNamesDict->contains(name); 0141 } 0142 0143 /* Returns whether the last parsed token matches the [32] Operator rule 0144 * (check https://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate 0145 * the tokens. 0146 */ 0147 bool Tokenizer::isOperatorContext() 0148 { 0149 if (m_nextPos == 0) { 0150 return false; 0151 } 0152 0153 switch (m_lastTokenType) { 0154 case AND: case OR: case MULOP: 0155 case '/': case SLASHSLASH: case '|': case PLUS: case MINUS: 0156 case EQOP: case RELOP: 0157 case '@': case AXISNAME: case '(': case '[': 0158 return false; 0159 default: 0160 return true; 0161 } 0162 } 0163 0164 void Tokenizer::skipWS() 0165 { 0166 while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace()) { 0167 ++m_nextPos; 0168 } 0169 } 0170 0171 Token Tokenizer::makeTokenAndAdvance(int code, int advance) 0172 { 0173 m_nextPos += advance; 0174 return Token(code); 0175 } 0176 0177 Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance) 0178 { 0179 m_nextPos += advance; 0180 return Token(code, val); 0181 } 0182 0183 //Returns next char if it's there and interesting, 0 otherwise 0184 char Tokenizer::peekAheadHelper() 0185 { 0186 if (m_nextPos + 1 >= m_data.length()) { 0187 return 0; 0188 } 0189 QChar next = m_data[m_nextPos + 1]; 0190 if (next.row() != 0) { 0191 return 0; 0192 } else { 0193 return next.cell(); 0194 } 0195 } 0196 0197 char Tokenizer::peekCurHelper() 0198 { 0199 if (m_nextPos >= m_data.length()) { 0200 return 0; 0201 } 0202 QChar next = m_data[m_nextPos]; 0203 if (next.row() != 0) { 0204 return 0; 0205 } else { 0206 return next.cell(); 0207 } 0208 } 0209 0210 Token Tokenizer::lexString() 0211 { 0212 QChar delimiter = m_data[m_nextPos]; 0213 int startPos = m_nextPos + 1; 0214 0215 for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) { 0216 if (m_data[m_nextPos] == delimiter) { 0217 QString value = m_data.mid(startPos, m_nextPos - startPos); 0218 ++m_nextPos; //Consume the char; 0219 return Token(LITERAL, value); 0220 } 0221 } 0222 0223 //Ouch, went off the end -- report error 0224 return Token(ERROR); 0225 } 0226 0227 Token Tokenizer::lexNumber() 0228 { 0229 int startPos = m_nextPos; 0230 bool seenDot = false; 0231 0232 //Go until end or a non-digits character 0233 for (; m_nextPos < m_data.length(); ++m_nextPos) { 0234 QChar aChar = m_data[m_nextPos]; 0235 if (aChar.row() != 0) { 0236 break; 0237 } 0238 0239 if (aChar.cell() < '0' || aChar.cell() > '9') { 0240 if (aChar.cell() == '.' && !seenDot) { 0241 seenDot = true; 0242 } else { 0243 break; 0244 } 0245 } 0246 } 0247 0248 QString value = m_data.mid(startPos, m_nextPos - startPos); 0249 return Token(NUMBER, value); 0250 } 0251 0252 Token Tokenizer::lexNCName() 0253 { 0254 int startPos = m_nextPos; 0255 if (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) == NameStart) { 0256 //Keep going until we get a character that's not good for names. 0257 for (; m_nextPos < m_data.length(); ++m_nextPos) { 0258 if (charCat(m_data[m_nextPos]) == NotPartOfName) { 0259 break; 0260 } 0261 } 0262 0263 QString value = m_data.mid(startPos, m_nextPos - startPos); 0264 return Token(value); 0265 } else { 0266 return makeTokenAndAdvance(ERROR); 0267 } 0268 } 0269 0270 Token Tokenizer::lexQName() 0271 { 0272 Token t1 = lexNCName(); 0273 if (t1.type == ERROR) { 0274 return t1; 0275 } 0276 skipWS(); 0277 //If the next character is :, what we just got it the prefix, if not, 0278 //it's the whole thing 0279 if (peekAheadHelper() != ':') { 0280 return t1; 0281 } 0282 0283 Token t2 = lexNCName(); 0284 if (t2.type == ERROR) { 0285 return t2; 0286 } 0287 0288 return Token(t1.value + ":" + t2.value); 0289 } 0290 0291 Token Tokenizer::nextTokenInternal() 0292 { 0293 skipWS(); 0294 0295 if (m_nextPos >= m_data.length()) { 0296 return Token(0); 0297 } 0298 0299 char code = peekCurHelper(); 0300 switch (code) { 0301 case '(': case ')': case '[': case ']': 0302 case '@': case ',': case '|': 0303 return makeTokenAndAdvance(code); 0304 case '\'': 0305 case '\"': 0306 return lexString(); 0307 case '0': case '1': case '2': case '3': case '4': 0308 case '5': case '6': case '7': case '8': case '9': 0309 return lexNumber(); 0310 case '.': { 0311 char next = peekAheadHelper(); 0312 if (next == '.') { 0313 return makeTokenAndAdvance(DOTDOT, 2); 0314 } else if (next >= '0' && next <= '9') { 0315 return lexNumber(); 0316 } else { 0317 return makeTokenAndAdvance('.'); 0318 } 0319 } 0320 case '/': 0321 if (peekAheadHelper() == '/') { 0322 return makeTokenAndAdvance(SLASHSLASH, 2); 0323 } else { 0324 return makeTokenAndAdvance('/'); 0325 } 0326 case '+': 0327 return makeTokenAndAdvance(PLUS); 0328 case '-': 0329 return makeTokenAndAdvance(MINUS); 0330 case '=': 0331 return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ); 0332 case '!': 0333 if (peekAheadHelper() == '=') { 0334 return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2); 0335 } else { 0336 return Token(ERROR); 0337 } 0338 case '<': 0339 if (peekAheadHelper() == '=') { 0340 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2); 0341 } else { 0342 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT); 0343 } 0344 case '>': 0345 if (peekAheadHelper() == '=') { 0346 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2); 0347 } else { 0348 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT); 0349 } 0350 case '*': 0351 if (isOperatorContext()) { 0352 return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul); 0353 } else { 0354 ++m_nextPos; 0355 return Token(NAMETEST, "*"); 0356 } 0357 case '$': {//$ QName 0358 m_nextPos++; 0359 Token par = lexQName(); 0360 if (par.type == ERROR) { 0361 return par; 0362 } else { 0363 return Token(VARIABLEREFERENCE, par.value); 0364 } 0365 } 0366 } 0367 0368 Token t1 = lexNCName(); 0369 if (t1.type == ERROR) { 0370 return t1; 0371 } 0372 0373 skipWS(); 0374 0375 //If we're in an operator context, check for any operator names 0376 if (isOperatorContext()) { 0377 if (t1.value == QLatin1String("and")) { //### hash? 0378 return Token(AND); 0379 } 0380 if (t1.value == QLatin1String("or")) { 0381 return Token(OR); 0382 } 0383 if (t1.value == QLatin1String("mod")) { 0384 return Token(MULOP, NumericOp::OP_Mod); 0385 } 0386 if (t1.value == QLatin1String("div")) { 0387 return Token(MULOP, NumericOp::OP_Div); 0388 } 0389 } 0390 0391 //See whether we are at a : 0392 if (peekCurHelper() == ':') { 0393 m_nextPos++; 0394 //Any chance it's an axis name? 0395 if (peekCurHelper() == ':') { 0396 m_nextPos++; 0397 0398 //It might be an axis name. 0399 Step::AxisType axisType; 0400 if (isAxisName(t1.value, &axisType)) { 0401 return Token(AXISNAME, axisType); 0402 } 0403 //Ugh, :: is only valid in axis names -> error 0404 return Token(ERROR); 0405 } 0406 0407 //Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest 0408 skipWS(); 0409 if (peekCurHelper() == '*') { 0410 m_nextPos++; 0411 return Token(NAMETEST, t1.value + ":*"); 0412 } 0413 0414 //Make a full qname.. 0415 Token t2 = lexNCName(); 0416 if (t2.type == ERROR) { 0417 return t2; 0418 } 0419 0420 t1.value = t1.value + ':' + t2.value; 0421 } 0422 0423 skipWS(); 0424 if (peekCurHelper() == '(') { 0425 //note: we don't swallow the ( here! 0426 0427 //either node type of function name 0428 if (isNodeTypeName(t1.value)) { 0429 if (t1.value == "processing-instruction") { 0430 return Token(PI, t1.value); 0431 } else { 0432 return Token(NODETYPE, t1.value); 0433 } 0434 } 0435 //must be a function name. 0436 return Token(FUNCTIONNAME, t1.value); 0437 } 0438 0439 //At this point, it must be NAMETEST 0440 return Token(NAMETEST, t1.value); 0441 } 0442 0443 Token Tokenizer::nextToken() 0444 { 0445 Token toRet = nextTokenInternal(); 0446 m_lastTokenType = toRet.type; 0447 return toRet; 0448 } 0449 0450 Tokenizer::Tokenizer() 0451 { 0452 reset(QString()); 0453 } 0454 0455 Tokenizer::~Tokenizer() 0456 { 0457 delete s_axisNamesDict; 0458 delete s_nodeTypeNamesDict; 0459 } 0460 0461 void Tokenizer::reset(QString data) 0462 { 0463 m_nextPos = 0; 0464 m_data = data; 0465 m_lastTokenType = 0; 0466 } 0467 0468 int khtmlxpathyylex() 0469 { 0470 Token tok = Tokenizer::self().nextToken(); 0471 if (tok.hasString) { 0472 khtmlxpathyylval.str = new DOMString(tok.value); 0473 } else if (tok.intValue) { 0474 khtmlxpathyylval.num = tok.intValue; 0475 } 0476 return tok.type; 0477 } 0478 0479 void initTokenizer(const DOM::DOMString &string) 0480 { 0481 Tokenizer::self().reset(string.string()); 0482 } 0483 0484 } // namespace XPath 0485 } // namespace khtml 0486