src/xpath/tokenizer.cpp

0001 /*
0002  * tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org>
0003  *
0004  * Redistribution and use in source and binary forms, with or without
0005  * modification, are permitted provided that the following conditions
0006  * are met:
0007  *
0008  * 1. Redistributions of source code must retain the above copyright
0009  *    notice, this list of conditions and the following disclaimer.
0010  * 2. Redistributions in binary form must reproduce the above copyright
0011  *    notice, this list of conditions and the following disclaimer in the
0012  *    documentation and/or other materials provided with the distribution.
0013  *
0014  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
0015  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0016  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
0017  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
0018  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
0019  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
0020  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
0021  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0022  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
0023  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0024  */
0025 #include "tokenizer.h"
0026
0027 #include "xml/dom_stringimpl.h"
0028 #include "xml/dom3_xpathimpl.h"
0029 #include "dom/dom3_xpath.h"
0030
0031 #include <cstdio>
0032
0033 using namespace std;
0034
0035 using namespace DOM;
0036 using namespace DOM::XPath;
0037 using namespace khtml;
0038 using namespace khtml::XPath;
0039
0040 namespace khtml
0041 {
0042 namespace XPath
0043 {
0044
0045 struct AxisNameMapping {
0046     const char *name;
0047     Step::AxisType type;
0048 };
0049
0050 static AxisNameMapping axisNames[] = {
0051     { "ancestor", Step::AncestorAxis },
0052     { "ancestor-or-self", Step::AncestorOrSelfAxis },
0053     { "attribute", Step::AttributeAxis },
0054     { "child", Step::ChildAxis },
0055     { "descendant", Step::DescendantAxis },
0056     { "descendant-or-self", Step::DescendantOrSelfAxis },
0057     { "following", Step::FollowingAxis },
0058     { "following-sibling", Step::FollowingSiblingAxis },
0059     { "namespace", Step::NamespaceAxis },
0060     { "parent", Step::ParentAxis },
0061     { "preceding", Step::PrecedingAxis },
0062     { "preceding-sibling", Step::PrecedingSiblingAxis },
0063     { "self", Step::SelfAxis }
0064 };
0065 static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]);
0066
/* Names that denote a node type test when followed by '('.
 * The list is terminated by a null pointer so it can be walked
 * without a separate element count.
 */
static const char *const nodeTypeNames[] = {
    "comment", "text", "processing-instruction", "node",
    nullptr //end-of-list sentinel
};
0074
0075 QHash<QString, Step::AxisType> *Tokenizer::s_axisNamesDict     = nullptr;
0076 QSet<QString> *Tokenizer::s_nodeTypeNamesDict = nullptr;
0077
0078 Tokenizer &Tokenizer::self()
0079 {
0080     static Tokenizer instance;
0081     return instance;
0082 }
0083
0084 Tokenizer::XMLCat Tokenizer::charCat(QChar aChar)
0085 {
0086     //### might need to add some special cases from the XML spec.
0087
0088     if (aChar.unicode() == '_') {
0089         return NameStart;
0090     }
0091
0092     if (aChar.unicode() == '.' || aChar.unicode() == '-') {
0093         return NameCont;
0094     }
0095
0096     switch (aChar.category()) {
0097     case QChar::Letter_Lowercase: //Ll
0098     case QChar::Letter_Uppercase: //Lu
0099     case QChar::Letter_Other:     //Lo
0100     case QChar::Letter_Titlecase: //Lt
0101     case QChar::Number_Letter:    //Nl
0102         return NameStart;
0103
0104     case QChar::Mark_SpacingCombining: //Mc
0105     case QChar::Mark_Enclosing:        //Me
0106     case QChar::Mark_NonSpacing:       //Mn
0107     case QChar::Letter_Modifier:       //Lm
0108     case QChar::Number_DecimalDigit:   //Nd
0109         return NameCont;
0110
0111     default:
0112         return NotPartOfName;
0113     }
0114 }
0115
0116 bool Tokenizer::isAxisName(QString name, Step::AxisType *type)
0117 {
0118     if (!s_axisNamesDict) {
0119         s_axisNamesDict = new QHash<QString, Step::AxisType>;
0120         for (unsigned int p = 0; p < axisNamesCount; ++p)
0121             s_axisNamesDict->insert(QLatin1String(axisNames[p].name),
0122                                     axisNames[p].type);
0123     }
0124
0125     QHash<QString, Step::AxisType>::ConstIterator it = s_axisNamesDict->constFind(name);
0126     if (it != s_axisNamesDict->constEnd()) {
0127         *type = *it;
0128     }
0129     return it != s_axisNamesDict->constEnd();
0130 }
0131
0132 bool Tokenizer::isNodeTypeName(QString name)
0133 {
0134     if (!s_nodeTypeNamesDict) {
0135         s_nodeTypeNamesDict = new QSet<QString>;
0136         for (int p = 0; nodeTypeNames[p]; ++p) {
0137             s_nodeTypeNamesDict->insert(QLatin1String(nodeTypeNames[p]));
0138         }
0139     }
0140     return s_nodeTypeNamesDict->contains(name);
0141 }
0142
0143 /* Returns whether the last parsed token matches the [32] Operator rule
0144  * (check https://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate
0145  * the tokens.
0146  */
0147 bool Tokenizer::isOperatorContext()
0148 {
0149     if (m_nextPos == 0) {
0150         return false;
0151     }
0152
0153     switch (m_lastTokenType) {
0154     case AND: case OR: case MULOP:
0155     case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
0156     case EQOP: case RELOP:
0157     case '@': case AXISNAME:   case '(': case '[':
0158         return false;
0159     default:
0160         return true;
0161     }
0162 }
0163
0164 void Tokenizer::skipWS()
0165 {
0166     while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace()) {
0167         ++m_nextPos;
0168     }
0169 }
0170
0171 Token Tokenizer::makeTokenAndAdvance(int code, int advance)
0172 {
0173     m_nextPos += advance;
0174     return Token(code);
0175 }
0176
0177 Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance)
0178 {
0179     m_nextPos += advance;
0180     return Token(code, val);
0181 }
0182
0183 //Returns next char if it's there and interesting, 0 otherwise
0184 char Tokenizer::peekAheadHelper()
0185 {
0186     if (m_nextPos + 1 >= m_data.length()) {
0187         return 0;
0188     }
0189     QChar next = m_data[m_nextPos + 1];
0190     if (next.row() != 0) {
0191         return 0;
0192     } else {
0193         return next.cell();
0194     }
0195 }
0196
0197 char Tokenizer::peekCurHelper()
0198 {
0199     if (m_nextPos >= m_data.length()) {
0200         return 0;
0201     }
0202     QChar next = m_data[m_nextPos];
0203     if (next.row() != 0) {
0204         return 0;
0205     } else {
0206         return next.cell();
0207     }
0208 }
0209
0210 Token Tokenizer::lexString()
0211 {
0212     QChar delimiter = m_data[m_nextPos];
0213     int   startPos  = m_nextPos + 1;
0214
0215     for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
0216         if (m_data[m_nextPos] == delimiter) {
0217             QString value = m_data.mid(startPos, m_nextPos - startPos);
0218             ++m_nextPos; //Consume the char;
0219             return Token(LITERAL, value);
0220         }
0221     }
0222
0223     //Ouch, went off the end -- report error
0224     return Token(ERROR);
0225 }
0226
0227 Token Tokenizer::lexNumber()
0228 {
0229     int startPos = m_nextPos;
0230     bool seenDot = false;
0231
0232     //Go until end or a non-digits character
0233     for (; m_nextPos < m_data.length(); ++m_nextPos) {
0234         QChar aChar = m_data[m_nextPos];
0235         if (aChar.row() != 0) {
0236             break;
0237         }
0238
0239         if (aChar.cell() < '0' || aChar.cell() > '9') {
0240             if (aChar.cell() == '.' && !seenDot) {
0241                 seenDot = true;
0242             } else {
0243                 break;
0244             }
0245         }
0246     }
0247
0248     QString value = m_data.mid(startPos, m_nextPos - startPos);
0249     return Token(NUMBER, value);
0250 }
0251
0252 Token Tokenizer::lexNCName()
0253 {
0254     int startPos = m_nextPos;
0255     if (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) == NameStart) {
0256         //Keep going until we get a character that's not good for names.
0257         for (; m_nextPos < m_data.length(); ++m_nextPos) {
0258             if (charCat(m_data[m_nextPos]) == NotPartOfName) {
0259                 break;
0260             }
0261         }
0262
0263         QString value = m_data.mid(startPos, m_nextPos - startPos);
0264         return Token(value);
0265     } else {
0266         return makeTokenAndAdvance(ERROR);
0267     }
0268 }
0269
0270 Token Tokenizer::lexQName()
0271 {
0272     Token t1 = lexNCName();
0273     if (t1.type == ERROR) {
0274         return t1;
0275     }
0276     skipWS();
0277     //If the next character is :, what we just got it the prefix, if not,
0278     //it's the whole thing
0279     if (peekAheadHelper() != ':') {
0280         return t1;
0281     }
0282
0283     Token t2 = lexNCName();
0284     if (t2.type == ERROR) {
0285         return t2;
0286     }
0287
0288     return Token(t1.value + ":" + t2.value);
0289 }
0290
0291 Token Tokenizer::nextTokenInternal()
0292 {
0293     skipWS();
0294
0295     if (m_nextPos >= m_data.length()) {
0296         return Token(0);
0297     }
0298
0299     char code = peekCurHelper();
0300     switch (code) {
0301     case '(': case ')': case '[': case ']':
0302     case '@': case ',': case '|':
0303         return makeTokenAndAdvance(code);
0304     case '\'':
0305     case '\"':
0306         return lexString();
0307     case '0': case '1': case '2': case '3': case '4':
0308     case '5': case '6': case '7': case '8': case '9':
0309         return lexNumber();
0310     case '.': {
0311         char next = peekAheadHelper();
0312         if (next == '.') {
0313             return makeTokenAndAdvance(DOTDOT, 2);
0314         } else if (next >= '0' && next <= '9') {
0315             return lexNumber();
0316         } else {
0317             return makeTokenAndAdvance('.');
0318         }
0319     }
0320     case '/':
0321         if (peekAheadHelper() == '/') {
0322             return makeTokenAndAdvance(SLASHSLASH, 2);
0323         } else {
0324             return makeTokenAndAdvance('/');
0325         }
0326     case '+':
0327         return makeTokenAndAdvance(PLUS);
0328     case '-':
0329         return makeTokenAndAdvance(MINUS);
0330     case '=':
0331         return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ);
0332     case '!':
0333         if (peekAheadHelper() == '=') {
0334             return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2);
0335         } else {
0336             return Token(ERROR);
0337         }
0338     case '<':
0339         if (peekAheadHelper() == '=') {
0340             return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2);
0341         } else {
0342             return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT);
0343         }
0344     case '>':
0345         if (peekAheadHelper() == '=') {
0346             return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2);
0347         } else {
0348             return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT);
0349         }
0350     case '*':
0351         if (isOperatorContext()) {
0352             return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul);
0353         } else {
0354             ++m_nextPos;
0355             return Token(NAMETEST, "*");
0356         }
0357     case '$': {//$ QName
0358         m_nextPos++;
0359         Token par = lexQName();
0360         if (par.type == ERROR) {
0361             return par;
0362         } else {
0363             return Token(VARIABLEREFERENCE, par.value);
0364         }
0365     }
0366     }
0367
0368     Token t1 = lexNCName();
0369     if (t1.type == ERROR) {
0370         return t1;
0371     }
0372
0373     skipWS();
0374
0375     //If we're in an operator context, check for any operator names
0376     if (isOperatorContext()) {
0377         if (t1.value == QLatin1String("and")) { //### hash?
0378             return Token(AND);
0379         }
0380         if (t1.value == QLatin1String("or")) {
0381             return Token(OR);
0382         }
0383         if (t1.value == QLatin1String("mod")) {
0384             return Token(MULOP, NumericOp::OP_Mod);
0385         }
0386         if (t1.value == QLatin1String("div")) {
0387             return Token(MULOP, NumericOp::OP_Div);
0388         }
0389     }
0390
0391     //See whether we are at a :
0392     if (peekCurHelper() == ':') {
0393         m_nextPos++;
0394         //Any chance it's an axis name?
0395         if (peekCurHelper() == ':') {
0396             m_nextPos++;
0397
0398             //It might be an axis name.
0399             Step::AxisType axisType;
0400             if (isAxisName(t1.value, &axisType)) {
0401                 return Token(AXISNAME, axisType);
0402             }
0403             //Ugh, :: is only valid in axis names -> error
0404             return Token(ERROR);
0405         }
0406
0407         //Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
0408         skipWS();
0409         if (peekCurHelper() == '*') {
0410             m_nextPos++;
0411             return Token(NAMETEST, t1.value + ":*");
0412         }
0413
0414         //Make a full qname..
0415         Token t2 = lexNCName();
0416         if (t2.type == ERROR) {
0417             return t2;
0418         }
0419
0420         t1.value = t1.value + ':' + t2.value;
0421     }
0422
0423     skipWS();
0424     if (peekCurHelper() == '(') {
0425         //note: we don't swallow the ( here!
0426
0427         //either node type of function name
0428         if (isNodeTypeName(t1.value)) {
0429             if (t1.value == "processing-instruction") {
0430                 return Token(PI, t1.value);
0431             } else {
0432                 return Token(NODETYPE, t1.value);
0433             }
0434         }
0435         //must be a function name.
0436         return Token(FUNCTIONNAME, t1.value);
0437     }
0438
0439     //At this point, it must be NAMETEST
0440     return Token(NAMETEST, t1.value);
0441 }
0442
0443 Token Tokenizer::nextToken()
0444 {
0445     Token toRet = nextTokenInternal();
0446     m_lastTokenType = toRet.type;
0447     return toRet;
0448 }
0449
0450 Tokenizer::Tokenizer()
0451 {
0452     reset(QString());
0453 }
0454
0455 Tokenizer::~Tokenizer()
0456 {
0457     delete s_axisNamesDict;
0458     delete s_nodeTypeNamesDict;
0459 }
0460
0461 void Tokenizer::reset(QString data)
0462 {
0463     m_nextPos = 0;
0464     m_data = data;
0465     m_lastTokenType = 0;
0466 }
0467
0468 int khtmlxpathyylex()
0469 {
0470     Token tok = Tokenizer::self().nextToken();
0471     if (tok.hasString) {
0472         khtmlxpathyylval.str = new DOMString(tok.value);
0473     } else if (tok.intValue) {
0474         khtmlxpathyylval.num = tok.intValue;
0475     }
0476     return tok.type;
0477 }
0478
0479 void initTokenizer(const DOM::DOMString &string)
0480 {
0481     Tokenizer::self().reset(string.string());
0482 }
0483
0484 } // namespace XPath
0485 } // namespace khtml
0486