File indexing completed on 2024-05-12 15:43:33
0001 /* 0002 * This file is part of the KDE libraries 0003 * Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org) 0004 * Copyright (C) 2003,2004 Apple Computer, Inc. 0005 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org) 0006 * Copyright (C) 2007 Sune Vuorela (debian@pusling.com) 0007 * 0008 * This library is free software; you can redistribute it and/or 0009 * modify it under the terms of the GNU Lesser General Public 0010 * License as published by the Free Software Foundation; either 0011 * version 2 of the License, or (at your option) any later version. 0012 * 0013 * This library is distributed in the hope that it will be useful, 0014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 0015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0016 * Lesser General Public License for more details. 0017 * 0018 * You should have received a copy of the GNU Lesser General Public 0019 * License along with this library; if not, write to the Free Software 0020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 0021 * 0022 */ 0023 0024 #include "regexp.h" 0025 #include "lexer.h" 0026 0027 #include <assert.h> 0028 #include <stdio.h> 0029 #include <stdlib.h> 0030 #include <string.h> 0031 #include <wtf/Vector.h> 0032 0033 #if defined _WIN32 || defined _WIN64 0034 #define HAVE_SYS_TIME_H 0 0035 #endif 0036 #if HAVE_SYS_TIME_H 0037 #include <sys/time.h> 0038 0039 static const rlim_t sWantedStackSizeLimit = 32 * 1024 * 1024; 0040 0041 #endif 0042 0043 using WTF::Vector; 0044 0045 // GCC cstring uses these automatically, but not all implementations do. 0046 using std::strlen; 0047 using std::strcpy; 0048 using std::strncpy; 0049 using std::memset; 0050 using std::memcpy; 0051 0052 namespace KJS 0053 { 0054 0055 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown; 0056 0057 static bool sanitizePatternExtensions(UString &p, WTF::Vector<int> *parenIdx = nullptr); 0058 0059 // JS regexps can contain Unicode escape sequences (\uxxxx) which 0060 // are rather uncommon elsewhere. As our regexp libs don't understand 0061 // them we do the unescaping ourselves internally. 0062 // Also make sure to expand out any nulls as pcre_compile 0063 // expects null termination.. 0064 static UString sanitizePattern(const UString &p) 0065 { 0066 UString np; 0067 0068 bool changed = false; 0069 const char *const nil = "\\x00"; 0070 if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) { 0071 bool escape = false; 0072 changed = true; 0073 for (int i = 0; i < p.size(); ++i) { 0074 UChar c = p[i]; 0075 if (escape) { 0076 escape = false; 0077 // we only care about \u 0078 if (c == 'u') { 0079 // standard unicode escape sequence looks like \uxxxx but 0080 // other browsers also accept less than 4 hex digits 0081 unsigned short u = 0; 0082 int j = 0; 0083 for (j = 0; j < 4; ++j) { 0084 if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) { 0085 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode()); 0086 ++i; 0087 } else { 0088 // sequence incomplete. restore index. 0089 // TODO: cleaner way to propagate warning 0090 fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j); 0091 i -= j; 0092 break; 0093 } 0094 } 0095 if (j < 4) { 0096 // sequence was incomplete. treat \u as u which IE always 0097 // and FF sometimes does. 0098 np.append(UString('u')); 0099 } else { 0100 c = UChar(u); 0101 switch (u) { 0102 case 0: 0103 // Make sure to encode 0, to avoid terminating the string 0104 np += UString(nil); 0105 break; 0106 case '^': 0107 case '$': 0108 case '\\': 0109 case '.': 0110 case '*': 0111 case '+': 0112 case '?': 0113 case '(': case ')': 0114 case '{': case '}': 0115 case '[': case ']': 0116 case '|': 0117 // escape pattern characters have to remain escaped 0118 np.append(UString('\\')); 0119 // intentional fallthrough 0120 default: 0121 np += UString(&c, 1); 0122 break; 0123 } 0124 } 0125 continue; 0126 } 0127 np += UString('\\'); 0128 np += UString(&c, 1); 0129 } else { 0130 if (c == '\\') { 0131 escape = true; 0132 } else if (c == '\0') { 0133 np += UString(nil); 0134 } else { 0135 np += UString(&c, 1); 0136 } 0137 } 0138 } 0139 } 0140 // Rewrite very inefficient RE formulation: 0141 // (.|\s)+ is often used instead of the less intuitive, but vastly preferable [\w\W]+ 0142 // The first wording needs to recurse at each character matched in libPCRE, leading to rapid exhaustion of stack space. 0143 if (p.find(".|\\s)") >= 0) { 0144 if (np.isEmpty()) { 0145 np = p; 0146 } 0147 bool didRewrite = false; 0148 WTF::Vector<int> parenIdx; 0149 sanitizePatternExtensions(np, &parenIdx); 0150 Vector<int>::const_iterator end = parenIdx.end(); 0151 int previdx = 0; 0152 UString tmp; 0153 bool nonCapturing = false; 0154 for (Vector<int>::const_iterator it = parenIdx.begin(); it != end; ++it) { 0155 int idx = *it; 0156 if (np.size() < idx + 6) { 0157 break; 0158 } 0159 if (np[idx + 1] == '?' && np[idx + 2] == ':') { 0160 nonCapturing = true; 0161 idx += 3; 0162 } else { 0163 ++idx; 0164 } 0165 if (!(np[idx] == '.' && np[idx + 1] == '|' && np[idx + 2] == '\\' && np[idx + 3] == 's')) { 0166 continue; 0167 } 0168 if (np.size() >= idx + 6 && (np[idx + 5] == '+' || (np[idx + 5] == '*')) && 0169 // no need to do anything if the pattern is minimal e.g. (.|\s)+? 0170 !(np.size() > idx + 6 && np[idx + 6] == '?')) { 0171 didRewrite = true; 0172 if (nonCapturing) { // trivial case: (?:.|\s)+ => [\w\W]+ 0173 tmp.append(np, previdx, idx - previdx - 3); 0174 tmp.append("[\\w\\W]"); 0175 tmp.append(np[idx + 5]); 0176 } else if (np[idx + 5] == '*') { // capture zero of one or more: (.|\s)* => (?:[\w\W]*([\w\W])|[\w\W]?) 0177 tmp.append(np, previdx, idx - previdx - 1); 0178 tmp.append("(?:[\\w\\W]*([\\w\\W])|[\\w\\W]?)"); 0179 } else { // capture last of one or more: (.|\s)+ => [\w\W]*([\w\W]) 0180 assert(np[idx + 5] == '+'); 0181 tmp.append(np, previdx, idx - previdx - 1); 0182 tmp.append("[\\w\\W]*([\\w\\W])"); 0183 } 0184 } else { 0185 tmp.append(np, previdx, idx - previdx + 5); 0186 } 0187 previdx = idx + 6; 0188 } 0189 if (didRewrite) { 0190 tmp.append(np, previdx); 0191 fprintf(stderr, "Pattern: %s ", np.ascii()); 0192 fprintf(stderr, "was rewritten to: %s\n", tmp.ascii()); 0193 np = tmp; 0194 changed = true; 0195 } 0196 } 0197 return (changed ? np : p); 0198 } 0199 0200 // For now, the only 'extension' to standard we are willing to deal with is 0201 // a non-escaped closing bracket, outside of a character class. e.g. /.*]/ 0202 static bool sanitizePatternExtensions(UString &p, WTF::Vector<int> *parenIdx) 0203 { 0204 UString newPattern; 0205 0206 static const int StateNominal = 0, StateOpenBracket = 1; 0207 WTF::Vector<int> v; 0208 bool escape = false; 0209 0210 int state = StateNominal; 0211 int escapedSinceLastParen = 0; 0212 for (int i = 0; i < p.size(); ++i) { 0213 UChar c = p[i]; 0214 if (escape) { 0215 escape = false; 0216 } else { 0217 if (c == '\\') { 0218 escape = true; 0219 } else if (c == ']') { 0220 if (state == StateOpenBracket) { 0221 state = StateNominal; 0222 } else if (state == StateNominal) { 0223 v.append(i); 0224 ++escapedSinceLastParen; 0225 } 0226 } else if (c == '[') { 0227 if (state == StateOpenBracket) { 0228 v.append(i); 0229 ++escapedSinceLastParen; 0230 } else if (state == StateNominal) { 0231 state = StateOpenBracket; 0232 } 0233 } else if (c == '(') { 0234 if (parenIdx && state == StateNominal) { 0235 parenIdx->append(i + escapedSinceLastParen); 0236 escapedSinceLastParen = 0; 0237 } 0238 } 0239 } 0240 } 0241 if (state == StateOpenBracket) { 0242 // this is not recoverable. 0243 return false; 0244 } 0245 if (v.size()) { 0246 int pos = 0; 0247 Vector<int>::const_iterator end = v.end(); 0248 for (Vector<int>::const_iterator it = v.begin(); it != end; ++it) { 0249 newPattern += p.substr(pos, *it - pos); 0250 pos = *it; 0251 newPattern += UString('\\'); 0252 } 0253 newPattern += p.substr(pos); 0254 p = newPattern; 0255 return true; 0256 } else { 0257 return false; 0258 } 0259 } 0260 0261 bool RegExp::tryGrowingMaxStackSize = true; 0262 bool RegExp::didIncreaseMaxStackSize = false; 0263 0264 #if HAVE(SYS_TIME_H) 0265 rlim_t RegExp::availableStackSize = 8 * 1024 * 1024; 0266 #else 0267 int RegExp::availableStackSize = 8 * 1024 * 1024; 0268 #endif 0269 0270 RegExp::RegExp(const UString &p, char flags) 0271 : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0) 0272 { 0273 #if HAVE_PCREPOSIX 0274 // Determine whether libpcre has unicode support if need be.. 0275 if (utf8Support == Unknown) { 0276 int supported; 0277 pcre_config(PCRE_CONFIG_UTF8, (void *)&supported); 0278 utf8Support = supported ? Supported : Unsupported; 0279 } 0280 #endif 0281 0282 UString intern = sanitizePattern(p); 0283 0284 #if HAVE_PCREPOSIX 0285 int options = 0; 0286 0287 // we are close but not 100% the same as Perl 0288 #ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7 0289 options |= PCRE_JAVASCRIPT_COMPAT; 0290 #endif 0291 0292 // Note: the Global flag is already handled by RegExpProtoFunc::execute. 0293 // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute. 0294 if (flags & IgnoreCase) { 0295 options |= PCRE_CASELESS; 0296 } 0297 if (flags & Multiline) { 0298 options |= PCRE_MULTILINE; 0299 } 0300 0301 if (utf8Support == Supported) { 0302 options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK); 0303 } 0304 0305 const char *errorMessage; 0306 int errorOffset; 0307 bool secondTry = false; 0308 0309 while (1) { 0310 RegExpStringContext converted(intern); 0311 0312 _regex = pcre_compile(converted.buffer(), options, &errorMessage, &errorOffset, nullptr); 0313 0314 if (!_regex) { 0315 #ifdef PCRE_JAVASCRIPT_COMPAT 0316 // The compilation failed. It is likely the pattern contains non-standard extensions. 0317 // We may try to tolerate some of those extensions. 0318 bool doRecompile = !secondTry && sanitizePatternExtensions(intern); 0319 if (doRecompile) { 0320 secondTry = true; 0321 #ifndef NDEBUG 0322 fprintf(stderr, "KJS: pcre_compile() failed with '%s' - non-standard extensions detected in pattern, trying second compile after correction.\n", errorMessage); 0323 #endif 0324 continue; 0325 } 0326 #endif 0327 #ifndef NDEBUG 0328 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage); 0329 #endif 0330 _valid = false; 0331 return; 0332 } 0333 break; 0334 } 0335 0336 #ifdef PCRE_INFO_CAPTURECOUNT 0337 // Get number of subpatterns that will be returned. 0338 pcre_fullinfo(_regex, nullptr, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns); 0339 #endif 0340 0341 #else /* HAVE_PCREPOSIX */ 0342 0343 int regflags = 0; 0344 #ifdef REG_EXTENDED 0345 regflags |= REG_EXTENDED; 0346 #endif 0347 #ifdef REG_ICASE 0348 if (flags & IgnoreCase) { 0349 regflags |= REG_ICASE; 0350 } 0351 #endif 0352 0353 //NOTE: Multiline is not feasible with POSIX regex. 0354 //if ( f & Multiline ) 0355 // ; 0356 // Note: the Global flag is already handled by RegExpProtoFunc::execute 0357 0358 int errorCode = regcomp(&_regex, intern.ascii(), regflags); 0359 if (errorCode != 0) { 0360 #ifndef NDEBUG 0361 char errorMessage[80]; 0362 regerror(errorCode, &_regex, errorMessage, sizeof errorMessage); 0363 fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage); 0364 #endif 0365 _valid = false; 0366 } 0367 #endif 0368 } 0369 0370 RegExp::~RegExp() 0371 { 0372 #if HAVE_PCREPOSIX 0373 pcre_free(_regex); 0374 #else 0375 /* TODO: is this really okay after an error ? */ 0376 regfree(&_regex); 0377 #endif 0378 } 0379 0380 void RegExpStringContext::prepareUtf8(const UString &s) 0381 { 0382 // Allocate a buffer big enough to hold all the characters plus \0 0383 const int length = s.size(); 0384 _buffer = new char[length * 3 + 1]; 0385 0386 // Also create buffer for positions. We need one extra character in there, 0387 // even past the \0 since the non-empty handling may jump one past the end 0388 _originalPos = new int[length * 3 + 2]; 0389 0390 // Convert to runs of 8-bit characters, and generate indices 0391 // Note that we do NOT combine surrogate pairs here, as 0392 // regexps operate on them as separate characters 0393 char *p = _buffer; 0394 int *posOut = _originalPos; 0395 const UChar *d = s.data(); 0396 for (int i = 0; i != length; ++i) { 0397 unsigned short c = d[i].unicode(); 0398 0399 int sequenceLen; 0400 if (c < 0x80) { 0401 *p++ = (char)c; 0402 sequenceLen = 1; 0403 } else if (c < 0x800) { 0404 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 0405 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 0406 sequenceLen = 2; 0407 } else { 0408 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 0409 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set 0410 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 0411 sequenceLen = 3; 0412 } 0413 0414 while (sequenceLen > 0) { 0415 *posOut = i; 0416 ++posOut; 0417 --sequenceLen; 0418 } 0419 } 0420 0421 _bufferSize = p - _buffer; 0422 0423 *p++ = '\0'; 0424 0425 // Record positions for \0, and the fictional character after that. 0426 *posOut = length; 0427 *(posOut + 1) = length + 1; 0428 } 0429 0430 void RegExpStringContext::prepareASCII(const UString &s) 0431 { 0432 _originalPos = nullptr; 0433 0434 // Best-effort attempt to get something done 0435 // when we don't have utf 8 available -- use 0436 // truncated version, and pray for the best 0437 CString truncated = s.cstring(); 0438 _buffer = new char[truncated.size() + 1]; 0439 memcpy(_buffer, truncated.c_str(), truncated.size()); 0440 _buffer[truncated.size()] = '\0'; // For _compile use 0441 _bufferSize = truncated.size(); 0442 } 0443 0444 RegExpStringContext::RegExpStringContext(const UString &s) 0445 { 0446 #ifndef NDEBUG 0447 _originalS = s; 0448 #endif 0449 0450 if (RegExp::utf8Support == RegExp::Supported) { 0451 prepareUtf8(s); 0452 } else { 0453 prepareASCII(s); 0454 } 0455 } 0456 0457 RegExpStringContext::~RegExpStringContext() 0458 { 0459 delete[] _originalPos; _originalPos = nullptr; 0460 delete[] _buffer; _buffer = nullptr; 0461 } 0462 0463 UString RegExp::match(const RegExpStringContext &ctx, const UString &s, bool *error, int i, int *pos, int **ovector) 0464 { 0465 #ifndef NDEBUG 0466 assert(s.data() == ctx._originalS.data()); // Make sure the context is right.. 0467 #endif 0468 0469 if (i < 0) { 0470 i = 0; 0471 } 0472 int dummyPos; 0473 if (!pos) { 0474 pos = &dummyPos; 0475 } 0476 *pos = -1; 0477 if (ovector) { 0478 *ovector = nullptr; 0479 } 0480 0481 if (i > s.size() || s.isNull()) { 0482 return UString::null(); 0483 } 0484 0485 #if HAVE_PCREPOSIX 0486 0487 if (!_regex) { 0488 return UString::null(); 0489 } 0490 0491 // Set up the offset vector for the result. 0492 // First 2/3 used for result, the last third used by PCRE. 0493 int *offsetVector; 0494 int offsetVectorSize; 0495 int fixedSizeOffsetVector[3]; 0496 if (!ovector) { 0497 offsetVectorSize = 3; 0498 offsetVector = fixedSizeOffsetVector; 0499 } else { 0500 offsetVectorSize = (_numSubPatterns + 1) * 3; 0501 offsetVector = new int [offsetVectorSize]; 0502 } 0503 0504 int startPos; 0505 if (utf8Support == Supported) { 0506 startPos = i; 0507 while (ctx.originalPos(startPos) < i) { 0508 ++startPos; 0509 } 0510 } else { 0511 startPos = i; 0512 } 0513 0514 int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0; 0515 0516 // See if we have to limit stack space... 0517 *error = false; 0518 int stackGlutton = 0; 0519 pcre_config(PCRE_CONFIG_STACKRECURSE, (void *)&stackGlutton); 0520 pcre_extra limits; 0521 if (stackGlutton) { 0522 #if HAVE(SYS_TIME_H) 0523 if (tryGrowingMaxStackSize) { 0524 rlimit l; 0525 getrlimit(RLIMIT_STACK, &l); 0526 availableStackSize = l.rlim_cur; 0527 if (l.rlim_cur < sWantedStackSizeLimit && 0528 (l.rlim_max > l.rlim_cur || l.rlim_max == RLIM_INFINITY)) { 0529 l.rlim_cur = (l.rlim_max == RLIM_INFINITY) ? 0530 sWantedStackSizeLimit : std::min(l.rlim_max, sWantedStackSizeLimit); 0531 if ((didIncreaseMaxStackSize = !setrlimit(RLIMIT_STACK, &l))) { 0532 availableStackSize = l.rlim_cur; 0533 } 0534 } 0535 tryGrowingMaxStackSize = false; 0536 } 0537 #endif 0538 0539 limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION; 0540 // libPCRE docs claim that it munches about 500 bytes per recursion. 0541 // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes 0542 // (and I've measured 800 in an another instance) 0543 // We go somewhat conservative, and use about 3/4ths of that, 0544 // especially since we're not exactly light on the stack, either 0545 limits.match_limit_recursion = (availableStackSize / 1300) * 3 / 4; 0546 } 0547 0548 const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : nullptr, ctx.buffer(), 0549 ctx.bufferSize(), startPos, baseFlags, offsetVector, offsetVectorSize); 0550 0551 //Now go through and patch up the offsetVector 0552 if (utf8Support == Supported) 0553 for (int c = 0; c < 2 * numMatches; ++c) 0554 if (offsetVector[c] != -1) { 0555 offsetVector[c] = ctx.originalPos(offsetVector[c]); 0556 } 0557 0558 if (numMatches < 0) { 0559 #ifndef NDEBUG 0560 if (numMatches != PCRE_ERROR_NOMATCH) { 0561 fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches); 0562 } 0563 #endif 0564 if (offsetVector != fixedSizeOffsetVector) { 0565 delete [] offsetVector; 0566 } 0567 if (numMatches == PCRE_ERROR_MATCHLIMIT || numMatches == PCRE_ERROR_RECURSIONLIMIT) { 0568 *error = true; 0569 } 0570 return UString::null(); 0571 } 0572 0573 *pos = offsetVector[0]; 0574 if (ovector) { 0575 *ovector = offsetVector; 0576 } 0577 return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]); 0578 0579 #else 0580 0581 if (!_valid) { 0582 return UString::null(); 0583 } 0584 0585 const unsigned maxMatch = 10; 0586 regmatch_t rmatch[maxMatch]; 0587 0588 char *str = strdup(s.ascii()); // TODO: why ??? 0589 if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) { 0590 free(str); 0591 return UString::null(); 0592 } 0593 free(str); 0594 0595 if (!ovector) { 0596 *pos = rmatch[0].rm_so + i; 0597 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so); 0598 } 0599 0600 // map rmatch array to ovector used in PCRE case 0601 _numSubPatterns = 0; 0602 for (unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0; j++) { 0603 _numSubPatterns++; 0604 } 0605 int ovecsize = (_numSubPatterns + 1) * 3; // see above 0606 *ovector = new int[ovecsize]; 0607 for (unsigned j = 0; j < _numSubPatterns + 1; j++) { 0608 if (j > maxMatch) { 0609 break; 0610 } 0611 (*ovector)[2 * j] = rmatch[j].rm_so + i; 0612 (*ovector)[2 * j + 1] = rmatch[j].rm_eo + i; 0613 } 0614 0615 *pos = (*ovector)[0]; 0616 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]); 0617 0618 #endif 0619 } 0620 0621 } // namespace KJS