src/kjs/regexp.cpp

0001 /*
0002  *  This file is part of the KDE libraries
0003  *  Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
0004  *  Copyright (C) 2003,2004 Apple Computer, Inc.
0005  *  Copyright (C) 2006      Maksim Orlovich (maksim@kde.org)
0006  *  Copyright (C) 2007      Sune Vuorela (debian@pusling.com)
0007  *
0008  *  This library is free software; you can redistribute it and/or
0009  *  modify it under the terms of the GNU Lesser General Public
0010  *  License as published by the Free Software Foundation; either
0011  *  version 2 of the License, or (at your option) any later version.
0012  *
0013  *  This library is distributed in the hope that it will be useful,
0014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
0015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0016  *  Lesser General Public License for more details.
0017  *
0018  *  You should have received a copy of the GNU Lesser General Public
0019  *  License along with this library; if not, write to the Free Software
0020  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
0021  *
0022  */
0023
0024 #include "regexp.h"
0025 #include "lexer.h"
0026
0027 #include <assert.h>
0028 #include <stdio.h>
0029 #include <stdlib.h>
0030 #include <string.h>
0031 #include <wtf/Vector.h>
0032
0033 #if defined _WIN32 || defined _WIN64
0034 #define HAVE_SYS_TIME_H 0
0035 #endif
0036 #if HAVE_SYS_TIME_H
0037 #include <sys/time.h>
0038
0039 static const rlim_t sWantedStackSizeLimit = 32 * 1024 * 1024;
0040
0041 #endif
0042
0043 using WTF::Vector;
0044
0045 // GCC cstring uses these automatically, but not all implementations do.
0046 using std::strlen;
0047 using std::strcpy;
0048 using std::strncpy;
0049 using std::memset;
0050 using std::memcpy;
0051
0052 namespace KJS
0053 {
0054
0055 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
0056
0057 static bool sanitizePatternExtensions(UString &p, WTF::Vector<int> *parenIdx = nullptr);
0058
0059 // JS regexps can contain Unicode escape sequences (\uxxxx) which
0060 // are rather uncommon elsewhere. As our regexp libs don't understand
0061 // them we do the unescaping ourselves internally.
0062 // Also make sure to expand out any nulls as pcre_compile
0063 // expects null termination..
0064 static UString sanitizePattern(const UString &p)
0065 {
0066     UString np;
0067
0068     bool changed = false;
0069     const char *const nil = "\\x00";
0070     if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) {
0071         bool escape = false;
0072         changed = true;
0073         for (int i = 0; i < p.size(); ++i) {
0074             UChar c = p[i];
0075             if (escape) {
0076                 escape = false;
0077                 // we only care about \u
0078                 if (c == 'u') {
0079                     // standard unicode escape sequence looks like \uxxxx but
0080                     // other browsers also accept less than 4 hex digits
0081                     unsigned short u = 0;
0082                     int j = 0;
0083                     for (j = 0; j < 4; ++j) {
0084                         if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
0085                             u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
0086                             ++i;
0087                         } else {
0088                             // sequence incomplete. restore index.
0089                             // TODO: cleaner way to propagate warning
0090                             fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
0091                             i -= j;
0092                             break;
0093                         }
0094                     }
0095                     if (j < 4) {
0096                         // sequence was incomplete. treat \u as u which IE always
0097                         // and FF sometimes does.
0098                         np.append(UString('u'));
0099                     } else {
0100                         c = UChar(u);
0101                         switch (u) {
0102                         case 0:
0103                             // Make sure to encode 0, to avoid terminating the string
0104                             np += UString(nil);
0105                             break;
0106                         case '^':
0107                         case '$':
0108                         case '\\':
0109                         case '.':
0110                         case '*':
0111                         case '+':
0112                         case '?':
0113                         case '(': case ')':
0114                         case '{': case '}':
0115                         case '[': case ']':
0116                         case '|':
0117                             // escape pattern characters have to remain escaped
0118                             np.append(UString('\\'));
0119                         // intentional fallthrough
0120                         default:
0121                             np += UString(&c, 1);
0122                             break;
0123                         }
0124                     }
0125                     continue;
0126                 }
0127                 np += UString('\\');
0128                 np += UString(&c, 1);
0129             } else {
0130                 if (c == '\\') {
0131                     escape = true;
0132                 } else if (c == '\0') {
0133                     np += UString(nil);
0134                 } else {
0135                     np += UString(&c, 1);
0136                 }
0137             }
0138         }
0139     }
0140     // Rewrite very inefficient RE formulation:
0141     // (.|\s)+ is often used instead of the less intuitive, but vastly preferable [\w\W]+
0142     // The first wording needs to recurse at each character matched in libPCRE, leading to rapid exhaustion of stack space.
0143     if (p.find(".|\\s)") >= 0) {
0144         if (np.isEmpty()) {
0145             np = p;
0146         }
0147         bool didRewrite = false;
0148         WTF::Vector<int> parenIdx;
0149         sanitizePatternExtensions(np, &parenIdx);
0150         Vector<int>::const_iterator end = parenIdx.end();
0151         int previdx = 0;
0152         UString tmp;
0153         bool nonCapturing = false;
0154         for (Vector<int>::const_iterator it = parenIdx.begin(); it != end; ++it) {
0155             int idx = *it;
0156             if (np.size() < idx + 6) {
0157                 break;
0158             }
0159             if (np[idx + 1] == '?' && np[idx + 2] == ':') {
0160                 nonCapturing = true;
0161                 idx += 3;
0162             } else {
0163                 ++idx;
0164             }
0165             if (!(np[idx] == '.' && np[idx + 1] == '|' && np[idx + 2] == '\\' && np[idx + 3] == 's')) {
0166                 continue;
0167             }
0168             if (np.size() >= idx + 6 && (np[idx + 5] == '+' || (np[idx + 5] == '*')) &&
0169                     // no need to do anything if the pattern is minimal e.g. (.|\s)+?
0170                     !(np.size() > idx + 6 && np[idx + 6] == '?')) {
0171                 didRewrite = true;
0172                 if (nonCapturing) {               // trivial case: (?:.|\s)+ => [\w\W]+
0173                     tmp.append(np, previdx, idx - previdx - 3);
0174                     tmp.append("[\\w\\W]");
0175                     tmp.append(np[idx + 5]);
0176                 } else if (np[idx + 5] == '*') {  // capture zero of one or more: (.|\s)* => (?:[\w\W]*([\w\W])|[\w\W]?)
0177                     tmp.append(np, previdx, idx - previdx - 1);
0178                     tmp.append("(?:[\\w\\W]*([\\w\\W])|[\\w\\W]?)");
0179                 } else {                          // capture last of one or more: (.|\s)+ => [\w\W]*([\w\W])
0180                     assert(np[idx + 5] == '+');
0181                     tmp.append(np, previdx, idx - previdx - 1);
0182                     tmp.append("[\\w\\W]*([\\w\\W])");
0183                 }
0184             } else {
0185                 tmp.append(np, previdx, idx - previdx + 5);
0186             }
0187             previdx = idx + 6;
0188         }
0189         if (didRewrite) {
0190             tmp.append(np, previdx);
0191             fprintf(stderr, "Pattern: %s ", np.ascii());
0192             fprintf(stderr, "was rewritten to: %s\n", tmp.ascii());
0193             np = tmp;
0194             changed = true;
0195         }
0196     }
0197     return (changed ? np : p);
0198 }
0199
0200 // For now, the only 'extension' to standard we are willing to deal with is
0201 // a non-escaped closing bracket, outside of a character class. e.g. /.*]/
0202 static bool sanitizePatternExtensions(UString &p, WTF::Vector<int> *parenIdx)
0203 {
0204     UString newPattern;
0205
0206     static const int StateNominal = 0, StateOpenBracket = 1;
0207     WTF::Vector<int> v;
0208     bool escape = false;
0209
0210     int state = StateNominal;
0211     int escapedSinceLastParen = 0;
0212     for (int i = 0; i < p.size(); ++i) {
0213         UChar c = p[i];
0214         if (escape) {
0215             escape = false;
0216         } else {
0217             if (c == '\\') {
0218                 escape = true;
0219             } else if (c == ']') {
0220                 if (state == StateOpenBracket) {
0221                     state = StateNominal;
0222                 } else if (state == StateNominal) {
0223                     v.append(i);
0224                     ++escapedSinceLastParen;
0225                 }
0226             } else if (c == '[') {
0227                 if (state == StateOpenBracket) {
0228                     v.append(i);
0229                     ++escapedSinceLastParen;
0230                 } else if (state == StateNominal) {
0231                     state = StateOpenBracket;
0232                 }
0233             } else if (c == '(') {
0234                 if (parenIdx && state == StateNominal) {
0235                     parenIdx->append(i + escapedSinceLastParen);
0236                     escapedSinceLastParen = 0;
0237                 }
0238             }
0239         }
0240     }
0241     if (state == StateOpenBracket) {
0242         // this is not recoverable.
0243         return false;
0244     }
0245     if (v.size()) {
0246         int pos = 0;
0247         Vector<int>::const_iterator end = v.end();
0248         for (Vector<int>::const_iterator it = v.begin(); it != end; ++it) {
0249             newPattern += p.substr(pos, *it - pos);
0250             pos = *it;
0251             newPattern += UString('\\');
0252         }
0253         newPattern += p.substr(pos);
0254         p = newPattern;
0255         return true;
0256     } else {
0257         return false;
0258     }
0259 }
0260
0261 bool RegExp::tryGrowingMaxStackSize = true;
0262 bool RegExp::didIncreaseMaxStackSize = false;
0263
0264 #if HAVE(SYS_TIME_H)
0265 rlim_t RegExp::availableStackSize = 8 * 1024 * 1024;
0266 #else
0267 int RegExp::availableStackSize = 8 * 1024 * 1024;
0268 #endif
0269
0270 RegExp::RegExp(const UString &p, char flags)
0271     : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0)
0272 {
0273 #if HAVE_PCREPOSIX
0274     // Determine whether libpcre has unicode support if need be..
0275     if (utf8Support == Unknown) {
0276         int supported;
0277         pcre_config(PCRE_CONFIG_UTF8, (void *)&supported);
0278         utf8Support = supported ? Supported : Unsupported;
0279     }
0280 #endif
0281
0282     UString intern = sanitizePattern(p);
0283
0284 #if HAVE_PCREPOSIX
0285     int options = 0;
0286
0287     // we are close but not 100% the same as Perl
0288 #ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7
0289     options |= PCRE_JAVASCRIPT_COMPAT;
0290 #endif
0291
0292     // Note: the Global flag is already handled by RegExpProtoFunc::execute.
0293     // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
0294     if (flags & IgnoreCase) {
0295         options |= PCRE_CASELESS;
0296     }
0297     if (flags & Multiline) {
0298         options |= PCRE_MULTILINE;
0299     }
0300
0301     if (utf8Support == Supported) {
0302         options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
0303     }
0304
0305     const char *errorMessage;
0306     int errorOffset;
0307     bool secondTry = false;
0308
0309     while (1) {
0310         RegExpStringContext converted(intern);
0311
0312         _regex = pcre_compile(converted.buffer(), options, &errorMessage, &errorOffset, nullptr);
0313
0314         if (!_regex) {
0315 #ifdef PCRE_JAVASCRIPT_COMPAT
0316             // The compilation failed. It is likely the pattern contains non-standard extensions.
0317             // We may try to tolerate some of those extensions.
0318             bool doRecompile = !secondTry && sanitizePatternExtensions(intern);
0319             if (doRecompile) {
0320                 secondTry = true;
0321 #ifndef NDEBUG
0322                 fprintf(stderr, "KJS: pcre_compile() failed with '%s' - non-standard extensions detected in pattern, trying second compile after correction.\n", errorMessage);
0323 #endif
0324                 continue;
0325             }
0326 #endif
0327 #ifndef NDEBUG
0328             fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
0329 #endif
0330             _valid = false;
0331             return;
0332         }
0333         break;
0334     }
0335
0336 #ifdef PCRE_INFO_CAPTURECOUNT
0337     // Get number of subpatterns that will be returned.
0338     pcre_fullinfo(_regex, nullptr, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
0339 #endif
0340
0341 #else /* HAVE_PCREPOSIX */
0342
0343     int regflags = 0;
0344 #ifdef REG_EXTENDED
0345     regflags |= REG_EXTENDED;
0346 #endif
0347 #ifdef REG_ICASE
0348     if (flags & IgnoreCase) {
0349         regflags |= REG_ICASE;
0350     }
0351 #endif
0352
0353     //NOTE: Multiline is not feasible with POSIX regex.
0354     //if ( f & Multiline )
0355     //    ;
0356     // Note: the Global flag is already handled by RegExpProtoFunc::execute
0357
0358     int errorCode = regcomp(&_regex, intern.ascii(), regflags);
0359     if (errorCode != 0) {
0360 #ifndef NDEBUG
0361         char errorMessage[80];
0362         regerror(errorCode, &_regex, errorMessage, sizeof errorMessage);
0363         fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
0364 #endif
0365         _valid = false;
0366     }
0367 #endif
0368 }
0369
0370 RegExp::~RegExp()
0371 {
0372 #if HAVE_PCREPOSIX
0373     pcre_free(_regex);
0374 #else
0375     /* TODO: is this really okay after an error ? */
0376     regfree(&_regex);
0377 #endif
0378 }
0379
0380 void RegExpStringContext::prepareUtf8(const UString &s)
0381 {
0382     // Allocate a buffer big enough to hold all the characters plus \0
0383     const int length = s.size();
0384     _buffer = new char[length * 3 + 1];
0385
0386     // Also create buffer for positions. We need one extra character in there,
0387     // even past the \0 since the non-empty handling may jump one past the end
0388     _originalPos = new int[length * 3 + 2];
0389
0390     // Convert to runs of 8-bit characters, and generate indices
0391     // Note that we do NOT combine surrogate pairs here, as
0392     // regexps operate on them as separate characters
0393     char *p      = _buffer;
0394     int  *posOut = _originalPos;
0395     const UChar *d = s.data();
0396     for (int i = 0; i != length; ++i) {
0397         unsigned short c = d[i].unicode();
0398
0399         int sequenceLen;
0400         if (c < 0x80) {
0401             *p++ = (char)c;
0402             sequenceLen = 1;
0403         } else if (c < 0x800) {
0404             *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
0405             *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
0406             sequenceLen = 2;
0407         } else {
0408             *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
0409             *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
0410             *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
0411             sequenceLen = 3;
0412         }
0413
0414         while (sequenceLen > 0) {
0415             *posOut = i;
0416             ++posOut;
0417             --sequenceLen;
0418         }
0419     }
0420
0421     _bufferSize = p - _buffer;
0422
0423     *p++ = '\0';
0424
0425     // Record positions for \0, and the fictional character after that.
0426     *posOut     = length;
0427     *(posOut + 1) = length + 1;
0428 }
0429
0430 void RegExpStringContext::prepareASCII(const UString &s)
0431 {
0432     _originalPos = nullptr;
0433
0434     // Best-effort attempt to get something done
0435     // when we don't have utf 8 available -- use
0436     // truncated version, and pray for the best
0437     CString truncated = s.cstring();
0438     _buffer = new char[truncated.size() + 1];
0439     memcpy(_buffer, truncated.c_str(), truncated.size());
0440     _buffer[truncated.size()] = '\0'; // For _compile use
0441     _bufferSize = truncated.size();
0442 }
0443
0444 RegExpStringContext::RegExpStringContext(const UString &s)
0445 {
0446 #ifndef NDEBUG
0447     _originalS = s;
0448 #endif
0449
0450     if (RegExp::utf8Support == RegExp::Supported) {
0451         prepareUtf8(s);
0452     } else {
0453         prepareASCII(s);
0454     }
0455 }
0456
0457 RegExpStringContext::~RegExpStringContext()
0458 {
0459     delete[] _originalPos; _originalPos = nullptr;
0460     delete[] _buffer;      _buffer      = nullptr;
0461 }
0462
0463 UString RegExp::match(const RegExpStringContext &ctx, const UString &s, bool *error, int i, int *pos, int **ovector)
0464 {
0465 #ifndef NDEBUG
0466     assert(s.data() == ctx._originalS.data()); // Make sure the context is right..
0467 #endif
0468
0469     if (i < 0) {
0470         i = 0;
0471     }
0472     int dummyPos;
0473     if (!pos) {
0474         pos = &dummyPos;
0475     }
0476     *pos = -1;
0477     if (ovector) {
0478         *ovector = nullptr;
0479     }
0480
0481     if (i > s.size() || s.isNull()) {
0482         return UString::null();
0483     }
0484
0485 #if HAVE_PCREPOSIX
0486
0487     if (!_regex) {
0488         return UString::null();
0489     }
0490
0491     // Set up the offset vector for the result.
0492     // First 2/3 used for result, the last third used by PCRE.
0493     int *offsetVector;
0494     int offsetVectorSize;
0495     int fixedSizeOffsetVector[3];
0496     if (!ovector) {
0497         offsetVectorSize = 3;
0498         offsetVector = fixedSizeOffsetVector;
0499     } else {
0500         offsetVectorSize = (_numSubPatterns + 1) * 3;
0501         offsetVector = new int [offsetVectorSize];
0502     }
0503
0504     int startPos;
0505     if (utf8Support == Supported) {
0506         startPos = i;
0507         while (ctx.originalPos(startPos) < i) {
0508             ++startPos;
0509         }
0510     } else {
0511         startPos = i;
0512     }
0513
0514     int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0;
0515
0516     // See if we have to limit stack space...
0517     *error = false;
0518     int stackGlutton = 0;
0519     pcre_config(PCRE_CONFIG_STACKRECURSE, (void *)&stackGlutton);
0520     pcre_extra limits;
0521     if (stackGlutton) {
0522 #if HAVE(SYS_TIME_H)
0523         if (tryGrowingMaxStackSize) {
0524             rlimit l;
0525             getrlimit(RLIMIT_STACK, &l);
0526             availableStackSize = l.rlim_cur;
0527             if (l.rlim_cur < sWantedStackSizeLimit &&
0528                     (l.rlim_max > l.rlim_cur || l.rlim_max == RLIM_INFINITY)) {
0529                 l.rlim_cur = (l.rlim_max == RLIM_INFINITY) ?
0530                              sWantedStackSizeLimit : std::min(l.rlim_max, sWantedStackSizeLimit);
0531                 if ((didIncreaseMaxStackSize = !setrlimit(RLIMIT_STACK, &l))) {
0532                     availableStackSize = l.rlim_cur;
0533                 }
0534             }
0535             tryGrowingMaxStackSize = false;
0536         }
0537 #endif
0538
0539         limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION;
0540         // libPCRE docs claim that it munches about 500 bytes per recursion.
0541         // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes
0542         // (and I've measured 800 in an another instance)
0543         // We go somewhat conservative, and use about 3/4ths of that,
0544         // especially since we're not exactly light on the stack, either
0545         limits.match_limit_recursion = (availableStackSize / 1300) * 3 / 4;
0546     }
0547
0548     const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : nullptr, ctx.buffer(),
0549                                      ctx.bufferSize(), startPos, baseFlags, offsetVector, offsetVectorSize);
0550
0551     //Now go through and patch up the offsetVector
0552     if (utf8Support == Supported)
0553         for (int c = 0; c < 2 * numMatches; ++c)
0554             if (offsetVector[c] != -1) {
0555                 offsetVector[c] = ctx.originalPos(offsetVector[c]);
0556             }
0557
0558     if (numMatches < 0) {
0559 #ifndef NDEBUG
0560         if (numMatches != PCRE_ERROR_NOMATCH) {
0561             fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
0562         }
0563 #endif
0564         if (offsetVector != fixedSizeOffsetVector) {
0565             delete [] offsetVector;
0566         }
0567         if (numMatches == PCRE_ERROR_MATCHLIMIT || numMatches == PCRE_ERROR_RECURSIONLIMIT) {
0568             *error = true;
0569         }
0570         return UString::null();
0571     }
0572
0573     *pos = offsetVector[0];
0574     if (ovector) {
0575         *ovector = offsetVector;
0576     }
0577     return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
0578
0579 #else
0580
0581     if (!_valid) {
0582         return UString::null();
0583     }
0584
0585     const unsigned maxMatch = 10;
0586     regmatch_t rmatch[maxMatch];
0587
0588     char *str = strdup(s.ascii()); // TODO: why ???
0589     if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
0590         free(str);
0591         return UString::null();
0592     }
0593     free(str);
0594
0595     if (!ovector) {
0596         *pos = rmatch[0].rm_so + i;
0597         return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
0598     }
0599
0600     // map rmatch array to ovector used in PCRE case
0601     _numSubPatterns = 0;
0602     for (unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0; j++) {
0603         _numSubPatterns++;
0604     }
0605     int ovecsize = (_numSubPatterns + 1) * 3; // see above
0606     *ovector = new int[ovecsize];
0607     for (unsigned j = 0; j < _numSubPatterns + 1; j++) {
0608         if (j > maxMatch) {
0609             break;
0610         }
0611         (*ovector)[2 * j] = rmatch[j].rm_so + i;
0612         (*ovector)[2 * j + 1] = rmatch[j].rm_eo + i;
0613     }
0614
0615     *pos = (*ovector)[0];
0616     return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
0617
0618 #endif
0619 }
0620
0621 } // namespace KJS