kregexpeditor/src/qregexpparser.l

0001 /*
0002  *  Copyright (c) 2002-2003 Jesper K. Pedersen <blackie@kde.org>
0003  *
0004  *  This library is free software; you can redistribute it and/or
0005  *  modify it under the terms of the GNU Library General Public
0006  *  License version 2 as published by the Free Software Foundation.
0007  *
0008  *  This library is distributed in the hope that it will be useful,
0009  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
0010  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0011  *  Library General Public License for more details.
0012  *
0013  *  You should have received a copy of the GNU Library General Public License
0014  *  along with this library; see the file COPYING.LIB.  If not, write to
0015  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0016  *  Boston, MA 02110-1301, USA.
0017  **/
0018 %option noyywrap
0019 %option nounput
0020 %option reentrant
0021 %option bison-bridge
0022
0023 %{
0024
  #include <climits>

  #include "textrangeregexp.h"
  #include "gen_qregexpparser.hh"
  #include "qregexpparsercommon.h"
0028
0029   void parseRange( const char* txt, int* min, int* max );
0030   RegExp* parseCharClass( const char* match );
0031 %}
0032
0033 Escape   \\.
0034 BackRef  \\[1-9][0-9]*
0035 CharClass \[^?\]?[^]]*\]
0036 Range \{[0-9]*(,[0-9]*)?\}
0037 HexChar \\x[0-9a-fA-F]{1,4}
0038 OctChar \\0[0-7]{1,4}
0039 SpecialEsc \\[afnrtv]
0040 %%
0041 "\\b"      return TOK_PosWordChar;
0042 "\\B"      return TOK_PosNonWordChar;
0043 "\\d"      {
0044              TextRangeRegExp* regexp = new TextRangeRegExp( false );
0045              regexp->setDigit( true );
0046              yylval->regexp = regexp;
0047              return TOK_CharClass;
0048            }
0049 "\\D"      {
0050              TextRangeRegExp* regexp = new TextRangeRegExp( false );
0051              regexp->setNonDigit( true );
0052              yylval->regexp = regexp;
0053              return TOK_CharClass;
0054            }
0055 "\\s"      {
0056              TextRangeRegExp* regexp = new TextRangeRegExp( false );
0057              regexp->setSpace( true );
0058              yylval->regexp = regexp;
0059              return TOK_CharClass;
0060            }
0061 "\\S"      {
0062              TextRangeRegExp* regexp = new TextRangeRegExp( false );
0063              regexp->setNonSpace( true );
0064              yylval->regexp = regexp;
0065              return TOK_CharClass;
0066            }
0067 "\\w"      {
0068              TextRangeRegExp* regexp = new TextRangeRegExp( false );
0069              regexp->setWordChar( true );
0070              yylval->regexp = regexp;
0071              return TOK_CharClass;
0072            }
0073 "\\W"      {
0074              TextRangeRegExp* regexp = new TextRangeRegExp( false );
0075              regexp->setNonWordChar( true );
0076              yylval->regexp = regexp;
0077              return TOK_CharClass;
0078            }
0079 {SpecialEsc} {
0080                TextRangeRegExp* regexp = new TextRangeRegExp( false );
0081                regexp->addCharacter( QString::fromLocal8Bit( yytext ) );
0082                yylval->regexp = regexp;
0083                return TOK_CharClass;
0084              }
0085
0086 {HexChar}  {
0087              TextRangeRegExp* regexp = new TextRangeRegExp( false );
0088              regexp->addCharacter( QString::fromLocal8Bit(yytext) );
0089              yylval->regexp = regexp;
0090              return TOK_CharClass;
0091            }
0092 {OctChar}  {
0093              TextRangeRegExp* regexp = new TextRangeRegExp( false );
0094              regexp->addCharacter( QString::fromLocal8Bit(yytext) );
0095              yylval->regexp = regexp;
0096              return TOK_CharClass;
0097            }
0098 "."        return TOK_Dot;
0099 "$"        return TOK_Dollar;
0100 "^"        return TOK_Carat;
0101 "(?:"      return TOK_MagicLeftParent;
0102 "(?="      return TOK_PosLookAhead;
0103 "(?!"      return TOK_NegLookAhead;
0104 "("        return TOK_LeftParen;
0105 ")"        return TOK_RightParent;
0106 "|"        return TOK_Bar;
0107 "*"        { yylval->range.min = 0; yylval->range.max=-1; return TOK_Quantifier; }
0108 "?"        { yylval->range.min = 0; yylval->range.max=1;  return TOK_Quantifier; }
0109 "+"        { yylval->range.min = 1; yylval->range.max=-1; return TOK_Quantifier; }
0110 {Range}     { parseRange( yytext, &yylval->range.min, &yylval->range.max ); return TOK_Quantifier; }
0111 {CharClass} { yylval->regexp = parseCharClass(yytext); return TOK_CharClass; }
0112 {BackRef}   { yylval->backRef = atoi( yytext+1 ); return TOK_BackRef; }
0113 {Escape}    { yylval->ch = yytext[1]; return TOK_EscapeChar; }
0114 .           { yylval->ch = yytext[0]; return TOK_Char; }
0115
0116 %%
0117
0118 void scannerInit( yyscan_t *scanner, struct parse_context *context, const QString& qstr )
0119 {
0120   yylex_init( scanner );
0121   yyset_extra( context, *scanner );
0122   QByteArray cstr;
0123   if ( !qstr.isNull() )
0124     cstr = qstr.toLatin1();
0125   yy_switch_to_buffer( yy_scan_string( cstr.constData(), *scanner ), *scanner );
0126 }
0127
0128 void scannerDestroy( yyscan_t scanner )
0129 {
0130   yylex_destroy( scanner );
0131 }
0132
0133 /**
0134    This function parses a range in a form similar to "{3,4}", "{,7}"
0135    etc. and returns the value in the integers pointed to by min and max.
0136 */
0137 void parseRange( const char* txt, int* min, int* max )
0138 {
0139
0140   /*
0141       case  txt   min  max
0142        1    {}     0   -1
0143        2    {,}    0   -1
0144        3    {5}    5    5
0145        4    {5,}   5   -1
0146        5    {,7}   0    7
0147        6    {5,7}  5    7
0148   */
0149   char c;
0150   int i = 1;
0151   int minimum=0, maximum=0;
0152   int minFound=0, maxFound=0, commaFound = 0;
0153
0154   while ( (c = txt[i++]) != ',' && c != '}') {
0155     minimum = minimum*10+ c-'0';
0156     minFound=1;
0157   }
0158
0159   if ( c == ',' )
0160     commaFound = 1;
0161
0162   if ( c != '}' ) {
0163     while ( (c = txt[i++]) != '}') {
0164       maximum = maximum*10+ c-'0';
0165       maxFound = 1;
0166     }
0167   }
0168
0169   *min = minimum;
0170   if ( maxFound )
0171     *max = maximum;   /* case 5,6 */
0172   else if ( !minFound )
0173     *max = -1;        /* case 1,2 */
0174   else if ( commaFound )
0175     *max = -1;        /* case 4 */
0176   else
0177     *max = minimum;   /* case 3 */
0178 }
0179
0180
0181 /**
0182     This function parses a character range like "[^ab1-4]".
0183 */
0184 RegExp* parseCharClass( const char* match )
0185 {
0186   TextRangeRegExp* res = new TextRangeRegExp( false );
0187   QString txt = QString::fromLocal8Bit( match );
0188   if(txt.length() <= 2)
0189     return res;
0190   txt = txt.mid(1,txt.length()-2);
0191
0192   int i = 0;
0193   QChar ch = txt.at(i++);
0194   QString pendingChar;
0195   QString thisChar;
0196   bool charPending = false;
0197   bool rangePending = false;
0198   bool flushPending = false;
0199
0200   if ( i < txt.length() && ch == QLatin1Char('^') ) {
0201     res->setNegate( true );
0202     ch = txt.at(i++);
0203   }
0204
0205   do {
0206     // If a character is pending, and the next char is '-' then we are
0207     // possible looking at a range.
0208     if ( i < txt.length() && ch == QLatin1Char('-') && charPending ) {
0209       rangePending = true;
0210       ch = txt.at(i++);
0211       continue;
0212     }
0213
0214     // If we have a pending character, but do not also have a pending
0215     // range, then the pending character was not part of a range, and
0216     // should therefore just be added as a single character.
0217     if ( charPending && !rangePending ) {
0218       res->addCharacter( pendingChar );
0219       charPending = false;
0220     }
0221
0222     if ( ch == QLatin1Char('\\') ) {
0223       // Handle the cases where an escape character is specified.
0224       ch = txt.at(i++);
0225
0226       if ( ch == QLatin1Char('a') || ch == QLatin1Char('f') || ch == QLatin1Char('n') || ch == QLatin1Char('r') || ch == QLatin1Char('t') || ch == QLatin1Char('v') ) {
0227         // These are just seen as normal characters.
0228         thisChar = QString::fromLocal8Bit("\\") + ch;
0229       }
0230       else if ( ch == QLatin1Char('d') ) {
0231         // The following characters represent character groups. If any of
0232         // these are seen in a range, then the range is ignored, thus [a-\s]
0233         // matches an 'a', a '-', and a space (\s means space).
0234         res->setDigit( true );
0235         flushPending = true;
0236       }
0237       else if ( ch == QLatin1Char('D') ) {
0238         res->setNonDigit( true );
0239         flushPending = true;
0240       }
0241       else if ( ch == QLatin1Char('s') ) {
0242         res->setSpace( true );
0243         flushPending = true;
0244       }
0245       else if ( ch == QLatin1Char('S') ) {
0246         res->setNonSpace( true );
0247         flushPending = true;
0248       }
0249       else if ( ch == QLatin1Char('w') ) {
0250         res->setWordChar( true );
0251         flushPending = true;
0252       }
0253       else if ( ch == QLatin1Char('W') ) {
0254         res->setNonWordChar( true );
0255         flushPending = true;
0256       }
0257       else if ( ch == QLatin1Char('x') || ch == QLatin1Char('X') ) {
0258         // This is a hexidecimal character: \xHHHH
0259         QString str;
0260         for ( int j=0; j<4; j++) {
0261           ch = txt.at(i++);
0262             if ( ch == QLatin1Char('a') || ch == QLatin1Char('A') || ch == QLatin1Char('b') || ch == QLatin1Char('B') ||
0263                  ch == QLatin1Char('c') || ch == QLatin1Char('C') || ch == QLatin1Char('d') || ch == QLatin1Char('D') ||
0264                  ch == QLatin1Char('e') || ch == QLatin1Char('E') || ch == QLatin1Char('f') || ch == QLatin1Char('F') ||
0265                  ch == QLatin1Char('0') || ch == QLatin1Char('1') || ch == QLatin1Char('2') || ch == QLatin1Char('3') ||
0266                  ch == QLatin1Char('4') || ch == QLatin1Char('5') || ch == QLatin1Char('6') || ch == QLatin1Char('7') ||
0267                  ch == QLatin1Char('8') || ch == QLatin1Char('9') )
0268               str += ch;
0269             else
0270               i--;
0271         }
0272         thisChar = QString::fromLocal8Bit("\\x") + str;
0273       }
0274       else if ( ch == QLatin1Char('0') ) {
0275         // This is an octal character
0276         QString str;
0277         for ( int j=0; j<4; j++) {
0278           ch = txt.at(i++);
0279           if ( ch == QLatin1Char('0') || ch == QLatin1Char('1') || ch == QLatin1Char('2') || ch == QLatin1Char('3')
0280                || ch == QLatin1Char('4') || ch == QLatin1Char('5') || ch == QLatin1Char('6') || ch == QLatin1Char('7') )
0281             str += ch;
0282           else
0283             i--;
0284         }
0285         thisChar = QString::fromLocal8Bit("\\x") + str ;
0286       }
0287       else {
0288         // Anything else escaped just means the character itself.
0289         thisChar = ch;
0290       }
0291     }
0292     else {
0293       // A non escaped character.
0294       thisChar = ch;
0295     }
0296
0297     // The characters \s,\S,\w,\W,\d or \D, can not be part of a range,
0298     // thus if they are meet in what looks like a range, then the
0299     // characters of the range is justed seen as normal non range
0300     // characters. thus [a-\s] matches an 'a', a '-', and a space (\s means
0301     // space).
0302     if ( flushPending ) {
0303       if ( charPending )
0304         res->addCharacter( pendingChar );
0305       if ( rangePending )
0306         res->addCharacter( QString::fromLocal8Bit("-") );
0307       flushPending = false;
0308       charPending = false;
0309       rangePending = false;
0310     }
0311     else {
0312       if ( rangePending ) {
0313         res->addRange( pendingChar, thisChar );
0314         charPending = false;
0315         rangePending = false;
0316       }
0317       else {
0318         pendingChar = thisChar;
0319         charPending = true;
0320       }
0321     }
0322     if ( i == txt.length() )
0323         break;
0324     ch = txt.at(i++);
0325   }
0326   while ( ch != QLatin1Char(']') );
0327
0328   if ( charPending )
0329     res->addCharacter( pendingChar );
0330   if ( rangePending )
0331     res->addCharacter( QString::fromLocal8Bit("-") );
0332
0333   return res;
0334 }