File indexing completed on 2024-04-21 04:36:18
0001 /* This file is part of kdev-pg-qt 0002 Copyright (C) 2010 Jonathan Schmidt-DominĂ© <devel@the-user.org> 0003 0004 This library is free software; you can redistribute it and/or 0005 modify it under the terms of the GNU Library General Public 0006 License as published by the Free Software Foundation; either 0007 version 2 of the License, or (at your option) any later version. 0008 0009 This library is distributed in the hope that it will be useful, 0010 but WITHOUT ANY WARRANTY; without even the implied warranty of 0011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0012 Library General Public License for more details. 0013 0014 You should have received a copy of the GNU Library General Public License 0015 along with this library; see the file COPYING.LIB. If not, write to 0016 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0017 Boston, MA 02110-1301, USA. 0018 */ 0019 0020 #ifndef KDEV_PG_REGEXP_HELPER 0021 #define KDEV_PG_REGEXP_HELPER 0022 0023 #include <kdev-pg-char-sets.h> 0024 0025 #include <iomanip> 0026 0027 namespace KDevPG 0028 { 0029 0030 template<CharEncoding codec> 0031 class SeqCharSet; 0032 0033 inline void printChar(ostream& o, uint x) 0034 { 0035 auto flags = o.flags(); 0036 auto width = o.width(); 0037 auto fill = o.fill(); 0038 if(x == '\\') 0039 o << "'\\\\'"; 0040 else if(x == '\'') 0041 o << "'\\\\''"; 0042 else if(x == '"') 0043 o << "'\\\"'"; 0044 else if(x >= 32 && x <= 126) 0045 o << '\'' << (char)x << '\''; 0046 else 0047 o << hex << "\\" << setw(2) << setfill('0') << x; 0048 o.fill(fill); 0049 o.width(width); 0050 o.flags(flags); 0051 } 0052 0053 inline void printChar(QTextStream& o, uint x) 0054 { 0055 auto base = o.integerBase(); 0056 auto width = o.fieldWidth(); 0057 auto pad = o.padChar(); 0058 if(x == '\\') 0059 o << "'\\\\'"; 0060 else if(x == '\'') 0061 o << "'\\\\''"; 0062 else if(x == '"') 0063 o << "'\\\"'"; 0064 else if(x >= 32 && x <= 126) 0065 o << '\'' << (char)x << '\''; 0066 else 0067 o << Qt::hex << "\\" << qSetFieldWidth(2) << qSetPadChar('0') << x; 0068 o.setPadChar(pad); 0069 o.setFieldWidth(width); 0070 o.setIntegerBase(base); 0071 } 0072 0073 template<typename Stream, CharEncoding codec> 0074 Stream& operator<<(Stream&, const SeqCharSet<codec>&); 0075 0076 template<class cs> class CharSetCondition; 0077 0078 template<CharEncoding _codec> 0079 class SeqCharSet 0080 { 0081 public: 0082 static const CharEncoding codec = _codec; 0083 private: 0084 typedef typename Codec2Int<codec>::Result Int; 0085 typedef SeqCharSet<codec> Self; 0086 vector<pair<Int, Int> > data; 0087 bool mEpsilon: 1; 0088 template<typename Stream, CharEncoding codec> /* when using fixed codec I get linker errors (undefined reference), templates do not get recognized to be the same */ 0089 friend Stream& operator<<(Stream&, const SeqCharSet<codec>&); 0090 friend class CharSetCondition< SeqCharSet<codec> >; 0091 public: 0092 SeqCharSet(const QString& s ) : mEpsilon(false) 0093 { 0094 auto str = qString2Codec<codec>(s); 0095 sort(str.begin(), str.end()); 0096 Int last = -1; 0097 for(int i = 0; i != str.size(); ++i) 0098 { 0099 Int x = str[i]; 0100 if(x == last) 0101 last = ++data.back().second; 0102 else 0103 data.push_back(make_pair(x, last = x+1)); 0104 } 0105 } 0106 SeqCharSet(Int chr) : mEpsilon(false) 0107 { 0108 data.push_back(make_pair(chr, chr + 1)); 0109 } 0110 SeqCharSet() : mEpsilon(true) 0111 {} 0112 SeqCharSet(const Self& o) : data(o.data), mEpsilon(o.mEpsilon) 0113 { 0114 } 0115 static Self fullSet() 0116 { 0117 Self ret; 0118 ret.mEpsilon = false; 0119 ret.data.push_back(make_pair(Int(0), Int(Codec2Size<codec>::value))); 0120 return ret; 0121 } 0122 static Self range(Int begin, Int end) 0123 { 0124 Self ret; 0125 ret.mEpsilon = false; 0126 if(begin < end) 0127 ret.data.push_back(make_pair(begin, end)); 0128 return ret; 0129 } 0130 static Self emptySet() 0131 { 0132 Self ret; 0133 ret.mEpsilon = false; 0134 return ret; 0135 } 0136 SeqCharSet& operator=(const Self& o) 0137 { 0138 data = o.data; 0139 mEpsilon = o.mEpsilon; 0140 return *this; 0141 } 0142 bool epsilon() const 0143 { 0144 return mEpsilon; 0145 } 0146 bool empty() const 0147 { 0148 return data.empty(); 0149 } 0150 bool accepts(Int x) const 0151 { 0152 int l = 0, r = data.size(); 0153 while(l != r) 0154 { 0155 int m = (l + r) / 2; 0156 if(data[m].first <= x) 0157 { 0158 if(data[m].second > x) 0159 return true; 0160 l = m + 1; 0161 } 0162 else 0163 r = m; 0164 } 0165 return false; 0166 } 0167 Self intersection(const Self& o) const 0168 { 0169 Self ret; 0170 ret.mEpsilon = false; 0171 size_t tpos = 0, opos = 0; 0172 while(tpos != data.size() && opos != o.data.size()) 0173 { 0174 if(data[tpos].first >= o.data[opos].second) 0175 ++opos; 0176 else if(o.data[opos].first >= data[tpos].second) 0177 ++tpos; 0178 else 0179 { 0180 ret.data.push_back(make_pair(max(o.data[opos].first, data[tpos].first), min(o.data[opos].second, data[tpos].second))); 0181 if(data[tpos].second > o.data[opos].second) 0182 ++opos; 0183 else 0184 ++tpos; 0185 } 0186 } 0187 // the end?? 0188 return ret; 0189 } 0190 Self& intersect(const Self& o) 0191 { 0192 *this = this->intersection(o); 0193 return *this; 0194 } 0195 Self& negate() 0196 { 0197 if(!mEpsilon) 0198 { 0199 if(data.size() == 0) 0200 { 0201 data.push_back(make_pair(0, Int(Codec2Size<codec>::value))); 0202 } 0203 else if(data.front().first == 0) 0204 { 0205 for(size_t i = 0; i != data.size()-1; ++i) 0206 { 0207 data[i].first = data[i].second; 0208 data[i].second = data[i+1].first; 0209 } 0210 if(data.back().second == Codec2Size<codec>::value) 0211 data.pop_back(); 0212 else 0213 { 0214 data.back().first = data.back().second; 0215 data.back().second = Codec2Size<codec>::value; 0216 } 0217 } 0218 else 0219 { 0220 Int tmpfirst = 0; 0221 for(size_t i = 0; i != data.size(); ++i) 0222 { 0223 Int tmptmpfirst = data[i].first; 0224 data[i].first = tmpfirst; 0225 tmpfirst = data[i].second; 0226 data[i].second = tmptmpfirst; 0227 } 0228 if(data.back().second != Codec2Size<codec>::value) 0229 data.push_back(make_pair(tmpfirst, Int(Codec2Size<codec>::value))); 0230 } 0231 } 0232 return *this; 0233 } 0234 Self difference(const Self& o) const 0235 { 0236 Self ret; 0237 ret.mEpsilon = false; 0238 size_t tpos = 0, opos = 0; 0239 bool inT = false, inO = false; 0240 size_t start = -1; // if there is a -1 we will know what happened 0241 while(tpos != data.size() && opos != o.data.size()) 0242 { 0243 if(inT) 0244 { 0245 if(inO) 0246 { 0247 if(o.data[opos].second >= data[tpos].second) 0248 { 0249 inT = false; 0250 ++tpos; 0251 } 0252 else 0253 { 0254 start = o.data[opos].second; 0255 inO = false; 0256 ++opos; 0257 } 0258 } 0259 else 0260 { 0261 if(o.data[opos].first >= data[tpos].second) 0262 { 0263 if(start != data[tpos].second) 0264 ret.data.push_back(make_pair(start, data[tpos].second)); 0265 ++tpos; 0266 inT = false; 0267 } 0268 else 0269 { 0270 if(o.data[opos].first > start) 0271 ret.data.push_back(make_pair(start, o.data[opos].first)); 0272 inO = true; 0273 } 0274 } 0275 } 0276 else 0277 { 0278 if(inO) 0279 { 0280 if(o.data[opos].second <= data[tpos].first) 0281 { 0282 inO = false; 0283 ++opos; 0284 } 0285 else 0286 { 0287 inT = true; 0288 start = o.data[opos].second; 0289 } 0290 } 0291 else 0292 { 0293 if(data[tpos].first < o.data[opos].first) 0294 { 0295 inT = true; 0296 start = data[tpos].first; 0297 } 0298 else 0299 { 0300 inO = true; 0301 } 0302 } 0303 } 0304 } 0305 if(inT) 0306 ret.data.push_back(make_pair(start, data[tpos++].second)); 0307 while(tpos != data.size()) 0308 ret.data.push_back(data[tpos++]); 0309 return ret; 0310 } 0311 Self getUnion(const Self& o) const 0312 { 0313 Self ret; 0314 ret.mEpsilon = false; 0315 size_t tpos = 0, opos = 0; 0316 bool inT = false, inO = false; 0317 size_t start = -1; // if there is a -1 we will know what happened 0318 while(tpos != data.size() && opos != o.data.size()) 0319 { 0320 if(inT) 0321 { 0322 if(inO) 0323 { 0324 if(o.data[opos].second >= data[tpos].second) 0325 { 0326 inT = false; 0327 ++tpos; 0328 } 0329 else 0330 { 0331 inO = false; 0332 ++opos; 0333 } 0334 } 0335 else 0336 { 0337 if(o.data[opos].first > data[tpos].second) 0338 { 0339 if(start != data[tpos].second) 0340 ret.data.push_back(make_pair(start, data[tpos].second)); 0341 ++tpos; 0342 inT = false; 0343 } 0344 else 0345 { 0346 inO = true; 0347 } 0348 } 0349 } 0350 else 0351 { 0352 if(inO) 0353 { 0354 if(o.data[opos].second < data[tpos].first) 0355 { 0356 if(start != o.data[opos].second) 0357 ret.data.push_back(make_pair(start, o.data[opos].second)); 0358 inO = false; 0359 ++opos; 0360 } 0361 else 0362 { 0363 inT = true; 0364 } 0365 } 0366 else 0367 { 0368 if(data[tpos].first < o.data[opos].first) 0369 { 0370 inT = true; 0371 start = data[tpos].first; 0372 } 0373 else 0374 { 0375 inO = true; 0376 start = o.data[opos].first; 0377 } 0378 } 0379 } 0380 } 0381 if(inT) 0382 ret.data.push_back(make_pair(start, data[tpos++].second)); 0383 while(tpos != data.size()) 0384 ret.data.push_back(data[tpos++]); 0385 if(inO) 0386 ret.data.push_back(make_pair(start, o.data[opos++].second)); 0387 while(opos != o.data.size()) 0388 ret.data.push_back(o.data[opos++]); 0389 return ret; 0390 } 0391 Self& unite(const Self& o) 0392 { 0393 data = getUnion(o).data; 0394 return *this; 0395 } 0396 bool operator==(const Self& o) const 0397 { 0398 return mEpsilon == o.mEpsilon && data == o.data; 0399 } 0400 }; 0401 0402 template<typename Stream, CharEncoding codec> 0403 Stream& operator<<(Stream &o, const SeqCharSet<codec> &cs) 0404 { 0405 typedef typename Codec2Int<codec>::Result Int; 0406 if(cs.mEpsilon) 0407 o << "ε"; 0408 else 0409 { 0410 for(auto i = cs.data.begin(); i != cs.data.end(); ++i) 0411 { 0412 if(i != cs.data.begin()) 0413 o << ", "; 0414 const auto &p = *i; 0415 if(p.first == p.second - 1) 0416 printChar(o, p.first); 0417 else 0418 { 0419 printChar(o, p.first); 0420 o << "-"; 0421 printChar(o, (uint)p.second-1); 0422 } 0423 } 0424 } 0425 return o; 0426 } 0427 0428 template<CharEncoding codec> 0429 class TableCharSet; 0430 0431 template<typename Stream, CharEncoding codec> 0432 Stream& operator<<(Stream&, const TableCharSet<codec>&); 0433 0434 template<CharEncoding _codec> 0435 class TableCharSet 0436 { 0437 public: 0438 static const CharEncoding codec = _codec; 0439 private: 0440 static const size_t nChars = Codec2Size<codec>::value; 0441 typedef bitset<nChars> BitArray; 0442 static const BitArray nullData; 0443 bitset<Codec2Size<codec>::value> data; 0444 bool mEpsilon: 1; 0445 typedef TableCharSet<codec> Self; 0446 typedef typename Codec2Int<codec>::Result Int; 0447 template<typename Stream, CharEncoding codec> /* when using fixed codec I get linker errors (undefined reference), templates do not get recognized to be the same */ 0448 friend Stream& KDevPG::operator<<(Stream&, const TableCharSet<codec>&); 0449 public: 0450 TableCharSet(const QString& s) : mEpsilon(false) 0451 { 0452 const auto str = qString2Codec<codec>(s); 0453 Int last = -1; 0454 for(int i = 0; i != str.size(); ++i) 0455 { 0456 Int x = str[i]; 0457 data[x] = true; 0458 } 0459 } 0460 TableCharSet(Int chr) : mEpsilon(false) 0461 { 0462 data[chr] = true; 0463 } 0464 TableCharSet() : mEpsilon(true) 0465 {} 0466 static Self emptySet() 0467 { 0468 Self ret; 0469 ret.mEpsilon = false; 0470 return ret; 0471 } 0472 static Self fullSet() 0473 { 0474 Self ret; 0475 ret.data.set(); 0476 ret.mEpsilon = false; 0477 return ret; 0478 } 0479 static Self range(Int begin, Int end) 0480 { 0481 Self ret; 0482 if(begin < end) 0483 for(Int i = begin; i != end; ++i) 0484 ret.data[i] = true; 0485 ret.mEpsilon = false; 0486 return ret; 0487 } 0488 Self& negate() 0489 { 0490 data.flip(); 0491 return *this; 0492 } 0493 bool accepts(Int x) const 0494 { 0495 return data[x]; 0496 } 0497 bool empty() const 0498 { 0499 return data == nullData; 0500 } 0501 bool epsilon() const 0502 { 0503 return mEpsilon; 0504 } 0505 Self& intersect(const Self& o) 0506 { 0507 data &= o.data; 0508 return *this; 0509 } 0510 Self intersection(const Self& o) const 0511 { 0512 Self ret(*this); 0513 ret.data &= o.data; 0514 return ret; 0515 } 0516 Self difference(const Self& o) const 0517 { 0518 Self ret(o); 0519 ret.data.flip(); 0520 ret.data &= data; 0521 return ret; 0522 } 0523 Self getUnion(const Self& o) const 0524 { 0525 Self ret(*this); 0526 ret.data |= o.data; 0527 return ret; 0528 } 0529 Self& unite(const Self& o) 0530 { 0531 data |= o.data; 0532 return *this; 0533 } 0534 bool operator==(const Self& o) const 0535 { 0536 return mEpsilon == o.mEpsilon && data == o.data; 0537 } 0538 }; 0539 0540 template<typename Stream, CharEncoding codec> 0541 Stream& operator<<(Stream &o, const TableCharSet<codec> &cs) 0542 { 0543 typedef typename Codec2Int<codec>::Result Int; 0544 if(cs.mEpsilon) 0545 o << "ε"; 0546 else 0547 { 0548 for(size_t i = 0; i != cs.data.size(); ++i) 0549 if(cs.data[i]) 0550 printChar(o, (uint)i); 0551 } 0552 return o; 0553 } 0554 0555 template<class CharSet> 0556 class CharSetCondition 0557 { 0558 }; 0559 0560 template<CharEncoding codec> 0561 class CharSetCondition<TableCharSet<codec> > 0562 { 0563 typedef typename Codec2Int<codec>::Result Int; 0564 public: 0565 void operator()(QTextStream& str, const vector<pair<TableCharSet<codec>, size_t> >& transition) 0566 { 0567 str << "switch(chr) { "; 0568 for(size_t j = 0; j != transition.size(); ++j) 0569 { 0570 if(!transition[j].first.empty()) 0571 { 0572 for(quint64 i = 0; i != Codec2Size<codec>::value; ++i) 0573 { 0574 if(transition[j].first.accepts(i)) 0575 str << "case " << quint64(i) << ":\n"; 0576 } 0577 str << "goto _state_" << transition[j].second << ";"; 0578 } 0579 } 0580 str << "default: goto _end;\n}\n"; 0581 } 0582 }; 0583 0584 template<CharEncoding codec> 0585 class CharSetCondition<SeqCharSet<codec> > 0586 { 0587 typedef typename Codec2Int<codec>::Result Int; 0588 void codegen(QTextStream& str, const vector<pair<pair<Int, Int>, size_t> >& data, size_t l, size_t r, size_t tmin, size_t tmax) 0589 { 0590 if(l == r) 0591 str << "goto _end;"; 0592 else if(r == l + 1) 0593 { 0594 if(data[l].first.first == tmin) 0595 { 0596 if(data[l].first.second == tmax) 0597 str << "goto _state_" << data[l].second << ";"; 0598 else 0599 { 0600 if(data[l].first.second == data[l].first.first + 1) 0601 str << "if(chr == " << quint64(data[l].first.first) << ")"; 0602 else 0603 str << "if(chr < " << quint64(data[l].first.second) << ")"; 0604 str << "\ngoto _state_" << data[l].second << "; else\ngoto _end;\n"; 0605 } 0606 } 0607 else 0608 { 0609 if(data[l].first.second == tmax) 0610 { 0611 if(data[l].first.second == data[l].first.first + 1) 0612 str << "if(chr == " << quint64(data[l].first.first) << ")"; 0613 else 0614 str << "if(chr >= " << quint64(data[l].first.first) << ")"; 0615 str << "\ngoto _state_" << data[l].second << "; else\ngoto _end;\n"; 0616 } 0617 else 0618 { 0619 if(data[l].first.first == data[l].first.second - 1) 0620 str << "if(chr == " << quint64(data[l].first.first) << ")"; 0621 else if(data[l].first.first == data[l].first.second - 2) 0622 str << "if(chr == " << quint64(data[l].first.first) << " || chr == " << quint64(data[l].first.first + 1) << ")"; 0623 else 0624 str << "if(chr >= " << quint64(data[l].first.first) << " && chr < " << quint64(data[l].first.second) << ")"; 0625 str << "\ngoto _state_" << data[l].second << "; else\ngoto _end;\n"; 0626 } 0627 } 0628 } 0629 else 0630 { 0631 size_t mid = (l + r) / 2; 0632 str << "if(chr < " << quint64(data[mid].first.first) << ") {"; 0633 codegen(str, data, l, mid, tmin, data[mid].first.first); 0634 str << "} else {"; 0635 codegen(str, data, mid, r, data[mid].first.first, tmax); 0636 str << "}"; 0637 } 0638 } 0639 public: 0640 void operator()(QTextStream& str, const vector<pair<SeqCharSet<codec>, size_t> >& transition) 0641 { 0642 vector<pair<pair<Int, Int>, size_t> > data; 0643 for(size_t i = 0; i != transition.size(); ++i) 0644 { 0645 for(size_t j = 0; j != transition[i].first.data.size(); ++j) 0646 data.push_back(make_pair(transition[i].first.data[j], transition[i].second)); 0647 } 0648 sort(data.begin(), data.end()); 0649 codegen(str, data, 0, data.size(), 0, Codec2Size<codec>::value); 0650 } 0651 }; 0652 0653 // just for the case QTextCodec may be needed 0654 /* 0655 class QAsciiCodec : public QTextCodec 0656 { 0657 public: 0658 QAsciiCodec() : QTextCodec() 0659 { 0660 } 0661 QList<QByteArray> aliases() const 0662 { 0663 QList<QByteArray> ret; 0664 ret << "ascii" << "us-ascii" << "usascii"; 0665 return ret; 0666 } 0667 QByteArray convertFromUnicode(const QChar* in, int length, ConverterState* state) const 0668 { 0669 Q_UNUSED(state); 0670 return QString(in, length).toAscii(); 0671 } 0672 virtual QString convertToUnicode(const char* in, int length, ConverterState* state) const 0673 { 0674 Q_UNUSED(state); 0675 return QString::fromAscii(in, length); 0676 } 0677 virtual int mibEnum() const 0678 { 0679 return 47110815; // pseudo random, should be unique 0680 } 0681 virtual QByteArray name() const 0682 { 0683 return "us-ascii"; 0684 } 0685 } qAsciiCodec; 0686 */ 0687 0688 } 0689 0690 #endif