File indexing completed on 2024-04-28 08:27:24
0001 /* This file is part of kdev-pg-qt 0002 Copyright (C) 2010 Jonathan Schmidt-Dominé <devel@the-user.org> 0003 0004 This library is free software; you can redistribute it and/or 0005 modify it under the terms of the GNU Library General Public 0006 License as published by the Free Software Foundation; either 0007 version 2 of the License, or (at your option) any later version. 0008 0009 This library is distributed in the hope that it will be useful, 0010 but WITHOUT ANY WARRANTY; without even the implied warranty of 0011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0012 Library General Public License for more details. 0013 0014 You should have received a copy of the GNU Library General Public License 0015 along with this library; see the file COPYING.LIB. If not, write to 0016 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0017 Boston, MA 02110-1301, USA. 0018 */ 0019 0020 #include "kdev-pg-char-sets.h" 0021 #include "kdev-pg-regexp.h" 0022 #include "kdev-pg.h" 0023 #include "kdev-pg-regexp-helper.h" 0024 #include "kdev-pg-bit-array.h" 0025 #include <iostream> 0026 #include <queue> 0027 #include <stack> 0028 #include <QHash> 0029 #include <unordered_set> 0030 #include <unordered_map> 0031 0032 using namespace std; 0033 0034 // TODO: Lookahead operator 0035 // TODO: Line-End etc. 0036 // TODO: Deactivate/activate rules (have to keep the bitsets of possible final states) 0037 0038 0039 #define q_Hash_to_tr1_hash(type) \ 0040 namespace std \ 0041 { \ 0042 template<> struct hash< type > \ 0043 { \ 0044 inline size_t operator()(const type &x) const \ 0045 { \ 0046 return qHash(x); \ 0047 } \ 0048 }; \ 0049 } 0050 0051 #if (QT_VERSION < QT_VERSION_CHECK(5, 14, 0)) 0052 // Qt >= 5.14 already has std::hash for QBitArray 0053 q_Hash_to_tr1_hash(QBitArray) 0054 #endif 0055 0056 namespace KDevPG 0057 { 0058 0059 typedef BitArray UsedBitArray; 0060 typedef QUtf8ToUcs4Iterator Iterator; 0061 // typedef TableCharSet<Ascii> CharSet; 0062 0063 template<typename CharSet> class NFA; 0064 0065 inline QString codeForDot(QString str) 0066 { 0067 QString out = ""; 0068 int pos = 0; 0069 forever 0070 { 0071 int npos = str.indexOf("\n\01!ASIgnore\"!!\n# ", pos); 0072 if(npos == -1) 0073 { 0074 out += QStringView(str).mid(pos); 0075 break; 0076 } 0077 out += QStringView(str).mid(pos, npos - pos); 0078 int nlpos = str.indexOf('\n', npos + 17); 0079 int codeendpos = str.indexOf("\n\01!AS/Ignore\"!!\n", nlpos); 0080 if(nlpos == -1 || codeendpos == -1) 0081 { 0082 out += "<junk>"; 0083 break; 0084 } 0085 out += QStringView(str).mid(nlpos + 1, codeendpos - nlpos - 1); 0086 pos = codeendpos + 17; 0087 } 0088 return out.replace('\"', "\\\"").replace('\n', '\t').trimmed(); 0089 } 0090 0091 /** 0092 * Deterministic finite automaton. 0093 */ 0094 template<typename CS> 0095 class DFA 0096 { 0097 public: 0098 typedef CS CharSet; 0099 private: 0100 size_t nstates; 0101 vector<vector<pair<CharSet, size_t> > > rules; 0102 size_t numActions; 0103 vector<size_t> accept; 0104 vector<QString> actions; 0105 friend class NFA<CharSet>; 0106 public: 0107 /// Code used for the detected tokens in the generated code 0108 void setActions(const vector<QString>& _actions) 0109 { 0110 actions = _actions; 0111 numActions = actions.size() - 1; 0112 } 0113 /// Generate code for transitions 0114 void codegen(QTextStream& str) 0115 { 0116 str << "goto _state_0; // no warning about unused label\n"; 0117 CharSetCondition<CharSet> csc; 0118 for(size_t i = 0; i != nstates; ++i) 0119 { 0120 str << "\n_state_" + QString::number(i) + ":\n"; 0121 if(accept[i]) 0122 str << "lpos = lxCURR_POS; lstate = " << accept[i] << ";\n"; 0123 str << "lxNEXT_CHR(chr); "; 0124 csc(str, rules[i]); 0125 } 0126 str << "_end:\nplain() = lpos;\nswitch(lstate) {\n"; 0127 for(size_t i = 0; i <= numActions; ++i) 0128 { 0129 str << "case " << QString::number(i) << ": "; 0130 if(i == 0) 0131 str << "goto _fail; // no warning about unused label\n_fail: "; 0132 str << "{" << actions[i] << "\nlxFINISH\n}\n"; 0133 } 0134 str << "}\n"; 0135 } 0136 /// .dot-output 0137 void dotOutput(QTextStream& out, const QString& name) 0138 { 0139 out << "digraph " << name << "{" << Qt::endl; 0140 for(size_t i = 0; i != nstates; ++i) 0141 { 0142 out << "s" << i << " [ label = \"" << i << "\""; 0143 if(i == 0) 0144 out << ", shape=rect, style=rounded"; 0145 else 0146 out << ", shape=oval"; 0147 if(accept[i] != 0) 0148 out << ", penwidth=4"; 0149 out << " ];" << Qt::endl; 0150 } 0151 for(size_t i = 1; i <= numActions; ++i) 0152 { 0153 out << "f" << i << " [ label = \"" << codeForDot(actions[i]) << "\", shape=rect, penwidth=2 ];" << Qt::endl; 0154 } 0155 for(size_t i = 0; i != nstates; ++i) 0156 { 0157 for(auto j = rules[i].begin(); j != rules[i].end(); ++j) 0158 { 0159 out << "s" << i << " -> " << "s" << j->second << " [ label = \"" << j->first << "\" ];" << Qt::endl; 0160 } 0161 if(accept[i] != 0) 0162 { 0163 out << "s" << i << " -> " << "f" << accept[i] << ";" << Qt::endl; 0164 } 0165 } 0166 out << "}" << Qt::endl; 0167 } 0168 /// Debugging output 0169 void inspect() 0170 { 0171 cout << "#states = " << nstates << " accept = <"; 0172 for(size_t i = 0; i != nstates; ++i) 0173 cout << ", " << accept[i]; 0174 cout << "> rules = {"; 0175 for(size_t i = 0; i != nstates; ++i) 0176 { 0177 cout << i << " = {"; 0178 for(size_t j = 0; j != rules[i].size(); ++j) 0179 cout << rules[i][j].first << " = " << rules[i][j].second << ", "; 0180 cout << "} "; 0181 } 0182 cout << "}" << endl; 0183 } 0184 /// Test if the string matches the regexp 0185 size_t accepts(const QString& str) 0186 { 0187 size_t state = 0; 0188 foreach(const QChar c, str) 0189 { 0190 for(const auto& r : rules[state]) 0191 { 0192 if(r.first.accepts(c.unicode())) // make it more portable!! 0193 { 0194 state = r.second; 0195 goto success; 0196 } 0197 } 0198 return false; 0199 success:; 0200 } 0201 return accept[state]; 0202 } 0203 /// Can be used to test tokenization 0204 pair<Iterator, size_t> nextAction(Iterator iter) 0205 { 0206 Iterator npos = iter; 0207 size_t action = -1; 0208 size_t state = 0; 0209 while(iter.hasNext()) 0210 { 0211 if(accept[state]) 0212 { 0213 npos = iter; 0214 action = accept[state]; 0215 } 0216 quint64 chr = iter.next(); // make it more portable!! 0217 for(const auto& r : rules[state]) 0218 { 0219 if(r.first.accepts(chr)) 0220 { 0221 state = r.second; 0222 goto success; 0223 } 0224 } 0225 return make_pair(npos, action); 0226 success:; 0227 } 0228 if(accept[state]) 0229 { 0230 npos = iter; 0231 action = accept[state]; 0232 } 0233 return make_pair(npos, action); 0234 } 0235 /// Remove each state x iff ¬active[x] 0236 DFA<CharSet>& filterStates(const UsedBitArray& active) 0237 { 0238 size_t curr = 0; 0239 vector<size_t> mapping(nstates); 0240 for(size_t i = 0; i != nstates; ++i) 0241 { 0242 if(active[i]) 0243 { 0244 rules[curr] = rules[i]; 0245 accept[curr] = accept[i]; 0246 mapping[i] = curr; 0247 ++curr; 0248 } 0249 } 0250 rules.resize(curr); 0251 accept.resize(curr); 0252 nstates = curr; 0253 for(size_t i = 0; i != nstates; ++i) 0254 { 0255 auto orules = rules[i]; 0256 rules[i].clear(); 0257 for(auto j = orules.begin(); j != orules.end(); ++j) 0258 { 0259 if(active[j->second]) 0260 rules[i].push_back(make_pair(j->first, mapping[j->second])); 0261 } 0262 } 0263 return *this; 0264 } 0265 /// Eliminate each state unarrivable from the start 0266 DFA<CharSet>& eliminateUnarrivableStates() 0267 { 0268 UsedBitArray arrivable(nstates); 0269 arrivable[0] = true; 0270 stack<size_t> todo; 0271 todo.push(0); 0272 while(todo.size()) 0273 { 0274 size_t curr = todo.top(); 0275 todo.pop(); 0276 for(const auto& i : rules[curr]) 0277 { 0278 if(!arrivable[i.second]) 0279 { 0280 arrivable[i.second] = true; 0281 todo.push(i.second); 0282 } 0283 } 0284 } 0285 return filterStates(arrivable); 0286 } 0287 /// Eliminate each state iff no final state is arivable from it 0288 DFA<CharSet>& eliminateInactiveStates() 0289 { 0290 UsedBitArray active(nstates); 0291 for(size_t i = 0; i != nstates; ++i) 0292 if(accept[i]) 0293 active[i] = true; 0294 bool changed; 0295 do 0296 { 0297 changed = false; 0298 for(size_t i = 0; i != nstates; ++i) 0299 { 0300 if(!active[i]) 0301 { 0302 for(size_t j = 0; j != rules[i].size(); ++j) 0303 { 0304 if(active[rules[i][j].second]) 0305 { 0306 active[i] = true; 0307 changed = true; 0308 break; 0309 } 0310 } 0311 } 0312 } 0313 } while(changed); 0314 return filterStates(active); 0315 } 0316 /// followers in same group for every input 0317 bool sameGroup(size_t ngroups, const vector<size_t>& ogroups, size_t x, size_t y) 0318 { 0319 vector<CharSet> xsets(ngroups), ysets(ngroups); 0320 for(const auto& i : rules[x]) 0321 xsets[ogroups[i.second]].unite(i.first); 0322 for(const auto& i : rules[y]) 0323 ysets[ogroups[i.second]].unite(i.first); 0324 return xsets == ysets; 0325 } 0326 DFA<CharSet>& minimize() 0327 { 0328 eliminateUnarrivableStates(); 0329 eliminateInactiveStates(); 0330 vector<vector<size_t> > grinv(numActions + 1); 0331 for(size_t i = 0; i != nstates; ++i) 0332 grinv[(accept[i] - accept[0] + grinv.size()) % grinv.size()].push_back(i); 0333 size_t ongroups; 0334 vector<size_t> groups(nstates); 0335 do 0336 { 0337 for(size_t i = 0; i != grinv.size(); ++i) 0338 for(size_t j = 0; j != grinv[i].size(); ++j) 0339 groups[grinv[i][j]] = i; 0340 ongroups = grinv.size(); 0341 grinv.clear(); 0342 for(size_t i = 0; i != nstates; ++i) 0343 { 0344 for(size_t j = 0; j != grinv.size(); ++j) 0345 { 0346 if(groups[grinv[j][0]] == groups[i] && sameGroup(ongroups, groups, grinv[j][0], i)) 0347 { 0348 grinv[j].push_back(i); 0349 goto found; 0350 } 0351 } 0352 grinv.resize(grinv.size()+1); 0353 grinv.back().push_back(i); 0354 found:; 0355 } 0356 } 0357 while(ongroups != grinv.size()); 0358 vector<vector<pair<CharSet, size_t> > > _rules(grinv.size()); 0359 vector<size_t> _accept(grinv.size()); 0360 for(size_t i = 0; i != nstates; ++i) 0361 { 0362 _accept[groups[i]] = accept[i]; 0363 for(const auto& j : rules[i]) 0364 _rules[groups[i]].push_back(make_pair(j.first, groups[j.second])); 0365 } 0366 rules = _rules; 0367 nstates = grinv.size(); 0368 accept = _accept; 0369 for(size_t i = 0; i != nstates; ++i) 0370 { 0371 vector<CharSet> mapping(nstates, CharSet::emptySet()); 0372 for(const auto& j : rules[i]) 0373 mapping[j.second].unite(j.first); 0374 rules[i].clear(); 0375 for(size_t j = 0; j != nstates; ++j) 0376 if(!mapping[j].empty()) 0377 rules[i].push_back(make_pair(mapping[j], j)); 0378 } 0379 eliminateUnarrivableStates(); 0380 eliminateInactiveStates(); 0381 return *this; 0382 } 0383 NFA<CharSet> nfa() const; 0384 }; 0385 0386 template<typename CS> 0387 class NFA 0388 { 0389 public: 0390 typedef CS CharSet; 0391 private: 0392 size_t nstates; 0393 vector<vector<pair<CharSet, size_t> > > rules; 0394 size_t accept; 0395 friend class DFA<CharSet>; 0396 public: 0397 void inspect() 0398 { 0399 cout << "#states = " << nstates << " accept = " << accept << " rules = {"; 0400 for(size_t i = 0; i != nstates; ++i) 0401 { 0402 cout << i << " = {"; 0403 for(size_t j = 0; j != rules[i].size(); ++j) 0404 cout << rules[i][j].first << " = " << rules[i][j].second << ", "; 0405 cout << "} "; 0406 } 0407 cout << "}" << endl; 0408 } 0409 void dotOutput(QTextStream& out, const QString& name) 0410 { 0411 out << "digraph " << name << "{" << Qt::endl; 0412 for(size_t i = 0; i != nstates; ++i) 0413 { 0414 out << "s" << i << " [ label = \"" << i << "\""; 0415 if(i == 0) 0416 out << ", shape=rect, style=rounded"; 0417 else 0418 out << ", shape=oval"; 0419 if(i >= accept) 0420 out << ", penwidth=4"; 0421 out << " ];" << Qt::endl; 0422 } 0423 for(size_t i = 0; i != nstates; ++i) 0424 { 0425 for(auto j = rules[i].begin(); j != rules[i].end(); ++j) 0426 { 0427 out << "s" << i << " -> " << "s" << j->second << " [ label = \"" << j->first << "\" ];" << Qt::endl; 0428 } 0429 } 0430 out << "}" << Qt::endl; 0431 } 0432 /** 0433 * Accepts no words. 0434 */ 0435 NFA() : nstates(2), rules(2), accept(1) 0436 {} 0437 static NFA<CharSet> emptyWord() 0438 { 0439 NFA<CharSet> res; 0440 res.nstates = 1; 0441 res.rules.resize(1); 0442 res.accept = 0; 0443 return res; 0444 } 0445 NFA(const NFA<CharSet>& o) : nstates(o.nstates), rules(o.rules), accept(o.accept) 0446 {} 0447 NFA(const CharSet& set) : nstates(2), rules(2), accept(1) 0448 { 0449 rules[0].push_back(make_pair(set, 1)); 0450 } 0451 explicit NFA(const std::vector<NFA<CharSet> >& list) 0452 { 0453 if(list.size() == 0) 0454 { 0455 nstates = 2; 0456 rules.resize(2); 0457 accept = 1; 0458 } 0459 else 0460 { 0461 accept = 1; 0462 for(size_t i = 0; i != list.size(); ++i) 0463 accept += list.begin()[i].nstates; 0464 nstates = accept + list.size(); 0465 rules.resize(nstates); 0466 size_t start = 1; 0467 for(size_t i = 0; i != list.size(); ++i) 0468 { 0469 for(size_t j = start; j != start + list.begin()[i].nstates; ++j) 0470 { 0471 rules[j] = list.begin()[i].rules[j - start]; 0472 for(size_t k = 0; k != rules[j].size(); ++k) 0473 rules[j][k].second += start; 0474 } 0475 rules[0].push_back(make_pair(CharSet(), start)); 0476 start += list.begin()[i].nstates; 0477 rules[start - 1].push_back(make_pair(CharSet(), nstates - list.size() + i)); 0478 } 0479 } 0480 } 0481 void adjust(size_t startnum) 0482 { 0483 accept += startnum; 0484 nstates += startnum; 0485 rules.resize(nstates); 0486 for(size_t i = nstates; i != startnum; ) 0487 { 0488 --i; 0489 rules[i] = rules[i - startnum]; 0490 for(size_t j = 0; j != rules[i].size(); ++j) 0491 rules[i][j].second += startnum; 0492 } 0493 for(size_t i = 0; i != startnum; ++i) 0494 rules[i].clear(); 0495 } 0496 NFA<CharSet>& operator|=(const NFA<CharSet>& o) 0497 { 0498 adjust(1); 0499 rules.resize(o.nstates+nstates+2); 0500 for(size_t i = nstates; i != nstates + o.nstates; ++i) 0501 { 0502 rules[i] = o.rules[i - nstates]; 0503 for(size_t j = 0; j != rules[i].size(); ++j) 0504 rules[i][j].second += nstates; 0505 } 0506 rules[0].push_back(make_pair(CharSet(), 1)); 0507 rules[0].push_back(make_pair(CharSet(), nstates)); 0508 nstates += o.nstates; 0509 rules[accept].push_back(make_pair(CharSet(), nstates)); 0510 rules[nstates-1].push_back(make_pair(CharSet(), nstates)); 0511 accept = nstates; 0512 ++nstates; 0513 return *this; 0514 } 0515 NFA<CharSet>& operator<<=(const NFA<CharSet>& o) 0516 { 0517 rules.resize(nstates + o.nstates + 1); 0518 rules[accept].push_back(make_pair(CharSet(), nstates)); 0519 accept = o.accept + nstates; 0520 for(size_t i = nstates; i != nstates+o.nstates; ++i) 0521 { 0522 rules[i] = o.rules[i-nstates]; 0523 for(size_t j = 0; j != rules[i].size(); ++j) 0524 rules[i][j].second += nstates; 0525 } 0526 nstates += o.nstates; 0527 return *this; 0528 } 0529 NFA<CharSet>& operator*() 0530 { 0531 rules[accept].push_back(make_pair(CharSet(), 0)); 0532 rules[0].push_back(make_pair(CharSet(), nstates)); 0533 accept = nstates; 0534 ++nstates; 0535 rules.resize(nstates); 0536 return *this; 0537 } 0538 UsedBitArray closure(const UsedBitArray& states) 0539 { 0540 UsedBitArray s(states); 0541 assert(s.size() == nstates); 0542 stack<size_t> todo; 0543 for(size_t i = 0; i != nstates; ++i) 0544 if(s[i]) 0545 todo.push(i); 0546 while(todo.size()) 0547 { 0548 size_t x = todo.top(); 0549 todo.pop(); 0550 for(size_t i = 0; i != rules[x].size(); ++i) 0551 { 0552 if(rules[x][i].first.epsilon() && !s[rules[x][i].second]) 0553 { 0554 s[rules[x][i].second] = true; 0555 todo.push(rules[x][i].second); 0556 } 0557 } 0558 } 0559 return s; 0560 } 0561 UsedBitArray vecUnion(const UsedBitArray& a, const UsedBitArray& b) 0562 { 0563 UsedBitArray res(a); 0564 for(size_t i = 0; i != a.size(); ++i) 0565 res[i] = res[i] || b[i]; 0566 return res; 0567 } 0568 vector< pair<CharSet, UsedBitArray > > follow(const UsedBitArray& states) 0569 { 0570 UsedBitArray s(states); 0571 vector<pair<CharSet, UsedBitArray > > pr(nstates); 0572 for(size_t i = 0; i != nstates; ++i) 0573 { 0574 if(s[i]) 0575 { 0576 for(const auto& r : rules[i]) 0577 { 0578 UsedBitArray nxt(nstates); 0579 nxt[r.second] = true; 0580 pr.push_back(make_pair(r.first, nxt)); 0581 } 0582 } 0583 } 0584 bool b = true; 0585 while(b) 0586 { 0587 b = false; 0588 for(size_t i = 0; i != pr.size(); ++i) 0589 { 0590 if(!pr[i].first.epsilon()) 0591 { 0592 for(size_t j = 0; j != i; ++j) 0593 { 0594 if(!pr[j].first.epsilon()) 0595 { 0596 CharSet inter = pr[i].first.intersection(pr[j].first); 0597 if(!inter.empty()) 0598 { 0599 CharSet diff1 = pr[i].first.difference(inter); 0600 CharSet diff2 = pr[j].first.difference(inter); 0601 UsedBitArray v1 = pr[i].second, v2 = pr[j].second; 0602 pr[i].first = inter; 0603 pr[i].second = vecUnion(pr[i].second, pr[j].second); 0604 if(diff1.empty()) 0605 { 0606 if(diff2.empty()) 0607 { 0608 swap(pr[j], pr.back()); 0609 pr.pop_back(); 0610 } 0611 else 0612 { 0613 pr[j].first = diff2; 0614 pr[j].second = v2; 0615 } 0616 } 0617 else 0618 { 0619 pr[j].first = diff1; 0620 pr[j].second = v1; 0621 if(!diff2.empty()) 0622 pr.push_back(make_pair(diff2, v2)); 0623 } 0624 b = true; 0625 goto retry; 0626 } 0627 } 0628 } 0629 } 0630 } 0631 retry:; 0632 } 0633 return pr; 0634 } 0635 DFA<CharSet> dfa() 0636 { 0637 unordered_set<UsedBitArray> states; 0638 unordered_map<UsedBitArray, vector<pair<CharSet, UsedBitArray > > > rules; 0639 stack<UsedBitArray > todo; 0640 UsedBitArray start(nstates); 0641 start[0] = true; 0642 start = closure(start); 0643 todo.push(start); 0644 while(todo.size()) 0645 { 0646 UsedBitArray x = todo.top(); 0647 todo.pop(); 0648 states.insert(x); 0649 rules[x]; 0650 vector<pair<CharSet, UsedBitArray > > nxt = follow(x); 0651 for(const auto& nx : nxt) 0652 { 0653 if(!nx.first.epsilon()) 0654 { 0655 UsedBitArray n = closure(nx.second); 0656 if(!states.count(n)) 0657 todo.push(n); 0658 rules[x].push_back(make_pair(nx.first, n)); 0659 } 0660 } 0661 } 0662 DFA<CharSet> _; 0663 _.nstates = states.size(); 0664 _.accept.resize(_.nstates); 0665 _.numActions = nstates - accept; 0666 _.actions.resize(_.numActions + 1); 0667 unordered_map<UsedBitArray, size_t> st; 0668 size_t cnt = 0; 0669 foreach(const UsedBitArray& i, states) 0670 { 0671 if(i == start && cnt) 0672 { 0673 st[*states.begin()] = cnt; 0674 st[i] = 0; 0675 } 0676 else 0677 st[i] = cnt; 0678 ++cnt; 0679 } 0680 _.rules.resize(_.nstates); 0681 for(auto i = st.begin(); i != st.end(); ++i) 0682 { 0683 for(size_t j = accept; j != nstates; ++j) 0684 { 0685 if(i->first[j]) 0686 { 0687 _.accept[i->second] = j - accept + 1; 0688 break; 0689 } 0690 } 0691 } 0692 for(auto i = rules.begin(); i != rules.end(); ++i) 0693 { 0694 size_t key = st[i->first]; 0695 for(const auto& nx : i->second) 0696 _.rules[key].push_back(make_pair(nx.first, st[nx.second])); 0697 } 0698 return _; 0699 } 0700 bool acceptsEpsilon() const 0701 { 0702 stack<size_t> todo; 0703 todo.push(0); 0704 UsedBitArray vis(nstates); 0705 vis[0] = true; 0706 while(!todo.empty()) 0707 { 0708 size_t curr = todo.top(); 0709 todo.pop(); 0710 if(curr >= accept) 0711 return true; 0712 foreach(const auto& nx, rules[curr]) 0713 { 0714 if(nx.first.epsilon() && !vis[nx.second]) 0715 { 0716 vis[nx.second] = true; 0717 todo.push(nx.second); 0718 } 0719 } 0720 } 0721 return false; 0722 } 0723 bool isEmpty() const 0724 { 0725 stack<size_t> todo; 0726 todo.push(0); 0727 UsedBitArray vis(nstates); 0728 vis[0] = true; 0729 while(!todo.empty()) 0730 { 0731 size_t curr = todo.top(); 0732 todo.pop(); 0733 if(curr >= accept) 0734 return false; 0735 foreach(const auto& nx, rules[curr]) 0736 { 0737 if(!vis[nx.second]) 0738 { 0739 vis[nx.second] =true; 0740 todo.push(nx.second); 0741 } 0742 } 0743 } 0744 return true; 0745 } 0746 int minLength() const 0747 { 0748 queue<size_t> _todo0, _todo1; 0749 queue<size_t> *todo0 = &_todo0, *todo1 = &_todo1; 0750 todo0->push(0); 0751 UsedBitArray vis(nstates); 0752 vis[0] = true; 0753 int length = 0; 0754 while(!todo0->empty()) 0755 { 0756 size_t curr = todo0->front(); 0757 todo0->pop(); 0758 if(curr >= accept) 0759 return length; 0760 foreach(const auto& nx, rules[curr]) 0761 { 0762 if(!vis[nx.second]) 0763 { 0764 vis[nx.second] = true; 0765 if(nx.first.epsilon()) 0766 todo0->push(nx.second); 0767 else 0768 todo1->push(nx.second); 0769 } 0770 } 0771 if(todo0->empty()) 0772 { 0773 ++length; 0774 swap(todo0, todo1); 0775 } 0776 } 0777 return -1; 0778 } 0779 private: 0780 bool unboundedCheck(size_t x, UsedBitArray& vis) const 0781 { 0782 vis[x] = true; 0783 foreach(const auto& nx, rules[x]) 0784 { 0785 if(vis[nx.second]) 0786 return true; 0787 else 0788 if(unboundedCheck(nx.second, vis)) 0789 return true; 0790 } 0791 vis[x] = false; 0792 return false; 0793 } 0794 public: 0795 bool isUnbounded() const 0796 { 0797 UsedBitArray vis(nstates); 0798 return unboundedCheck(0, vis); 0799 } 0800 int maxLength() const 0801 { 0802 // assumes that there are no circle unarrivable from start 0803 if(isUnbounded()) 0804 return -2; 0805 if(isEmpty()) 0806 return -1; 0807 queue<size_t> _todo0, _todo1; 0808 queue<size_t> *todo0 = &_todo0, *todo1 = &_todo1; 0809 vector<int> dist(nstates, -1); 0810 vector<vector<pair<CharSet, size_t> > > brules(nstates); 0811 for(size_t i = 0; i != nstates; ++i) 0812 { 0813 foreach(const auto& nx, rules[i]) 0814 { 0815 brules[nx.second].push_back(nx); 0816 brules[nx.second].back().second = i; 0817 } 0818 } 0819 for(size_t i = accept; i != nstates; ++i) 0820 { 0821 todo0->push(i); 0822 dist[i] = 0; 0823 } 0824 int length = 0; 0825 while(!todo0->empty()) 0826 { 0827 size_t curr = todo0->front(); 0828 todo0->pop(); 0829 foreach(const auto& nx, brules[curr]) 0830 { 0831 if(nx.first.epsilon()) 0832 { 0833 if(dist[nx.second] < length) 0834 { 0835 dist[nx.second] = length; 0836 todo0->push(nx.second); 0837 } 0838 } 0839 else 0840 { 0841 if(dist[nx.second] <= length) 0842 { 0843 dist[nx.second] = length + 1; 0844 todo1->push(nx.second); 0845 } 0846 } 0847 } 0848 if(todo0->empty()) 0849 { 0850 ++length; 0851 swap(todo0, todo1); 0852 } 0853 } 0854 return dist[0]; 0855 } 0856 NFA<CharSet>& negate() 0857 { 0858 DFA<CharSet> tmp = dfa(); 0859 tmp.rules.push_back(vector< pair<CharSet, size_t> >()); 0860 tmp.rules.back().push_back(make_pair(CharSet::fullSet(), tmp.nstates)); 0861 ++tmp.nstates; 0862 for(auto i = tmp.accept.begin(); i != tmp.accept.end(); ++i) 0863 { 0864 if(*i == 0) 0865 *i = 1; 0866 else 0867 *i = 0; 0868 } 0869 tmp.accept.push_back(1); 0870 for(auto i = tmp.rules.begin(); i != tmp.rules.end(); ++i) 0871 { 0872 CharSet successSet = CharSet::fullSet(); 0873 for(auto j = i->begin(); j != i->end(); ++j) 0874 { 0875 CharSet tmp = j->first; 0876 tmp.negate(); 0877 successSet.intersect(tmp); 0878 } 0879 if(!successSet.empty()) 0880 i->push_back(make_pair(successSet, tmp.nstates - 1)); 0881 } 0882 tmp.eliminateUnarrivableStates(); 0883 tmp.eliminateInactiveStates(); 0884 return *this = tmp.nfa(); 0885 } 0886 NFA<CharSet>& operator&=(const NFA<CharSet>& o) 0887 { 0888 NFA<CharSet> _o = o; 0889 _o.negate(); 0890 negate(); 0891 *this |= _o; 0892 negate(); 0893 return *this; 0894 } 0895 NFA<CharSet>& operator^=(const NFA<CharSet>& o) 0896 { 0897 negate(); 0898 return (*this |= o).negate(); 0899 } 0900 }; 0901 0902 template<typename CharSet> 0903 NFA< CharSet > DFA<CharSet>::nfa() const 0904 { 0905 NFA<CharSet> res; 0906 if(nstates == 0) 0907 return res; 0908 res.nstates = nstates; 0909 res.rules = rules; 0910 res.accept = nstates; 0911 for(size_t i = 0; i != nstates; ++i) 0912 { 0913 if(nstates + accept[i] > res.nstates) 0914 { 0915 res.nstates = nstates + accept[i]; 0916 res.rules.resize(res.nstates); 0917 } 0918 if(accept[i] != 0) 0919 res.rules[i].push_back(make_pair(CharSet(), nstates + accept[i] - 1)); 0920 } 0921 if(res.nstates == nstates) 0922 { 0923 ++res.nstates; 0924 res.rules.push_back(vector<pair<CharSet, size_t> >()); 0925 } 0926 return res; 0927 } 0928 0929 // notice: utf8/8bit and utf16/ucs2 do not require different NFAs and DFAs 0930 #define EACH_TYPE(macro) \ 0931 switch(GDFA::type) \ 0932 { \ 0933 case SAscii: { typedef SeqCharSet<Ascii> T; macro(s0); } break; \ 0934 case SLatin1: case SUtf8: { typedef SeqCharSet<Latin1> T; macro(s1); } break; \ 0935 case SUcs2: case SUtf16: { typedef SeqCharSet<Ucs2> T; macro(s2); } break; \ 0936 case SUcs4: { typedef SeqCharSet<Ucs4> T; macro(s3); } break; \ 0937 case TAscii: { typedef TableCharSet<Ascii> T; macro(t0); } break; \ 0938 case TLatin1: case TUtf8: { typedef TableCharSet<Latin1> T; macro(t1); } break; \ 0939 case TUcs2: case TUtf16: { typedef TableCharSet<Ucs2> T; macro(t2); } break; \ 0940 default: exit(-1); \ 0941 } 0942 0943 void GDFA::codegen(QTextStream& str) 0944 { 0945 #define macro(x) x->codegen(str); 0946 EACH_TYPE(macro) 0947 #undef macro 0948 } 0949 0950 GDFA& GDFA::minimize() 0951 { 0952 #define macro(x) x->minimize(); 0953 EACH_TYPE(macro) 0954 #undef macro 0955 return *this; 0956 } 0957 0958 void GDFA::setActions(const std::vector< QString >& actions) 0959 { 0960 #define macro(x) x->setActions(actions); 0961 EACH_TYPE(macro) 0962 #undef macro 0963 } 0964 0965 GDFA::GDFA() 0966 { 0967 #define macro(x) x = new DFA<T>; 0968 EACH_TYPE(macro) 0969 #undef macro 0970 } 0971 0972 GDFA::GDFA(const KDevPG::GDFA& o) 0973 { 0974 #define macro(x) x = new DFA<T>(*o.x); 0975 EACH_TYPE(macro) 0976 #undef macro 0977 } 0978 0979 GDFA::~GDFA() 0980 { 0981 #define macro(x) delete x; 0982 EACH_TYPE(macro) 0983 #undef macro 0984 } 0985 0986 GDFA& GDFA::operator=(const KDevPG::GDFA& o) 0987 { 0988 #define macro(x) *x = *o.x; 0989 EACH_TYPE(macro) 0990 #undef macro 0991 return *this; 0992 } 0993 0994 void GDFA::inspect() 0995 { 0996 #define DO_INSPECT(x) x->inspect(); 0997 EACH_TYPE(DO_INSPECT) 0998 #undef DO_INSPECT 0999 } 1000 1001 void GDFA::dotOutput(QTextStream& o, const QString& name) 1002 { 1003 #define DO_DOT(x) x->dotOutput(o, name); 1004 EACH_TYPE(DO_DOT) 1005 #undef DO_DOT 1006 } 1007 1008 GNFA GDFA::nfa() 1009 { 1010 GNFA res; 1011 #define DO_NFA(x) res.x = new NFA<T>(x->nfa()); 1012 EACH_TYPE(DO_NFA) 1013 #undef DO_NFA 1014 return res; 1015 } 1016 1017 GNFA::GNFA() 1018 { 1019 /* no initialization */ 1020 } 1021 1022 GNFA::GNFA(const KDevPG::GNFA& o) 1023 { 1024 #define macro(x) x = new NFA<T>(*o.x); 1025 EACH_TYPE(macro) 1026 #undef macro 1027 } 1028 1029 GNFA::GNFA(const std::vector< GNFA* >& init) 1030 { 1031 #define macro(x) vector< NFA<T> > vec(init.size()); for(size_t i = 0; i != init.size(); ++i) vec[i] = *(init[i]->x); x = new NFA<T>(vec); 1032 EACH_TYPE(macro) 1033 #undef macro 1034 } 1035 1036 GNFA::~GNFA() 1037 { 1038 #define macro(x) delete x; 1039 EACH_TYPE(macro) 1040 #undef macro 1041 } 1042 1043 GNFA& GNFA::operator=(const KDevPG::GNFA& o) 1044 { 1045 #define macro(x) *x = *o.x; 1046 EACH_TYPE(macro) 1047 #undef macro 1048 return *this; 1049 } 1050 1051 bool GNFA::acceptsEpsilon() const 1052 { 1053 #define DO_AE(x) return x->acceptsEpsilon(); 1054 EACH_TYPE(DO_AE) 1055 #undef DO_AE 1056 } 1057 1058 bool GNFA::isEmpty() const 1059 { 1060 #define DO_IE(x) return x->isEmpty(); 1061 EACH_TYPE(DO_IE) 1062 #undef DO_IE 1063 } 1064 1065 bool GNFA::isUnbounded() const 1066 { 1067 #define DO_IU(x) return x->isUnbounded(); 1068 EACH_TYPE(DO_IU) 1069 #undef DO_IU 1070 } 1071 1072 int GNFA::minLength() const 1073 { 1074 #define DO_ML(x) return x->minLength(); 1075 EACH_TYPE(DO_ML) 1076 #undef DO_ML 1077 } 1078 1079 int GNFA::maxLength() const 1080 { 1081 #define DO_ML(x) return x->maxLength(); 1082 EACH_TYPE(DO_ML) 1083 #undef DO_ML 1084 } 1085 1086 GNFA& GNFA::operator<<=(const KDevPG::GNFA& o) 1087 { 1088 #define DO_SHIFT(x) *x <<= *o.x; 1089 EACH_TYPE(DO_SHIFT) 1090 #undef DO_SHIFT 1091 return *this; 1092 } 1093 1094 GNFA& GNFA::operator|=(const KDevPG::GNFA& o) 1095 { 1096 #define DO_OR(x) *x |= *o.x; 1097 EACH_TYPE(DO_OR) 1098 #undef DO_OR 1099 return *this; 1100 } 1101 1102 GNFA& GNFA::operator&=(const KDevPG::GNFA& o) 1103 { 1104 #define DO_AND(x) *x &= *o.x; 1105 EACH_TYPE(DO_AND) 1106 #undef DO_AND 1107 return *this; 1108 } 1109 1110 GNFA& GNFA::operator^=(const KDevPG::GNFA& o) 1111 { 1112 #define DO_BUT(x) *x ^= *o.x; 1113 EACH_TYPE(DO_BUT) 1114 #undef DO_BUT 1115 return *this; 1116 } 1117 1118 GNFA& GNFA::operator*() 1119 { 1120 #define DO_STAR(x) **x; 1121 EACH_TYPE(DO_STAR) 1122 #undef DO_STAR 1123 return *this; 1124 } 1125 1126 GNFA& GNFA::negate() 1127 { 1128 #define DO_NEGATE(x) x->negate(); 1129 EACH_TYPE(DO_NEGATE) 1130 #undef DO_NEGATE 1131 return *this; 1132 } 1133 1134 GNFA& GNFA::minimize() 1135 { 1136 *this = dfa().minimize().nfa(); 1137 return *this; 1138 } 1139 1140 void GNFA::inspect() 1141 { 1142 #define DO_INSPECT(x) x->inspect(); 1143 EACH_TYPE(DO_INSPECT) 1144 #undef DO_INSPECT 1145 } 1146 1147 void GNFA::dotOutput(QTextStream& o, const QString& name) 1148 { 1149 #define DO_DOT(x) x->dotOutput(o, name); 1150 EACH_TYPE(DO_DOT) 1151 #undef DO_DOT 1152 } 1153 1154 GDFA GNFA::dfa() 1155 { 1156 GDFA res; 1157 #define macro(x) *res.x = x->dfa(); 1158 EACH_TYPE(macro) 1159 #undef macro 1160 return res; 1161 } 1162 1163 GNFA GNFA::empty() 1164 { 1165 GNFA res; 1166 #define macro(x) res.x = new NFA<T>; 1167 EACH_TYPE(macro) 1168 #undef macro 1169 return res; 1170 } 1171 1172 GNFA GNFA::word(const QString& str) 1173 { 1174 #define macro(x) \ 1175 GNFA res = GNFA::emptyWord(); \ 1176 QByteArray qba(str.toUtf8()); \ 1177 Codec2FromUtf8Iterator<T::codec>::Result iter(qba); \ 1178 while(iter.hasNext()) \ 1179 { \ 1180 *res.x <<= T(iter.next()); \ 1181 } \ 1182 return res; 1183 EACH_TYPE(macro) 1184 #undef macro 1185 } 1186 1187 GNFA GNFA::collection(const QString& str) 1188 { 1189 GNFA res = GNFA::empty(); 1190 QUtf16ToUcs4Iterator iter(str); 1191 while(iter.hasNext()) 1192 { 1193 quint32 next = iter.next(); 1194 res |= range(next, next+1); 1195 } 1196 return res; 1197 } 1198 1199 GNFA GNFA::anyChar() 1200 { 1201 GNFA res; 1202 #define macro(x) res.x = new NFA<T>(T::fullSet()); 1203 EACH_TYPE(macro) 1204 #undef macro 1205 // remove the surrogate range 1206 if(/*GDFA::type == TUcs2 || */GDFA::type == TUtf16) 1207 { 1208 *res.t2 ^= NFA<TableCharSet<Ucs2> >(TableCharSet<Ucs2>::range(0xd800, 0xe000)); 1209 } 1210 else if(/*GDFA::type == SUcs2 || */GDFA::type == SUtf16) 1211 { 1212 *res.s2 ^= NFA<SeqCharSet<Ucs2> >(SeqCharSet<Ucs2>::range(0xd800, 0xe000)); 1213 } 1214 // add surrogate pairs 1215 if(GDFA::type == SUtf16) 1216 { 1217 NFA<SeqCharSet<Ucs2> > surrpairs(SeqCharSet<Ucs2>::range(0xd800, 0xdc00)); 1218 surrpairs <<= NFA<SeqCharSet<Ucs2> >(SeqCharSet<Ucs2>::range(0xdc00, 0xe000)); 1219 } 1220 if(GDFA::type == TUtf16) 1221 { 1222 NFA<TableCharSet<Ucs2> > surrpairs(TableCharSet<Ucs2>::range(0xd800, 0xdc00)); 1223 surrpairs <<= NFA<TableCharSet<Ucs2> >(TableCharSet<Ucs2>::range(0xdc00, 0xe000)); 1224 } 1225 // all valid utf-8 values 1226 if(GDFA::type == SUtf8) 1227 { 1228 NFA<SeqCharSet<Latin1> > lowerBytes(SeqCharSet<Latin1>::range(0x80, 0xc0)); 1229 NFA<SeqCharSet<Latin1> > ascii(SeqCharSet<Latin1>::range(0, 0x7f)); 1230 NFA<SeqCharSet<Latin1> > topOf2(SeqCharSet<Latin1>::range(0xc0, 0xe0)); 1231 NFA<SeqCharSet<Latin1> > topOf3(SeqCharSet<Latin1>::range(0xe0, 0xf0)); 1232 NFA<SeqCharSet<Latin1> > topOf4(SeqCharSet<Latin1>::range(0xf0, 0x100)); 1233 *res.s1 = (ascii |= (topOf2 <<= lowerBytes) |= ((topOf3 <<= lowerBytes) <<= lowerBytes) |= (((topOf4 <<= lowerBytes) <<= lowerBytes) <<= lowerBytes)); 1234 } 1235 if(GDFA::type == TUtf8) 1236 { 1237 NFA<TableCharSet<Latin1> > lowerBytes(TableCharSet<Latin1>::range(0x80, 0xc0)); 1238 NFA<TableCharSet<Latin1> > ascii(TableCharSet<Latin1>::range(0, 0x7f)); 1239 NFA<TableCharSet<Latin1> > topOf2(TableCharSet<Latin1>::range(0xc0, 0xe0)); 1240 NFA<TableCharSet<Latin1> > topOf3(TableCharSet<Latin1>::range(0xe0, 0xf0)); 1241 NFA<TableCharSet<Latin1> > topOf4(TableCharSet<Latin1>::range(0xf0, 0x100)); 1242 *res.t1 = (ascii |= (topOf2 <<= lowerBytes) |= ((topOf3 <<= lowerBytes) <<= lowerBytes) |= (((topOf4 <<= lowerBytes) <<= lowerBytes) <<= lowerBytes)); 1243 } 1244 return res; 1245 } 1246 1247 GNFA GNFA::anything() 1248 { 1249 GNFA any = anyChar(); 1250 return *any; 1251 } 1252 1253 GNFA GNFA::emptyWord() 1254 { 1255 GNFA res; 1256 #define macro(x) res.x = new NFA<T>(NFA<T>::emptyWord()); 1257 EACH_TYPE(macro) 1258 #undef macro 1259 return res; 1260 } 1261 1262 #include "generated-kdev-utf8-tuples.h" 1263 1264 template<template<CharEncoding> class CharSet> 1265 void addUtf8Range(NFA<CharSet<Latin1> >& res, quint32 begin, quint32 end) 1266 { 1267 typedef typename Codec2Int<Latin1>::Result Int; 1268 Int from0, from1, from2, from3, to0, to1, to2, to3; 1269 if(begin < 0x80) 1270 { 1271 if(end <= 0x80) 1272 { 1273 res |= getUtf8Tuples<CharSet>(Int(begin), Int(end)); 1274 return; 1275 } 1276 res |= getUtf8Tuples<CharSet>(Int(begin), Int(0x80)); 1277 begin = 0x80; 1278 } 1279 if(begin < 0x800) 1280 { 1281 from1 = (begin & 0x3f) + 0x80; 1282 from0 = (begin >> 6) + 0xc0; 1283 Int to0, to1; 1284 if(end <= 0x800) 1285 { 1286 to1 = (end & 0x3f) + 0x80; 1287 to0 = (end >> 6) + 0xc0; 1288 } 1289 else 1290 { 1291 to1 = 0x80 + (1<<6); 1292 to0 = 0xc0 + (1<<5); 1293 } 1294 res |= getUtf8Tuples<CharSet>(from0, to0, from1, to1); 1295 if(end <= 0x800) 1296 return; 1297 begin = 0x800; 1298 } 1299 if(begin < 0x10000) 1300 { 1301 from2 = (begin & 0x3f) + 0x80; 1302 from1 = ((begin >> 6) & 0x3f) + 0x80; 1303 from0 = (begin >> 12) + 0xe0; 1304 if(end <= 0x10000) 1305 { 1306 to2 = (end & 0x3f) + 0x80; 1307 to1 = ((end >> 6) & 0x3f) + 0x80; 1308 to0 = (end >> 12) + 0xe0; 1309 } 1310 else 1311 { 1312 to2 = to1 = 0x80 + (1<<6); 1313 to0 = 0xe0 + (1 << 4); 1314 } 1315 res |= getUtf8Tuples<CharSet>(from0, to0, from1, to1, from2, to2); 1316 if(end <= 0x10000) 1317 return; 1318 begin = 0x10000; 1319 } 1320 from3 = (begin & 0x3f) + 0x80; 1321 from2 = ((begin >> 6) & 0x3f) + 0x80; 1322 from1 = ((begin >> 12) & 0x3f) + 0x80; 1323 from0 = (begin >> 18) + 0xf0; 1324 to3 = (end & 0x3f) + 0x80; 1325 to2 = ((end >> 6) & 0x3f) + 0x80; 1326 to1 = ((end >> 12) & 0x3f) + 0x80; 1327 to0 = (end >> 18) + 0xf0; 1328 res |= getUtf8Tuples<CharSet>(from0, to0, from1, to1, from2, to2, from3, to3); 1329 } 1330 1331 GNFA GNFA::range(quint32 begin, quint32 end) 1332 { 1333 // Utf32: take it 1334 // Latin1, Ascii, Ucs2: cut it 1335 // Utf8, Utf32: complicate 1336 GNFA res; 1337 switch(GDFA::type) 1338 { 1339 case SAscii: { res.s0 = new NFA<SeqCharSet<Ascii> >(begin >= 0x80 ? SeqCharSet<Ascii>::emptySet() : SeqCharSet<Ascii>::range(begin, min((quint32)0x80, end))); } break; 1340 case SLatin1: { res.s1 = new NFA<SeqCharSet<Latin1> >(begin >= 0x100 ? SeqCharSet<Latin1>::emptySet() : SeqCharSet<Latin1>::range(begin, min((quint32)0x100, end))); } break; 1341 case SUtf8: { 1342 res.s1 = new NFA<SeqCharSet<Latin1> >(); 1343 addUtf8Range(*res.s1, begin, end); 1344 } break; 1345 case SUcs2: { 1346 res.s2 = new NFA<SeqCharSet<Ucs2> >(begin >= 0x10000 ? SeqCharSet<Ucs2>::emptySet() : SeqCharSet<Ucs2>::range(begin, min((quint32)0x10000, end))/*.difference(SeqCharSet<Ucs2>::range(0xd800, 0xe000))*/); 1347 } break; 1348 case SUtf16: { 1349 #define UTF16_IMPL(CS, field) \ 1350 if(end >= 0x110000 || begin >= 0x110000) \ 1351 { \ 1352 res.field = new NFA<CS<Ucs2> >(); \ 1353 } \ 1354 else if(end < 0x10000) \ 1355 { \ 1356 if(begin <= 0xd800) \ 1357 { \ 1358 if(end <= begin) \ 1359 { \ 1360 res.field = new NFA<CS<Ucs2> >(); \ 1361 } \ 1362 else if(end > 0xe000) \ 1363 { \ 1364 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(begin, 0xd800).getUnion(CS<Ucs2>::range(0xe000, end))); \ 1365 } \ 1366 else \ 1367 { \ 1368 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(begin, min(end, (quint32)0xd800))); \ 1369 }\ 1370 }\ 1371 else\ 1372 {\ 1373 auto mbegin = max(begin, (quint32)0xe000);\ 1374 if(end <= mbegin)\ 1375 {\ 1376 res.field = new NFA<CS<Ucs2> >();\ 1377 }\ 1378 else\ 1379 {\ 1380 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(mbegin, end));\ 1381 }\ 1382 }\ 1383 }\ 1384 else if(begin >= 0x10000)\ 1385 {\ 1386 if(end <= begin)\ 1387 {\ 1388 res.field = new NFA<CS<Ucs2> >();\ 1389 }\ 1390 else\ 1391 {\ 1392 quint32 startHighSurrogate = (((begin - 0x10000) & 0xffc00) >> 10) + 0xd800;\ 1393 quint32 startLowSurrogate = ((begin - 0x10000) & 0x3ff) + 0xdc00;\ 1394 quint32 endHighSurrogate = (((end - 0x10000) & 0xffc00) >> 10) + 0xd800;\ 1395 quint32 endLowSurrogate = ((end - 0x10000) & 0x3ff) + 0xdc00;\ 1396 if(endLowSurrogate == 0xdc00)\ 1397 {\ 1398 if(startLowSurrogate == 0xdc00)\ 1399 {\ 1400 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\ 1401 *res.field <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(0xdc00, 0xe000));\ 1402 }\ 1403 else\ 1404 {\ 1405 auto firstPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, startHighSurrogate + 1));\ 1406 firstPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\ 1407 res.field = new NFA<CS<Ucs2> >(firstPairs);\ 1408 if(startHighSurrogate < endHighSurrogate)\ 1409 {\ 1410 auto surrPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate + 1, endHighSurrogate + 1));\ 1411 surrPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(0xdc00, 0xe000));\ 1412 *res.field |= surrPairs;\ 1413 }\ 1414 }\ 1415 }\ 1416 else\ 1417 {\ 1418 if(startLowSurrogate == 0xdc00)\ 1419 {\ 1420 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\ 1421 }\ 1422 else\ 1423 {\ 1424 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, startHighSurrogate + 1));\ 1425 *res.field <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, (startHighSurrogate == endHighSurrogate ? endLowSurrogate : 0xe000)));\ 1426 if(startHighSurrogate != endHighSurrogate)\ 1427 {\ 1428 if(startHighSurrogate + 1 != endHighSurrogate)\ 1429 {\ 1430 auto midPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate + 1, endHighSurrogate));\ 1431 midPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\ 1432 *res.field |= midPairs;\ 1433 }\ 1434 auto lastPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(endHighSurrogate, endHighSurrogate + 1));\ 1435 lastPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, endLowSurrogate));\ 1436 *res.field |= lastPairs;\ 1437 }\ 1438 }\ 1439 }\ 1440 }\ 1441 }\ 1442 else\ 1443 {\ 1444 if(end <= begin)\ 1445 {\ 1446 res.field = new NFA<CS<Ucs2> >();\ 1447 }\ 1448 else\ 1449 {\ 1450 if(begin < 0xd800)\ 1451 {\ 1452 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(begin, 0xd800).getUnion(CS<Ucs2>::range(0xe000, 0x10000)));\ 1453 }\ 1454 else\ 1455 {\ 1456 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(max(begin, 0xe000u), 0x10000));\ 1457 }\ 1458 quint32 startHighSurrogate = 0xd800;\ 1459 quint32 startLowSurrogate = 0xdc00;\ 1460 quint32 endHighSurrogate = (((end - 0x10000) & 0xffc00) >> 10) + 0xd800;\ 1461 quint32 endLowSurrogate = ((end - 0x10000) & 0x3ff) + 0xdc00;\ 1462 if(endLowSurrogate == 0xdc00)\ 1463 {\ 1464 auto surrPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\ 1465 surrPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\ 1466 *res.field |= surrPairs;\ 1467 }\ 1468 else\ 1469 {\ 1470 if(endHighSurrogate > startHighSurrogate)\ 1471 {\ 1472 auto midPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\ 1473 midPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\ 1474 *res.field |= midPairs;\ 1475 }\ 1476 auto lastPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(endHighSurrogate, endHighSurrogate + 1));\ 1477 lastPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, endLowSurrogate));\ 1478 *res.field |= lastPairs;\ 1479 }\ 1480 }\ 1481 } 1482 UTF16_IMPL(SeqCharSet, s2) 1483 } break; 1484 case SUcs4: { res.s3 = new NFA<SeqCharSet<Ucs4> >(SeqCharSet<Ucs4>::range(begin, end)); } break; 1485 case TAscii: { res.t0 = new NFA<TableCharSet<Ascii> >(begin >= 0x80 ? TableCharSet<Ascii>::emptySet() : TableCharSet<Ascii>::range(begin, min((quint32)0x80, end))); } break; 1486 case TLatin1: { res.t1 = new NFA<TableCharSet<Latin1> >(begin >= 0x100 ? TableCharSet<Latin1>::emptySet() : TableCharSet<Latin1>::range(begin, min((quint32)0x100, end))); } break; 1487 case TUtf8: { 1488 res.t1 = new NFA<TableCharSet<Latin1> >(); 1489 addUtf8Range(*res.t1, begin, end); 1490 } break; 1491 case TUcs2: { 1492 res.t2 = new NFA<TableCharSet<Ucs2> >(begin >= 0x10000 ? TableCharSet<Ucs2>::emptySet() : TableCharSet<Ucs2>::range(begin, min((quint32)0x10000, end))/*.difference(TableCharSet<Ucs2>::range(0xd800, 0xe000))*/); 1493 } 1494 case TUtf16: { 1495 UTF16_IMPL(TableCharSet, t2) 1496 #undef UTF16_IMPL 1497 } break; 1498 default: exit(-1); 1499 } 1500 // TODO: warning if empty 1501 return res; 1502 } 1503 1504 GNFA GNFA::character(quint32 codepoint) 1505 { 1506 return range(codepoint, codepoint+1); 1507 } 1508 1509 void deleteNFA(GNFA *ptr) 1510 { 1511 delete ptr; 1512 } 1513 1514 void deleteDFA(GDFA *ptr) 1515 { 1516 delete ptr; 1517 } 1518 1519 AutomatonType GDFA::type = SUcs2; 1520 1521 template<CharEncoding enc> 1522 const typename TableCharSet<enc>::BitArray TableCharSet<enc>::nullData = bitset<TableCharSet<enc>::nChars>(); 1523 1524 }