File indexing completed on 2024-04-28 15:54:30
0001 /* This file is part of kdev-pg-qt 0002 Copyright (C) 2010 Jonathan Schmidt-Dominé <devel@the-user.org> 0003 0004 This library is free software; you can redistribute it and/or 0005 modify it under the terms of the GNU Library General Public 0006 License as published by the Free Software Foundation; either 0007 version 2 of the License, or (at your option) any later version. 0008 0009 This library is distributed in the hope that it will be useful, 0010 but WITHOUT ANY WARRANTY; without even the implied warranty of 0011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0012 Library General Public License for more details. 0013 0014 You should have received a copy of the GNU Library General Public License 0015 along with this library; see the file COPYING.LIB. If not, write to 0016 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0017 Boston, MA 02110-1301, USA. 0018 */ 0019 0020 #include "kdev-pg-char-sets.h" 0021 #include "kdev-pg-regexp.h" 0022 #include "kdev-pg.h" 0023 #include "kdev-pg-regexp-helper.h" 0024 #include "kdev-pg-bit-array.h" 0025 #include <iostream> 0026 #include <queue> 0027 #include <stack> 0028 #include <QHash> 0029 #include <unordered_set> 0030 #include <unordered_map> 0031 0032 using namespace std; 0033 0034 // TODO: Lookahead operator 0035 // TODO: Line-End etc. 0036 // TODO: Deactivate/activate rules (have to keep the bitsets of possible final states) 0037 0038 #define q_Hash_to_tr1_hash(type) \ 0039 namespace std \ 0040 { \ 0041 template<> struct hash< type > \ 0042 { \ 0043 inline size_t operator()(const type &x) const \ 0044 { \ 0045 return qHash(x); \ 0046 } \ 0047 }; \ 0048 } 0049 0050 #if (QT_VERSION < QT_VERSION_CHECK(5, 14, 0)) 0051 // Qt >= 5.14 already has std::hash for QBitArray 0052 q_Hash_to_tr1_hash(QBitArray) 0053 #endif 0054 0055 namespace KDevPG 0056 { 0057 0058 typedef BitArray UsedBitArray; 0059 typedef QUtf8ToUcs4Iterator Iterator; 0060 // typedef TableCharSet<Ascii> CharSet; 0061 0062 template<typename CharSet> class NFA; 0063 0064 inline QString codeForDot(QString str) 0065 { 0066 QString out = ""; 0067 int pos = 0; 0068 forever 0069 { 0070 int npos = str.indexOf("\n\01!ASIgnore\"!!\n# ", pos); 0071 if(npos == -1) 0072 { 0073 out += str.midRef(pos); 0074 break; 0075 } 0076 out += str.midRef(pos, npos - pos); 0077 int nlpos = str.indexOf('\n', npos + 17); 0078 int codeendpos = str.indexOf("\n\01!AS/Ignore\"!!\n", nlpos); 0079 if(nlpos == -1 || codeendpos == -1) 0080 { 0081 out += "<junk>"; 0082 break; 0083 } 0084 out += str.midRef(nlpos + 1, codeendpos - nlpos - 1); 0085 pos = codeendpos + 17; 0086 } 0087 return out.replace('\"', "\\\"").replace('\n', '\t').trimmed(); 0088 } 0089 0090 /** 0091 * Deterministic finite automaton. 0092 */ 0093 template<typename CS> 0094 class DFA 0095 { 0096 public: 0097 typedef CS CharSet; 0098 private: 0099 size_t nstates; 0100 vector<vector<pair<CharSet, size_t> > > rules; 0101 size_t numActions; 0102 vector<size_t> accept; 0103 vector<QString> actions; 0104 friend class NFA<CharSet>; 0105 public: 0106 /// Code used for the detected tokens in the generated code 0107 void setActions(const vector<QString>& _actions) 0108 { 0109 actions = _actions; 0110 numActions = actions.size() - 1; 0111 } 0112 /// Generate code for transitions 0113 void codegen(QTextStream& str) 0114 { 0115 str << "goto _state_0; // no warning about unused label\n"; 0116 CharSetCondition<CharSet> csc; 0117 for(size_t i = 0; i != nstates; ++i) 0118 { 0119 str << "\n_state_" + QString::number(i) + ":\n"; 0120 if(accept[i]) 0121 str << "lpos = lxCURR_POS; lstate = " << accept[i] << ";\n"; 0122 str << "lxNEXT_CHR(chr); "; 0123 csc(str, rules[i]); 0124 } 0125 str << "_end:\nplain() = lpos;\nswitch(lstate) {\n"; 0126 for(size_t i = 0; i <= numActions; ++i) 0127 { 0128 str << "case " << QString::number(i) << ": "; 0129 if(i == 0) 0130 str << "goto _fail; // no warning about unused label\n_fail: "; 0131 str << "{" << actions[i] << "\nlxFINISH\n}\n"; 0132 } 0133 str << "}\n"; 0134 } 0135 /// .dot-output 0136 void dotOutput(QTextStream& out, const QString& name) 0137 { 0138 out << "digraph " << name << "{" << endl; 0139 for(size_t i = 0; i != nstates; ++i) 0140 { 0141 out << "s" << i << " [ label = \"" << i << "\""; 0142 if(i == 0) 0143 out << ", shape=rect, style=rounded"; 0144 else 0145 out << ", shape=oval"; 0146 if(accept[i] != 0) 0147 out << ", penwidth=4"; 0148 out << " ];" << endl; 0149 } 0150 for(size_t i = 1; i <= numActions; ++i) 0151 { 0152 out << "f" << i << " [ label = \"" << codeForDot(actions[i]) << "\", shape=rect, penwidth=2 ];" << endl; 0153 } 0154 for(size_t i = 0; i != nstates; ++i) 0155 { 0156 for(auto j = rules[i].begin(); j != rules[i].end(); ++j) 0157 { 0158 out << "s" << i << " -> " << "s" << j->second << " [ label = \"" << j->first << "\" ];" << endl; 0159 } 0160 if(accept[i] != 0) 0161 { 0162 out << "s" << i << " -> " << "f" << accept[i] << ";" << endl; 0163 } 0164 } 0165 out << "}" << endl; 0166 } 0167 /// Debugging output 0168 void inspect() 0169 { 0170 cout << "#states = " << nstates << " accept = <"; 0171 for(size_t i = 0; i != nstates; ++i) 0172 cout << ", " << accept[i]; 0173 cout << "> rules = {"; 0174 for(size_t i = 0; i != nstates; ++i) 0175 { 0176 cout << i << " = {"; 0177 for(size_t j = 0; j != rules[i].size(); ++j) 0178 cout << rules[i][j].first << " = " << rules[i][j].second << ", "; 0179 cout << "} "; 0180 } 0181 cout << "}" << endl; 0182 } 0183 /// Test if the string matches the regexp 0184 size_t accepts(const QString& str) 0185 { 0186 size_t state = 0; 0187 foreach(const QChar c, str) 0188 { 0189 foreach(const auto& r, rules[state]) 0190 { 0191 if(r.first.accepts(c.unicode())) // make it more portable!! 0192 { 0193 state = r.second; 0194 goto success; 0195 } 0196 } 0197 return false; 0198 success:; 0199 } 0200 return accept[state]; 0201 } 0202 /// Can be used to test tokenization 0203 pair<Iterator, size_t> nextAction(Iterator iter) 0204 { 0205 Iterator npos = iter; 0206 size_t action = -1; 0207 size_t state = 0; 0208 while(iter.hasNext()) 0209 { 0210 if(accept[state]) 0211 { 0212 npos = iter; 0213 action = accept[state]; 0214 } 0215 quint64 chr = iter.next(); // make it more portable!! 0216 foreach(const auto& r, rules[state]) 0217 { 0218 if(r.first.accepts(chr)) 0219 { 0220 state = r.second; 0221 goto success; 0222 } 0223 } 0224 return make_pair(npos, action); 0225 success:; 0226 } 0227 if(accept[state]) 0228 { 0229 npos = iter; 0230 action = accept[state]; 0231 } 0232 return make_pair(npos, action); 0233 } 0234 /// Remove each state x iff ¬active[x] 0235 DFA<CharSet>& filterStates(const UsedBitArray& active) 0236 { 0237 size_t curr = 0; 0238 vector<size_t> mapping(nstates); 0239 for(size_t i = 0; i != nstates; ++i) 0240 { 0241 if(active[i]) 0242 { 0243 rules[curr] = rules[i]; 0244 accept[curr] = accept[i]; 0245 mapping[i] = curr; 0246 ++curr; 0247 } 0248 } 0249 rules.resize(curr); 0250 accept.resize(curr); 0251 nstates = curr; 0252 for(size_t i = 0; i != nstates; ++i) 0253 { 0254 auto orules = rules[i]; 0255 rules[i].clear(); 0256 for(auto j = orules.begin(); j != orules.end(); ++j) 0257 { 0258 if(active[j->second]) 0259 rules[i].push_back(make_pair(j->first, mapping[j->second])); 0260 } 0261 } 0262 return *this; 0263 } 0264 /// Eliminate each state unarrivable from the start 0265 DFA<CharSet>& eliminateUnarrivableStates() 0266 { 0267 UsedBitArray arrivable(nstates); 0268 arrivable[0] = true; 0269 stack<size_t> todo; 0270 todo.push(0); 0271 while(todo.size()) 0272 { 0273 size_t curr = todo.top(); 0274 todo.pop(); 0275 foreach(const auto& i, rules[curr]) 0276 { 0277 if(!arrivable[i.second]) 0278 { 0279 arrivable[i.second] = true; 0280 todo.push(i.second); 0281 } 0282 } 0283 } 0284 return filterStates(arrivable); 0285 } 0286 /// Eliminate each state iff no final state is arivable from it 0287 DFA<CharSet>& eliminateInactiveStates() 0288 { 0289 UsedBitArray active(nstates); 0290 for(size_t i = 0; i != nstates; ++i) 0291 if(accept[i]) 0292 active[i] = true; 0293 bool changed; 0294 do 0295 { 0296 changed = false; 0297 for(size_t i = 0; i != nstates; ++i) 0298 { 0299 if(!active[i]) 0300 { 0301 for(size_t j = 0; j != rules[i].size(); ++j) 0302 { 0303 if(active[rules[i][j].second]) 0304 { 0305 active[i] = true; 0306 changed = true; 0307 break; 0308 } 0309 } 0310 } 0311 } 0312 } while(changed); 0313 return filterStates(active); 0314 } 0315 /// followers in same group for every input 0316 bool sameGroup(size_t ngroups, const vector<size_t>& ogroups, size_t x, size_t y) 0317 { 0318 vector<CharSet> xsets(ngroups), ysets(ngroups); 0319 foreach(const auto& i, rules[x]) 0320 xsets[ogroups[i.second]].unite(i.first); 0321 foreach(const auto& i, rules[y]) 0322 ysets[ogroups[i.second]].unite(i.first); 0323 return xsets == ysets; 0324 } 0325 DFA<CharSet>& minimize() 0326 { 0327 eliminateUnarrivableStates(); 0328 eliminateInactiveStates(); 0329 vector<vector<size_t> > grinv(numActions + 1); 0330 for(size_t i = 0; i != nstates; ++i) 0331 grinv[(accept[i] - accept[0] + grinv.size()) % grinv.size()].push_back(i); 0332 size_t ongroups; 0333 vector<size_t> groups(nstates); 0334 do 0335 { 0336 for(size_t i = 0; i != grinv.size(); ++i) 0337 for(size_t j = 0; j != grinv[i].size(); ++j) 0338 groups[grinv[i][j]] = i; 0339 ongroups = grinv.size(); 0340 grinv.clear(); 0341 for(size_t i = 0; i != nstates; ++i) 0342 { 0343 for(size_t j = 0; j != grinv.size(); ++j) 0344 { 0345 if(groups[grinv[j][0]] == groups[i] && sameGroup(ongroups, groups, grinv[j][0], i)) 0346 { 0347 grinv[j].push_back(i); 0348 goto found; 0349 } 0350 } 0351 grinv.resize(grinv.size()+1); 0352 grinv.back().push_back(i); 0353 found:; 0354 } 0355 } 0356 while(ongroups != grinv.size()); 0357 vector<vector<pair<CharSet, size_t> > > _rules(grinv.size()); 0358 vector<size_t> _accept(grinv.size()); 0359 for(size_t i = 0; i != nstates; ++i) 0360 { 0361 _accept[groups[i]] = accept[i]; 0362 foreach(const auto& j, rules[i]) 0363 _rules[groups[i]].push_back(make_pair(j.first, groups[j.second])); 0364 } 0365 rules = _rules; 0366 nstates = grinv.size(); 0367 accept = _accept; 0368 for(size_t i = 0; i != nstates; ++i) 0369 { 0370 vector<CharSet> mapping(nstates, CharSet::emptySet()); 0371 foreach(const auto& j, rules[i]) 0372 mapping[j.second].unite(j.first); 0373 rules[i].clear(); 0374 for(size_t j = 0; j != nstates; ++j) 0375 if(!mapping[j].empty()) 0376 rules[i].push_back(make_pair(mapping[j], j)); 0377 } 0378 eliminateUnarrivableStates(); 0379 eliminateInactiveStates(); 0380 return *this; 0381 } 0382 NFA<CharSet> nfa() const; 0383 }; 0384 0385 template<typename CS> 0386 class NFA 0387 { 0388 public: 0389 typedef CS CharSet; 0390 private: 0391 size_t nstates; 0392 vector<vector<pair<CharSet, size_t> > > rules; 0393 size_t accept; 0394 friend class DFA<CharSet>; 0395 public: 0396 void inspect() 0397 { 0398 cout << "#states = " << nstates << " accept = " << accept << " rules = {"; 0399 for(size_t i = 0; i != nstates; ++i) 0400 { 0401 cout << i << " = {"; 0402 for(size_t j = 0; j != rules[i].size(); ++j) 0403 cout << rules[i][j].first << " = " << rules[i][j].second << ", "; 0404 cout << "} "; 0405 } 0406 cout << "}" << endl; 0407 } 0408 void dotOutput(QTextStream& out, const QString& name) 0409 { 0410 out << "digraph " << name << "{" << endl; 0411 for(size_t i = 0; i != nstates; ++i) 0412 { 0413 out << "s" << i << " [ label = \"" << i << "\""; 0414 if(i == 0) 0415 out << ", shape=rect, style=rounded"; 0416 else 0417 out << ", shape=oval"; 0418 if(i >= accept) 0419 out << ", penwidth=4"; 0420 out << " ];" << endl; 0421 } 0422 for(size_t i = 0; i != nstates; ++i) 0423 { 0424 for(auto j = rules[i].begin(); j != rules[i].end(); ++j) 0425 { 0426 out << "s" << i << " -> " << "s" << j->second << " [ label = \"" << j->first << "\" ];" << endl; 0427 } 0428 } 0429 out << "}" << endl; 0430 } 0431 /** 0432 * Accepts no words. 0433 */ 0434 NFA() : nstates(2), rules(2), accept(1) 0435 {} 0436 static NFA<CharSet> emptyWord() 0437 { 0438 NFA<CharSet> res; 0439 res.nstates = 1; 0440 res.rules.resize(1); 0441 res.accept = 0; 0442 return res; 0443 } 0444 NFA(const NFA<CharSet>& o) : nstates(o.nstates), rules(o.rules), accept(o.accept) 0445 {} 0446 NFA(const CharSet& set) : nstates(2), rules(2), accept(1) 0447 { 0448 rules[0].push_back(make_pair(set, 1)); 0449 } 0450 explicit NFA(const std::vector<NFA<CharSet> >& list) 0451 { 0452 if(list.size() == 0) 0453 { 0454 nstates = 2; 0455 rules.resize(2); 0456 accept = 1; 0457 } 0458 else 0459 { 0460 accept = 1; 0461 for(size_t i = 0; i != list.size(); ++i) 0462 accept += list.begin()[i].nstates; 0463 nstates = accept + list.size(); 0464 rules.resize(nstates); 0465 size_t start = 1; 0466 for(size_t i = 0; i != list.size(); ++i) 0467 { 0468 for(size_t j = start; j != start + list.begin()[i].nstates; ++j) 0469 { 0470 rules[j] = list.begin()[i].rules[j - start]; 0471 for(size_t k = 0; k != rules[j].size(); ++k) 0472 rules[j][k].second += start; 0473 } 0474 rules[0].push_back(make_pair(CharSet(), start)); 0475 start += list.begin()[i].nstates; 0476 rules[start - 1].push_back(make_pair(CharSet(), nstates - list.size() + i)); 0477 } 0478 } 0479 } 0480 void adjust(size_t startnum) 0481 { 0482 accept += startnum; 0483 nstates += startnum; 0484 rules.resize(nstates); 0485 for(size_t i = nstates; i != startnum; ) 0486 { 0487 --i; 0488 rules[i] = rules[i - startnum]; 0489 for(size_t j = 0; j != rules[i].size(); ++j) 0490 rules[i][j].second += startnum; 0491 } 0492 for(size_t i = 0; i != startnum; ++i) 0493 rules[i].clear(); 0494 } 0495 NFA<CharSet>& operator|=(const NFA<CharSet>& o) 0496 { 0497 adjust(1); 0498 rules.resize(o.nstates+nstates+2); 0499 for(size_t i = nstates; i != nstates + o.nstates; ++i) 0500 { 0501 rules[i] = o.rules[i - nstates]; 0502 for(size_t j = 0; j != rules[i].size(); ++j) 0503 rules[i][j].second += nstates; 0504 } 0505 rules[0].push_back(make_pair(CharSet(), 1)); 0506 rules[0].push_back(make_pair(CharSet(), nstates)); 0507 nstates += o.nstates; 0508 rules[accept].push_back(make_pair(CharSet(), nstates)); 0509 rules[nstates-1].push_back(make_pair(CharSet(), nstates)); 0510 accept = nstates; 0511 ++nstates; 0512 return *this; 0513 } 0514 NFA<CharSet>& operator<<=(const NFA<CharSet>& o) 0515 { 0516 rules.resize(nstates + o.nstates + 1); 0517 rules[accept].push_back(make_pair(CharSet(), nstates)); 0518 accept = o.accept + nstates; 0519 for(size_t i = nstates; i != nstates+o.nstates; ++i) 0520 { 0521 rules[i] = o.rules[i-nstates]; 0522 for(size_t j = 0; j != rules[i].size(); ++j) 0523 rules[i][j].second += nstates; 0524 } 0525 nstates += o.nstates; 0526 return *this; 0527 } 0528 NFA<CharSet>& operator*() 0529 { 0530 rules[accept].push_back(make_pair(CharSet(), 0)); 0531 rules[0].push_back(make_pair(CharSet(), nstates)); 0532 accept = nstates; 0533 ++nstates; 0534 rules.resize(nstates); 0535 return *this; 0536 } 0537 UsedBitArray closure(const UsedBitArray& states) 0538 { 0539 UsedBitArray s(states); 0540 assert(s.size() == nstates); 0541 stack<size_t> todo; 0542 for(size_t i = 0; i != nstates; ++i) 0543 if(s[i]) 0544 todo.push(i); 0545 while(todo.size()) 0546 { 0547 size_t x = todo.top(); 0548 todo.pop(); 0549 for(size_t i = 0; i != rules[x].size(); ++i) 0550 { 0551 if(rules[x][i].first.epsilon() && !s[rules[x][i].second]) 0552 { 0553 s[rules[x][i].second] = true; 0554 todo.push(rules[x][i].second); 0555 } 0556 } 0557 } 0558 return s; 0559 } 0560 UsedBitArray vecUnion(const UsedBitArray& a, const UsedBitArray& b) 0561 { 0562 UsedBitArray res(a); 0563 for(size_t i = 0; i != a.size(); ++i) 0564 res[i] = res[i] || b[i]; 0565 return res; 0566 } 0567 vector< pair<CharSet, UsedBitArray > > follow(const UsedBitArray& states) 0568 { 0569 UsedBitArray s(states); 0570 vector<pair<CharSet, UsedBitArray > > pr(nstates); 0571 for(size_t i = 0; i != nstates; ++i) 0572 { 0573 if(s[i]) 0574 { 0575 foreach(const auto& r, rules[i]) 0576 { 0577 UsedBitArray nxt(nstates); 0578 nxt[r.second] = true; 0579 pr.push_back(make_pair(r.first, nxt)); 0580 } 0581 } 0582 } 0583 bool b = true; 0584 while(b) 0585 { 0586 b = false; 0587 for(size_t i = 0; i != pr.size(); ++i) 0588 { 0589 if(!pr[i].first.epsilon()) 0590 { 0591 for(size_t j = 0; j != i; ++j) 0592 { 0593 if(!pr[j].first.epsilon()) 0594 { 0595 CharSet inter = pr[i].first.intersection(pr[j].first); 0596 if(!inter.empty()) 0597 { 0598 CharSet diff1 = pr[i].first.difference(inter); 0599 CharSet diff2 = pr[j].first.difference(inter); 0600 UsedBitArray v1 = pr[i].second, v2 = pr[j].second; 0601 pr[i].first = inter; 0602 pr[i].second = vecUnion(pr[i].second, pr[j].second); 0603 if(diff1.empty()) 0604 { 0605 if(diff2.empty()) 0606 { 0607 swap(pr[j], pr.back()); 0608 pr.pop_back(); 0609 } 0610 else 0611 { 0612 pr[j].first = diff2; 0613 pr[j].second = v2; 0614 } 0615 } 0616 else 0617 { 0618 pr[j].first = diff1; 0619 pr[j].second = v1; 0620 if(!diff2.empty()) 0621 pr.push_back(make_pair(diff2, v2)); 0622 } 0623 b = true; 0624 goto retry; 0625 } 0626 } 0627 } 0628 } 0629 } 0630 retry:; 0631 } 0632 return pr; 0633 } 0634 DFA<CharSet> dfa() 0635 { 0636 unordered_set<UsedBitArray> states; 0637 unordered_map<UsedBitArray, vector<pair<CharSet, UsedBitArray > > > rules; 0638 stack<UsedBitArray > todo; 0639 UsedBitArray start(nstates); 0640 start[0] = true; 0641 start = closure(start); 0642 todo.push(start); 0643 while(todo.size()) 0644 { 0645 UsedBitArray x = todo.top(); 0646 todo.pop(); 0647 states.insert(x); 0648 rules[x]; 0649 vector<pair<CharSet, UsedBitArray > > nxt = follow(x); 0650 foreach(const auto& nx, nxt) 0651 { 0652 if(!nx.first.epsilon()) 0653 { 0654 UsedBitArray n = closure(nx.second); 0655 if(!states.count(n)) 0656 todo.push(n); 0657 rules[x].push_back(make_pair(nx.first, n)); 0658 } 0659 } 0660 } 0661 DFA<CharSet> _; 0662 _.nstates = states.size(); 0663 _.accept.resize(_.nstates); 0664 _.numActions = nstates - accept; 0665 _.actions.resize(_.numActions + 1); 0666 unordered_map<UsedBitArray, size_t> st; 0667 size_t cnt = 0; 0668 foreach(const UsedBitArray& i, states) 0669 { 0670 if(i == start && cnt) 0671 { 0672 st[*states.begin()] = cnt; 0673 st[i] = 0; 0674 } 0675 else 0676 st[i] = cnt; 0677 ++cnt; 0678 } 0679 _.rules.resize(_.nstates); 0680 for(auto i = st.begin(); i != st.end(); ++i) 0681 { 0682 for(size_t j = accept; j != nstates; ++j) 0683 { 0684 if(i->first[j]) 0685 { 0686 _.accept[i->second] = j - accept + 1; 0687 break; 0688 } 0689 } 0690 } 0691 for(auto i = rules.begin(); i != rules.end(); ++i) 0692 { 0693 size_t key = st[i->first]; 0694 foreach(const auto& nx, i->second) 0695 _.rules[key].push_back(make_pair(nx.first, st[nx.second])); 0696 } 0697 return _; 0698 } 0699 bool acceptsEpsilon() const 0700 { 0701 stack<size_t> todo; 0702 todo.push(0); 0703 UsedBitArray vis(nstates); 0704 vis[0] = true; 0705 while(!todo.empty()) 0706 { 0707 size_t curr = todo.top(); 0708 todo.pop(); 0709 if(curr >= accept) 0710 return true; 0711 foreach(const auto& nx, rules[curr]) 0712 { 0713 if(nx.first.epsilon() && !vis[nx.second]) 0714 { 0715 vis[nx.second] = true; 0716 todo.push(nx.second); 0717 } 0718 } 0719 } 0720 return false; 0721 } 0722 bool isEmpty() const 0723 { 0724 stack<size_t> todo; 0725 todo.push(0); 0726 UsedBitArray vis(nstates); 0727 vis[0] = true; 0728 while(!todo.empty()) 0729 { 0730 size_t curr = todo.top(); 0731 todo.pop(); 0732 if(curr >= accept) 0733 return false; 0734 foreach(const auto& nx, rules[curr]) 0735 { 0736 if(!vis[nx.second]) 0737 { 0738 vis[nx.second] =true; 0739 todo.push(nx.second); 0740 } 0741 } 0742 } 0743 return true; 0744 } 0745 int minLength() const 0746 { 0747 queue<size_t> _todo0, _todo1; 0748 queue<size_t> *todo0 = &_todo0, *todo1 = &_todo1; 0749 todo0->push(0); 0750 UsedBitArray vis(nstates); 0751 vis[0] = true; 0752 int length = 0; 0753 while(!todo0->empty()) 0754 { 0755 size_t curr = todo0->front(); 0756 todo0->pop(); 0757 if(curr >= accept) 0758 return length; 0759 foreach(const auto& nx, rules[curr]) 0760 { 0761 if(!vis[nx.second]) 0762 { 0763 vis[nx.second] = true; 0764 if(nx.first.epsilon()) 0765 todo0->push(nx.second); 0766 else 0767 todo1->push(nx.second); 0768 } 0769 } 0770 if(todo0->empty()) 0771 { 0772 ++length; 0773 swap(todo0, todo1); 0774 } 0775 } 0776 return -1; 0777 } 0778 private: 0779 bool unboundedCheck(size_t x, UsedBitArray& vis) const 0780 { 0781 vis[x] = true; 0782 foreach(const auto& nx, rules[x]) 0783 { 0784 if(vis[nx.second]) 0785 return true; 0786 else 0787 if(unboundedCheck(nx.second, vis)) 0788 return true; 0789 } 0790 vis[x] = false; 0791 return false; 0792 } 0793 public: 0794 bool isUnbounded() const 0795 { 0796 UsedBitArray vis(nstates); 0797 return unboundedCheck(0, vis); 0798 } 0799 int maxLength() const 0800 { 0801 // assumes that there are no circle unarrivable from start 0802 if(isUnbounded()) 0803 return -2; 0804 if(isEmpty()) 0805 return -1; 0806 queue<size_t> _todo0, _todo1; 0807 queue<size_t> *todo0 = &_todo0, *todo1 = &_todo1; 0808 vector<int> dist(nstates, -1); 0809 vector<vector<pair<CharSet, size_t> > > brules(nstates); 0810 for(size_t i = 0; i != nstates; ++i) 0811 { 0812 foreach(const auto& nx, rules[i]) 0813 { 0814 brules[nx.second].push_back(nx); 0815 brules[nx.second].back().second = i; 0816 } 0817 } 0818 for(size_t i = accept; i != nstates; ++i) 0819 { 0820 todo0->push(i); 0821 dist[i] = 0; 0822 } 0823 int length = 0; 0824 while(!todo0->empty()) 0825 { 0826 size_t curr = todo0->front(); 0827 todo0->pop(); 0828 foreach(const auto& nx, brules[curr]) 0829 { 0830 if(nx.first.epsilon()) 0831 { 0832 if(dist[nx.second] < length) 0833 { 0834 dist[nx.second] = length; 0835 todo0->push(nx.second); 0836 } 0837 } 0838 else 0839 { 0840 if(dist[nx.second] <= length) 0841 { 0842 dist[nx.second] = length + 1; 0843 todo1->push(nx.second); 0844 } 0845 } 0846 } 0847 if(todo0->empty()) 0848 { 0849 ++length; 0850 swap(todo0, todo1); 0851 } 0852 } 0853 return dist[0]; 0854 } 0855 NFA<CharSet>& negate() 0856 { 0857 DFA<CharSet> tmp = dfa(); 0858 tmp.rules.push_back(vector< pair<CharSet, size_t> >()); 0859 tmp.rules.back().push_back(make_pair(CharSet::fullSet(), tmp.nstates)); 0860 ++tmp.nstates; 0861 for(auto i = tmp.accept.begin(); i != tmp.accept.end(); ++i) 0862 { 0863 if(*i == 0) 0864 *i = 1; 0865 else 0866 *i = 0; 0867 } 0868 tmp.accept.push_back(1); 0869 for(auto i = tmp.rules.begin(); i != tmp.rules.end(); ++i) 0870 { 0871 CharSet successSet = CharSet::fullSet(); 0872 for(auto j = i->begin(); j != i->end(); ++j) 0873 { 0874 CharSet tmp = j->first; 0875 tmp.negate(); 0876 successSet.intersect(tmp); 0877 } 0878 if(!successSet.empty()) 0879 i->push_back(make_pair(successSet, tmp.nstates - 1)); 0880 } 0881 tmp.eliminateUnarrivableStates(); 0882 tmp.eliminateInactiveStates(); 0883 return *this = tmp.nfa(); 0884 } 0885 NFA<CharSet>& operator&=(const NFA<CharSet>& o) 0886 { 0887 NFA<CharSet> _o = o; 0888 _o.negate(); 0889 negate(); 0890 *this |= _o; 0891 negate(); 0892 return *this; 0893 } 0894 NFA<CharSet>& operator^=(const NFA<CharSet>& o) 0895 { 0896 negate(); 0897 return (*this |= o).negate(); 0898 } 0899 }; 0900 0901 template<typename CharSet> 0902 NFA< CharSet > DFA<CharSet>::nfa() const 0903 { 0904 NFA<CharSet> res; 0905 if(nstates == 0) 0906 return res; 0907 res.nstates = nstates; 0908 res.rules = rules; 0909 res.accept = nstates; 0910 for(size_t i = 0; i != nstates; ++i) 0911 { 0912 if(nstates + accept[i] > res.nstates) 0913 { 0914 res.nstates = nstates + accept[i]; 0915 res.rules.resize(res.nstates); 0916 } 0917 if(accept[i] != 0) 0918 res.rules[i].push_back(make_pair(CharSet(), nstates + accept[i] - 1)); 0919 } 0920 if(res.nstates == nstates) 0921 { 0922 ++res.nstates; 0923 res.rules.push_back(vector<pair<CharSet, size_t> >()); 0924 } 0925 return res; 0926 } 0927 0928 // notice: utf8/8bit and utf16/ucs2 do not require different NFAs and DFAs 0929 #define EACH_TYPE(macro) \ 0930 switch(GDFA::type) \ 0931 { \ 0932 case SAscii: { typedef SeqCharSet<Ascii> T; macro(s0); } break; \ 0933 case SLatin1: case SUtf8: { typedef SeqCharSet<Latin1> T; macro(s1); } break; \ 0934 case SUcs2: case SUtf16: { typedef SeqCharSet<Ucs2> T; macro(s2); } break; \ 0935 case SUcs4: { typedef SeqCharSet<Ucs4> T; macro(s3); } break; \ 0936 case TAscii: { typedef TableCharSet<Ascii> T; macro(t0); } break; \ 0937 case TLatin1: case TUtf8: { typedef TableCharSet<Latin1> T; macro(t1); } break; \ 0938 case TUcs2: case TUtf16: { typedef TableCharSet<Ucs2> T; macro(t2); } break; \ 0939 default: exit(-1); \ 0940 } 0941 0942 void GDFA::codegen(QTextStream& str) 0943 { 0944 #define macro(x) x->codegen(str); 0945 EACH_TYPE(macro) 0946 #undef macro 0947 } 0948 0949 GDFA& GDFA::minimize() 0950 { 0951 #define macro(x) x->minimize(); 0952 EACH_TYPE(macro) 0953 #undef macro 0954 return *this; 0955 } 0956 0957 void GDFA::setActions(const std::vector< QString >& actions) 0958 { 0959 #define macro(x) x->setActions(actions); 0960 EACH_TYPE(macro) 0961 #undef macro 0962 } 0963 0964 GDFA::GDFA() 0965 { 0966 #define macro(x) x = new DFA<T>; 0967 EACH_TYPE(macro) 0968 #undef macro 0969 } 0970 0971 GDFA::GDFA(const KDevPG::GDFA& o) 0972 { 0973 #define macro(x) x = new DFA<T>(*o.x); 0974 EACH_TYPE(macro) 0975 #undef macro 0976 } 0977 0978 GDFA::~GDFA() 0979 { 0980 #define macro(x) delete x; 0981 EACH_TYPE(macro) 0982 #undef macro 0983 } 0984 0985 GDFA& GDFA::operator=(const KDevPG::GDFA& o) 0986 { 0987 #define macro(x) *x = *o.x; 0988 EACH_TYPE(macro) 0989 #undef macro 0990 return *this; 0991 } 0992 0993 void GDFA::inspect() 0994 { 0995 #define DO_INSPECT(x) x->inspect(); 0996 EACH_TYPE(DO_INSPECT) 0997 #undef DO_INSPECT 0998 } 0999 1000 void GDFA::dotOutput(QTextStream& o, const QString& name) 1001 { 1002 #define DO_DOT(x) x->dotOutput(o, name); 1003 EACH_TYPE(DO_DOT) 1004 #undef DO_DOT 1005 } 1006 1007 GNFA GDFA::nfa() 1008 { 1009 GNFA res; 1010 #define DO_NFA(x) res.x = new NFA<T>(x->nfa()); 1011 EACH_TYPE(DO_NFA) 1012 #undef DO_NFA 1013 return res; 1014 } 1015 1016 GNFA::GNFA() 1017 { 1018 /* no initialization */ 1019 } 1020 1021 GNFA::GNFA(const KDevPG::GNFA& o) 1022 { 1023 #define macro(x) x = new NFA<T>(*o.x); 1024 EACH_TYPE(macro) 1025 #undef macro 1026 } 1027 1028 GNFA::GNFA(const std::vector< GNFA* >& init) 1029 { 1030 #define macro(x) vector< NFA<T> > vec(init.size()); for(size_t i = 0; i != init.size(); ++i) vec[i] = *(init[i]->x); x = new NFA<T>(vec); 1031 EACH_TYPE(macro) 1032 #undef macro 1033 } 1034 1035 GNFA::~GNFA() 1036 { 1037 #define macro(x) delete x; 1038 EACH_TYPE(macro) 1039 #undef macro 1040 } 1041 1042 GNFA& GNFA::operator=(const KDevPG::GNFA& o) 1043 { 1044 #define macro(x) *x = *o.x; 1045 EACH_TYPE(macro) 1046 #undef macro 1047 return *this; 1048 } 1049 1050 bool GNFA::acceptsEpsilon() const 1051 { 1052 #define DO_AE(x) return x->acceptsEpsilon(); 1053 EACH_TYPE(DO_AE) 1054 #undef DO_AE 1055 } 1056 1057 bool GNFA::isEmpty() const 1058 { 1059 #define DO_IE(x) return x->isEmpty(); 1060 EACH_TYPE(DO_IE) 1061 #undef DO_IE 1062 } 1063 1064 bool GNFA::isUnbounded() const 1065 { 1066 #define DO_IU(x) return x->isUnbounded(); 1067 EACH_TYPE(DO_IU) 1068 #undef DO_IU 1069 } 1070 1071 int GNFA::minLength() const 1072 { 1073 #define DO_ML(x) return x->minLength(); 1074 EACH_TYPE(DO_ML) 1075 #undef DO_ML 1076 } 1077 1078 int GNFA::maxLength() const 1079 { 1080 #define DO_ML(x) return x->maxLength(); 1081 EACH_TYPE(DO_ML) 1082 #undef DO_ML 1083 } 1084 1085 GNFA& GNFA::operator<<=(const KDevPG::GNFA& o) 1086 { 1087 #define DO_SHIFT(x) *x <<= *o.x; 1088 EACH_TYPE(DO_SHIFT) 1089 #undef DO_SHIFT 1090 return *this; 1091 } 1092 1093 GNFA& GNFA::operator|=(const KDevPG::GNFA& o) 1094 { 1095 #define DO_OR(x) *x |= *o.x; 1096 EACH_TYPE(DO_OR) 1097 #undef DO_OR 1098 return *this; 1099 } 1100 1101 GNFA& GNFA::operator&=(const KDevPG::GNFA& o) 1102 { 1103 #define DO_AND(x) *x &= *o.x; 1104 EACH_TYPE(DO_AND) 1105 #undef DO_AND 1106 return *this; 1107 } 1108 1109 GNFA& GNFA::operator^=(const KDevPG::GNFA& o) 1110 { 1111 #define DO_BUT(x) *x ^= *o.x; 1112 EACH_TYPE(DO_BUT) 1113 #undef DO_BUT 1114 return *this; 1115 } 1116 1117 GNFA& GNFA::operator*() 1118 { 1119 #define DO_STAR(x) **x; 1120 EACH_TYPE(DO_STAR) 1121 #undef DO_STAR 1122 return *this; 1123 } 1124 1125 GNFA& GNFA::negate() 1126 { 1127 #define DO_NEGATE(x) x->negate(); 1128 EACH_TYPE(DO_NEGATE) 1129 #undef DO_NEGATE 1130 return *this; 1131 } 1132 1133 GNFA& GNFA::minimize() 1134 { 1135 *this = dfa().minimize().nfa(); 1136 return *this; 1137 } 1138 1139 void GNFA::inspect() 1140 { 1141 #define DO_INSPECT(x) x->inspect(); 1142 EACH_TYPE(DO_INSPECT) 1143 #undef DO_INSPECT 1144 } 1145 1146 void GNFA::dotOutput(QTextStream& o, const QString& name) 1147 { 1148 #define DO_DOT(x) x->dotOutput(o, name); 1149 EACH_TYPE(DO_DOT) 1150 #undef DO_DOT 1151 } 1152 1153 GDFA GNFA::dfa() 1154 { 1155 GDFA res; 1156 #define macro(x) *res.x = x->dfa(); 1157 EACH_TYPE(macro) 1158 #undef macro 1159 return res; 1160 } 1161 1162 GNFA GNFA::empty() 1163 { 1164 GNFA res; 1165 #define macro(x) res.x = new NFA<T>; 1166 EACH_TYPE(macro) 1167 #undef macro 1168 return res; 1169 } 1170 1171 GNFA GNFA::word(const QString& str) 1172 { 1173 #define macro(x) \ 1174 GNFA res = GNFA::emptyWord(); \ 1175 QByteArray qba(str.toUtf8()); \ 1176 Codec2FromUtf8Iterator<T::codec>::Result iter(qba); \ 1177 while(iter.hasNext()) \ 1178 { \ 1179 *res.x <<= T(iter.next()); \ 1180 } \ 1181 return res; 1182 EACH_TYPE(macro) 1183 #undef macro 1184 } 1185 1186 GNFA GNFA::collection(const QString& str) 1187 { 1188 GNFA res = GNFA::empty(); 1189 QUtf16ToUcs4Iterator iter(str); 1190 while(iter.hasNext()) 1191 { 1192 quint32 next = iter.next(); 1193 res |= range(next, next+1); 1194 } 1195 return res; 1196 } 1197 1198 GNFA GNFA::anyChar() 1199 { 1200 GNFA res; 1201 #define macro(x) res.x = new NFA<T>(T::fullSet()); 1202 EACH_TYPE(macro) 1203 #undef macro 1204 // remove the surrogate range 1205 if(/*GDFA::type == TUcs2 || */GDFA::type == TUtf16) 1206 { 1207 *res.t2 ^= NFA<TableCharSet<Ucs2> >(TableCharSet<Ucs2>::range(0xd800, 0xe000)); 1208 } 1209 else if(/*GDFA::type == SUcs2 || */GDFA::type == SUtf16) 1210 { 1211 *res.s2 ^= NFA<SeqCharSet<Ucs2> >(SeqCharSet<Ucs2>::range(0xd800, 0xe000)); 1212 } 1213 // add surrogate pairs 1214 if(GDFA::type == SUtf16) 1215 { 1216 NFA<SeqCharSet<Ucs2> > surrpairs(SeqCharSet<Ucs2>::range(0xd800, 0xdc00)); 1217 surrpairs <<= NFA<SeqCharSet<Ucs2> >(SeqCharSet<Ucs2>::range(0xdc00, 0xe000)); 1218 } 1219 if(GDFA::type == TUtf16) 1220 { 1221 NFA<TableCharSet<Ucs2> > surrpairs(TableCharSet<Ucs2>::range(0xd800, 0xdc00)); 1222 surrpairs <<= NFA<TableCharSet<Ucs2> >(TableCharSet<Ucs2>::range(0xdc00, 0xe000)); 1223 } 1224 // all valid utf-8 values 1225 if(GDFA::type == SUtf8) 1226 { 1227 NFA<SeqCharSet<Latin1> > lowerBytes(SeqCharSet<Latin1>::range(0x80, 0xc0)); 1228 NFA<SeqCharSet<Latin1> > ascii(SeqCharSet<Latin1>::range(0, 0x7f)); 1229 NFA<SeqCharSet<Latin1> > topOf2(SeqCharSet<Latin1>::range(0xc0, 0xe0)); 1230 NFA<SeqCharSet<Latin1> > topOf3(SeqCharSet<Latin1>::range(0xe0, 0xf0)); 1231 NFA<SeqCharSet<Latin1> > topOf4(SeqCharSet<Latin1>::range(0xf0, 0x100)); 1232 *res.s1 = (ascii |= (topOf2 <<= lowerBytes) |= ((topOf3 <<= lowerBytes) <<= lowerBytes) |= (((topOf4 <<= lowerBytes) <<= lowerBytes) <<= lowerBytes)); 1233 } 1234 if(GDFA::type == TUtf8) 1235 { 1236 NFA<TableCharSet<Latin1> > lowerBytes(TableCharSet<Latin1>::range(0x80, 0xc0)); 1237 NFA<TableCharSet<Latin1> > ascii(TableCharSet<Latin1>::range(0, 0x7f)); 1238 NFA<TableCharSet<Latin1> > topOf2(TableCharSet<Latin1>::range(0xc0, 0xe0)); 1239 NFA<TableCharSet<Latin1> > topOf3(TableCharSet<Latin1>::range(0xe0, 0xf0)); 1240 NFA<TableCharSet<Latin1> > topOf4(TableCharSet<Latin1>::range(0xf0, 0x100)); 1241 *res.t1 = (ascii |= (topOf2 <<= lowerBytes) |= ((topOf3 <<= lowerBytes) <<= lowerBytes) |= (((topOf4 <<= lowerBytes) <<= lowerBytes) <<= lowerBytes)); 1242 } 1243 return res; 1244 } 1245 1246 GNFA GNFA::anything() 1247 { 1248 GNFA any = anyChar(); 1249 return *any; 1250 } 1251 1252 GNFA GNFA::emptyWord() 1253 { 1254 GNFA res; 1255 #define macro(x) res.x = new NFA<T>(NFA<T>::emptyWord()); 1256 EACH_TYPE(macro) 1257 #undef macro 1258 return res; 1259 } 1260 1261 #include "generated-kdev-utf8-tuples.h" 1262 1263 template<template<CharEncoding> class CharSet> 1264 void addUtf8Range(NFA<CharSet<Latin1> >& res, quint32 begin, quint32 end) 1265 { 1266 typedef typename Codec2Int<Latin1>::Result Int; 1267 Int from0, from1, from2, from3, to0, to1, to2, to3; 1268 if(begin < 0x80) 1269 { 1270 if(end <= 0x80) 1271 { 1272 res |= getUtf8Tuples<CharSet>(Int(begin), Int(end)); 1273 return; 1274 } 1275 res |= getUtf8Tuples<CharSet>(Int(begin), Int(0x80)); 1276 begin = 0x80; 1277 } 1278 if(begin < 0x800) 1279 { 1280 from1 = (begin & 0x3f) + 0x80; 1281 from0 = (begin >> 6) + 0xc0; 1282 Int to0, to1; 1283 if(end <= 0x800) 1284 { 1285 to1 = (end & 0x3f) + 0x80; 1286 to0 = (end >> 6) + 0xc0; 1287 } 1288 else 1289 { 1290 to1 = 0x80 + (1<<6); 1291 to0 = 0xc0 + (1<<5); 1292 } 1293 res |= getUtf8Tuples<CharSet>(from0, to0, from1, to1); 1294 if(end <= 0x800) 1295 return; 1296 begin = 0x800; 1297 } 1298 if(begin < 0x10000) 1299 { 1300 from2 = (begin & 0x3f) + 0x80; 1301 from1 = ((begin >> 6) & 0x3f) + 0x80; 1302 from0 = (begin >> 12) + 0xe0; 1303 if(end <= 0x10000) 1304 { 1305 to2 = (end & 0x3f) + 0x80; 1306 to1 = ((end >> 6) & 0x3f) + 0x80; 1307 to0 = (end >> 12) + 0xe0; 1308 } 1309 else 1310 { 1311 to2 = to1 = 0x80 + (1<<6); 1312 to0 = 0xe0 + (1 << 4); 1313 } 1314 res |= getUtf8Tuples<CharSet>(from0, to0, from1, to1, from2, to2); 1315 if(end <= 0x10000) 1316 return; 1317 begin = 0x10000; 1318 } 1319 from3 = (begin & 0x3f) + 0x80; 1320 from2 = ((begin >> 6) & 0x3f) + 0x80; 1321 from1 = ((begin >> 12) & 0x3f) + 0x80; 1322 from0 = (begin >> 18) + 0xf0; 1323 to3 = (end & 0x3f) + 0x80; 1324 to2 = ((end >> 6) & 0x3f) + 0x80; 1325 to1 = ((end >> 12) & 0x3f) + 0x80; 1326 to0 = (end >> 18) + 0xf0; 1327 res |= getUtf8Tuples<CharSet>(from0, to0, from1, to1, from2, to2, from3, to3); 1328 } 1329 1330 GNFA GNFA::range(quint32 begin, quint32 end) 1331 { 1332 // Utf32: take it 1333 // Latin1, Ascii, Ucs2: cut it 1334 // Utf8, Utf32: complicate 1335 GNFA res; 1336 switch(GDFA::type) 1337 { 1338 case SAscii: { res.s0 = new NFA<SeqCharSet<Ascii> >(begin >= 0x80 ? SeqCharSet<Ascii>::emptySet() : SeqCharSet<Ascii>::range(begin, min((quint32)0x80, end))); } break; 1339 case SLatin1: { res.s1 = new NFA<SeqCharSet<Latin1> >(begin >= 0x100 ? SeqCharSet<Latin1>::emptySet() : SeqCharSet<Latin1>::range(begin, min((quint32)0x100, end))); } break; 1340 case SUtf8: { 1341 res.s1 = new NFA<SeqCharSet<Latin1> >(); 1342 addUtf8Range(*res.s1, begin, end); 1343 } break; 1344 case SUcs2: { 1345 res.s2 = new NFA<SeqCharSet<Ucs2> >(begin >= 0x10000 ? SeqCharSet<Ucs2>::emptySet() : SeqCharSet<Ucs2>::range(begin, min((quint32)0x10000, end))/*.difference(SeqCharSet<Ucs2>::range(0xd800, 0xe000))*/); 1346 } break; 1347 case SUtf16: { 1348 #define UTF16_IMPL(CS, field) \ 1349 if(end >= 0x110000 || begin >= 0x110000) \ 1350 { \ 1351 res.field = new NFA<CS<Ucs2> >(); \ 1352 } \ 1353 else if(end < 0x10000) \ 1354 { \ 1355 if(begin <= 0xd800) \ 1356 { \ 1357 if(end <= begin) \ 1358 { \ 1359 res.field = new NFA<CS<Ucs2> >(); \ 1360 } \ 1361 else if(end > 0xe000) \ 1362 { \ 1363 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(begin, 0xd800).getUnion(CS<Ucs2>::range(0xe000, end))); \ 1364 } \ 1365 else \ 1366 { \ 1367 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(begin, min(end, (quint32)0xd800))); \ 1368 }\ 1369 }\ 1370 else\ 1371 {\ 1372 auto mbegin = max(begin, (quint32)0xe000);\ 1373 if(end <= mbegin)\ 1374 {\ 1375 res.field = new NFA<CS<Ucs2> >();\ 1376 }\ 1377 else\ 1378 {\ 1379 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(mbegin, end));\ 1380 }\ 1381 }\ 1382 }\ 1383 else if(begin >= 0x10000)\ 1384 {\ 1385 if(end <= begin)\ 1386 {\ 1387 res.field = new NFA<CS<Ucs2> >();\ 1388 }\ 1389 else\ 1390 {\ 1391 quint32 startHighSurrogate = (((begin - 0x10000) & 0xffc00) >> 10) + 0xd800;\ 1392 quint32 startLowSurrogate = ((begin - 0x10000) & 0x3ff) + 0xdc00;\ 1393 quint32 endHighSurrogate = (((end - 0x10000) & 0xffc00) >> 10) + 0xd800;\ 1394 quint32 endLowSurrogate = ((end - 0x10000) & 0x3ff) + 0xdc00;\ 1395 if(endLowSurrogate == 0xdc00)\ 1396 {\ 1397 if(startLowSurrogate == 0xdc00)\ 1398 {\ 1399 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\ 1400 *res.field <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(0xdc00, 0xe000));\ 1401 }\ 1402 else\ 1403 {\ 1404 auto firstPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, startHighSurrogate + 1));\ 1405 firstPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\ 1406 res.field = new NFA<CS<Ucs2> >(firstPairs);\ 1407 if(startHighSurrogate < endHighSurrogate)\ 1408 {\ 1409 auto surrPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate + 1, endHighSurrogate + 1));\ 1410 surrPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(0xdc00, 0xe000));\ 1411 *res.field |= surrPairs;\ 1412 }\ 1413 }\ 1414 }\ 1415 else\ 1416 {\ 1417 if(startLowSurrogate == 0xdc00)\ 1418 {\ 1419 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\ 1420 }\ 1421 else\ 1422 {\ 1423 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, startHighSurrogate + 1));\ 1424 *res.field <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, (startHighSurrogate == endHighSurrogate ? endLowSurrogate : 0xe000)));\ 1425 if(startHighSurrogate != endHighSurrogate)\ 1426 {\ 1427 if(startHighSurrogate + 1 != endHighSurrogate)\ 1428 {\ 1429 auto midPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate + 1, endHighSurrogate));\ 1430 midPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\ 1431 *res.field |= midPairs;\ 1432 }\ 1433 auto lastPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(endHighSurrogate, endHighSurrogate + 1));\ 1434 lastPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, endLowSurrogate));\ 1435 *res.field |= lastPairs;\ 1436 }\ 1437 }\ 1438 }\ 1439 }\ 1440 }\ 1441 else\ 1442 {\ 1443 if(end <= begin)\ 1444 {\ 1445 res.field = new NFA<CS<Ucs2> >();\ 1446 }\ 1447 else\ 1448 {\ 1449 if(begin < 0xd800)\ 1450 {\ 1451 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(begin, 0xd800).getUnion(CS<Ucs2>::range(0xe000, 0x10000)));\ 1452 }\ 1453 else\ 1454 {\ 1455 res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(max(begin, 0xe000u), 0x10000));\ 1456 }\ 1457 quint32 startHighSurrogate = 0xd800;\ 1458 quint32 startLowSurrogate = 0xdc00;\ 1459 quint32 endHighSurrogate = (((end - 0x10000) & 0xffc00) >> 10) + 0xd800;\ 1460 quint32 endLowSurrogate = ((end - 0x10000) & 0x3ff) + 0xdc00;\ 1461 if(endLowSurrogate == 0xdc00)\ 1462 {\ 1463 auto surrPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\ 1464 surrPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\ 1465 *res.field |= surrPairs;\ 1466 }\ 1467 else\ 1468 {\ 1469 if(endHighSurrogate > startHighSurrogate)\ 1470 {\ 1471 auto midPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\ 1472 midPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\ 1473 *res.field |= midPairs;\ 1474 }\ 1475 auto lastPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(endHighSurrogate, endHighSurrogate + 1));\ 1476 lastPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, endLowSurrogate));\ 1477 *res.field |= lastPairs;\ 1478 }\ 1479 }\ 1480 } 1481 UTF16_IMPL(SeqCharSet, s2) 1482 } break; 1483 case SUcs4: { res.s3 = new NFA<SeqCharSet<Ucs4> >(SeqCharSet<Ucs4>::range(begin, end)); } break; 1484 case TAscii: { res.t0 = new NFA<TableCharSet<Ascii> >(begin >= 0x80 ? TableCharSet<Ascii>::emptySet() : TableCharSet<Ascii>::range(begin, min((quint32)0x80, end))); } break; 1485 case TLatin1: { res.t1 = new NFA<TableCharSet<Latin1> >(begin >= 0x100 ? TableCharSet<Latin1>::emptySet() : TableCharSet<Latin1>::range(begin, min((quint32)0x100, end))); } break; 1486 case TUtf8: { 1487 res.t1 = new NFA<TableCharSet<Latin1> >(); 1488 addUtf8Range(*res.t1, begin, end); 1489 } break; 1490 case TUcs2: { 1491 res.t2 = new NFA<TableCharSet<Ucs2> >(begin >= 0x10000 ? TableCharSet<Ucs2>::emptySet() : TableCharSet<Ucs2>::range(begin, min((quint32)0x10000, end))/*.difference(TableCharSet<Ucs2>::range(0xd800, 0xe000))*/); 1492 } 1493 case TUtf16: { 1494 UTF16_IMPL(TableCharSet, t2) 1495 #undef UTF16_IMPL 1496 } break; 1497 default: exit(-1); 1498 } 1499 // TODO: warning if empty 1500 return res; 1501 } 1502 1503 GNFA GNFA::character(quint32 codepoint) 1504 { 1505 return range(codepoint, codepoint+1); 1506 } 1507 1508 void deleteNFA(GNFA *ptr) 1509 { 1510 delete ptr; 1511 } 1512 1513 void deleteDFA(GDFA *ptr) 1514 { 1515 delete ptr; 1516 } 1517 1518 AutomatonType GDFA::type = SUcs2; 1519 1520 template<CharEncoding enc> 1521 const typename TableCharSet<enc>::BitArray TableCharSet<enc>::nullData = bitset<TableCharSet<enc>::nChars>(); 1522 1523 }