kdevelop-pg-qt/kdev-pg/kdev-pg-regexp.cpp

0001 /* This file is part of kdev-pg-qt
0002    Copyright (C) 2010 Jonathan Schmidt-Dominé <devel@the-user.org>
0003
0004    This library is free software; you can redistribute it and/or
0005    modify it under the terms of the GNU Library General Public
0006    License as published by the Free Software Foundation; either
0007    version 2 of the License, or (at your option) any later version.
0008
0009    This library is distributed in the hope that it will be useful,
0010    but WITHOUT ANY WARRANTY; without even the implied warranty of
0011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0012    Library General Public License for more details.
0013
0014    You should have received a copy of the GNU Library General Public License
0015    along with this library; see the file COPYING.LIB.  If not, write to
0016    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0017    Boston, MA 02110-1301, USA.
0018 */
0019
0020 #include "kdev-pg-char-sets.h"
0021 #include "kdev-pg-regexp.h"
0022 #include "kdev-pg.h"
0023 #include "kdev-pg-regexp-helper.h"
0024 #include "kdev-pg-bit-array.h"
0025 #include <iostream>
0026 #include <queue>
0027 #include <stack>
0028 #include <QHash>
0029 #include <unordered_set>
0030 #include <unordered_map>
0031
0032 using namespace std;
0033
0034 // TODO: Lookahead operator
0035 // TODO: Line-End etc.
0036 // TODO: Deactivate/activate rules (have to keep the bitsets of possible final states)
0037
0038
0039 #define q_Hash_to_tr1_hash(type) \
0040 namespace std                                               \
0041 {                                                           \
0042     template<> struct hash< type >                          \
0043     {                                                       \
0044       inline size_t operator()(const type &x) const         \
0045       {                                                     \
0046         return qHash(x);                                    \
0047       }                                                     \
0048     };                                                      \
0049 }
0050
0051 #if (QT_VERSION < QT_VERSION_CHECK(5, 14, 0))
0052 // Qt >= 5.14 already has std::hash for QBitArray
0053 q_Hash_to_tr1_hash(QBitArray)
0054 #endif
0055
0056 namespace KDevPG
0057 {
0058
0059 typedef BitArray UsedBitArray;
0060 typedef QUtf8ToUcs4Iterator Iterator;
0061 // typedef TableCharSet<Ascii> CharSet;
0062
0063 template<typename CharSet> class NFA;
0064
0065 inline QString codeForDot(QString str)
0066 {
0067   QString out = "";
0068   int pos = 0;
0069   forever
0070   {
0071     int npos = str.indexOf("\n\01!ASIgnore\"!!\n# ", pos);
0072     if(npos == -1)
0073     {
0074       out += QStringView(str).mid(pos);
0075       break;
0076     }
0077     out += QStringView(str).mid(pos, npos - pos);
0078     int nlpos = str.indexOf('\n', npos + 17);
0079     int codeendpos = str.indexOf("\n\01!AS/Ignore\"!!\n", nlpos);
0080     if(nlpos == -1 || codeendpos == -1)
0081     {
0082       out += "<junk>";
0083       break;
0084     }
0085     out += QStringView(str).mid(nlpos + 1, codeendpos - nlpos - 1);
0086     pos = codeendpos + 17;
0087   }
0088   return out.replace('\"', "\\\"").replace('\n', '\t').trimmed();
0089 }
0090
0091 /**
0092  * Deterministic finite automaton.
0093  */
0094 template<typename CS>
0095 class DFA
0096 {
0097 public:
0098     typedef CS CharSet;
0099 private:
0100     size_t nstates;
0101     vector<vector<pair<CharSet, size_t> > > rules;
0102     size_t numActions;
0103     vector<size_t> accept;
0104     vector<QString> actions;
0105     friend class NFA<CharSet>;
0106 public:
0107     /// Code used for the detected tokens in the generated code
0108     void setActions(const vector<QString>& _actions)
0109     {
0110       actions = _actions;
0111       numActions = actions.size() - 1;
0112     }
0113     /// Generate code for transitions
0114     void codegen(QTextStream& str)
0115     {
0116       str << "goto _state_0; // no warning about unused label\n";
0117       CharSetCondition<CharSet> csc;
0118       for(size_t i = 0; i != nstates; ++i)
0119       {
0120         str << "\n_state_" + QString::number(i) + ":\n";
0121         if(accept[i])
0122           str << "lpos = lxCURR_POS; lstate = " << accept[i] << ";\n";
0123         str << "lxNEXT_CHR(chr); ";
0124         csc(str, rules[i]);
0125       }
0126       str << "_end:\nplain() = lpos;\nswitch(lstate) {\n";
0127       for(size_t i = 0; i <= numActions; ++i)
0128       {
0129         str << "case " << QString::number(i) << ": ";
0130         if(i == 0)
0131           str << "goto _fail; // no warning about unused label\n_fail: ";
0132         str << "{" << actions[i] << "\nlxFINISH\n}\n";
0133       }
0134       str << "}\n";
0135     }
0136     /// .dot-output
0137     void dotOutput(QTextStream& out, const QString& name)
0138     {
0139       out << "digraph " << name << "{" << Qt::endl;
0140       for(size_t i = 0; i != nstates; ++i)
0141       {
0142         out << "s" << i << " [ label = \"" << i << "\"";
0143         if(i == 0)
0144           out << ", shape=rect, style=rounded";
0145         else
0146           out << ", shape=oval";
0147         if(accept[i] != 0)
0148           out << ", penwidth=4";
0149         out << " ];" << Qt::endl;
0150       }
0151       for(size_t i = 1; i <= numActions; ++i)
0152       {
0153         out << "f" << i << " [ label = \"" << codeForDot(actions[i]) << "\", shape=rect, penwidth=2 ];" << Qt::endl;
0154       }
0155       for(size_t i = 0; i != nstates; ++i)
0156       {
0157         for(auto j = rules[i].begin(); j != rules[i].end(); ++j)
0158         {
0159           out << "s" << i << " -> " << "s" << j->second << " [ label = \"" << j->first << "\" ];" << Qt::endl;
0160         }
0161         if(accept[i] != 0)
0162         {
0163           out << "s" << i << " -> " << "f" << accept[i] << ";" << Qt::endl;
0164         }
0165       }
0166       out << "}" << Qt::endl;
0167     }
0168     /// Debugging output
0169     void inspect()
0170     {
0171         cout << "#states = " << nstates << " accept = <";
0172         for(size_t i = 0; i != nstates; ++i)
0173             cout << ", " << accept[i];
0174         cout << "> rules = {";
0175         for(size_t i = 0; i != nstates; ++i)
0176         {
0177             cout << i << " = {";
0178             for(size_t j = 0; j != rules[i].size(); ++j)
0179                 cout << rules[i][j].first << " = " << rules[i][j].second << ", ";
0180             cout << "} ";
0181         }
0182         cout << "}" << endl;
0183     }
0184     /// Test if the string matches the regexp
0185     size_t accepts(const QString& str)
0186     {
0187         size_t state = 0;
0188         foreach(const QChar c, str)
0189         {
0190             for(const auto& r : rules[state])
0191             {
0192                 if(r.first.accepts(c.unicode())) // make it more portable!!
0193                 {
0194                     state = r.second;
0195                     goto success;
0196                 }
0197             }
0198             return false;
0199             success:;
0200         }
0201         return accept[state];
0202     }
0203     /// Can be used to test tokenization
0204     pair<Iterator, size_t> nextAction(Iterator iter)
0205     {
0206       Iterator npos = iter;
0207       size_t action = -1;
0208       size_t state = 0;
0209       while(iter.hasNext())
0210       {
0211         if(accept[state])
0212         {
0213           npos = iter;
0214           action = accept[state];
0215         }
0216         quint64 chr = iter.next(); // make it more portable!!
0217         for(const auto& r : rules[state])
0218         {
0219           if(r.first.accepts(chr))
0220           {
0221             state = r.second;
0222             goto success;
0223           }
0224         }
0225         return make_pair(npos, action);
0226         success:;
0227       }
0228       if(accept[state])
0229       {
0230         npos = iter;
0231         action = accept[state];
0232       }
0233       return make_pair(npos, action);
0234     }
0235     /// Remove each state x iff ¬active[x]
0236     DFA<CharSet>& filterStates(const UsedBitArray& active)
0237     {
0238         size_t curr = 0;
0239         vector<size_t> mapping(nstates);
0240         for(size_t i = 0; i != nstates; ++i)
0241         {
0242             if(active[i])
0243             {
0244                 rules[curr] = rules[i];
0245                 accept[curr] = accept[i];
0246                 mapping[i] = curr;
0247                 ++curr;
0248             }
0249         }
0250         rules.resize(curr);
0251         accept.resize(curr);
0252         nstates = curr;
0253         for(size_t i = 0; i != nstates; ++i)
0254         {
0255             auto orules = rules[i];
0256             rules[i].clear();
0257             for(auto j = orules.begin(); j != orules.end(); ++j)
0258             {
0259                 if(active[j->second])
0260                   rules[i].push_back(make_pair(j->first, mapping[j->second]));
0261             }
0262         }
0263         return *this;
0264     }
0265     /// Eliminate each state unarrivable from the start
0266     DFA<CharSet>& eliminateUnarrivableStates()
0267     {
0268         UsedBitArray arrivable(nstates);
0269         arrivable[0] = true;
0270         stack<size_t> todo;
0271         todo.push(0);
0272         while(todo.size())
0273         {
0274             size_t curr = todo.top();
0275             todo.pop();
0276             for(const auto& i : rules[curr])
0277             {
0278                 if(!arrivable[i.second])
0279                 {
0280                     arrivable[i.second] = true;
0281                     todo.push(i.second);
0282                 }
0283             }
0284         }
0285         return filterStates(arrivable);
0286     }
0287     /// Eliminate each state iff no final state is  arivable from it
0288     DFA<CharSet>& eliminateInactiveStates()
0289     {
0290       UsedBitArray active(nstates);
0291       for(size_t i = 0; i != nstates; ++i)
0292         if(accept[i])
0293           active[i] = true;
0294       bool changed;
0295       do
0296       {
0297         changed = false;
0298         for(size_t i = 0; i != nstates; ++i)
0299         {
0300           if(!active[i])
0301           {
0302             for(size_t j = 0; j != rules[i].size(); ++j)
0303             {
0304               if(active[rules[i][j].second])
0305               {
0306                 active[i] = true;
0307                 changed = true;
0308                 break;
0309               }
0310             }
0311           }
0312         }
0313       } while(changed);
0314       return filterStates(active);
0315     }
0316     /// followers in same group for every input
0317     bool sameGroup(size_t ngroups, const vector<size_t>& ogroups, size_t x, size_t y)
0318     {
0319         vector<CharSet> xsets(ngroups), ysets(ngroups);
0320         for(const auto& i : rules[x])
0321             xsets[ogroups[i.second]].unite(i.first);
0322         for(const auto& i : rules[y])
0323             ysets[ogroups[i.second]].unite(i.first);
0324         return xsets == ysets;
0325     }
0326     DFA<CharSet>& minimize()
0327     {
0328         eliminateUnarrivableStates();
0329         eliminateInactiveStates();
0330         vector<vector<size_t> > grinv(numActions + 1);
0331         for(size_t i = 0; i != nstates; ++i)
0332             grinv[(accept[i] - accept[0] + grinv.size()) % grinv.size()].push_back(i);
0333         size_t ongroups;
0334         vector<size_t> groups(nstates);
0335         do
0336         {
0337             for(size_t i = 0; i != grinv.size(); ++i)
0338                 for(size_t j = 0; j != grinv[i].size(); ++j)
0339                     groups[grinv[i][j]] = i;
0340             ongroups = grinv.size();
0341             grinv.clear();
0342             for(size_t i = 0; i != nstates; ++i)
0343             {
0344                 for(size_t j = 0; j != grinv.size(); ++j)
0345                 {
0346                     if(groups[grinv[j][0]] == groups[i] && sameGroup(ongroups, groups, grinv[j][0], i))
0347                     {
0348                         grinv[j].push_back(i);
0349                         goto found;
0350                     }
0351                 }
0352                 grinv.resize(grinv.size()+1);
0353                 grinv.back().push_back(i);
0354                 found:;
0355             }
0356         }
0357         while(ongroups != grinv.size());
0358         vector<vector<pair<CharSet, size_t> > > _rules(grinv.size());
0359         vector<size_t> _accept(grinv.size());
0360         for(size_t i = 0; i != nstates; ++i)
0361         {
0362             _accept[groups[i]] = accept[i];
0363             for(const auto& j : rules[i])
0364                 _rules[groups[i]].push_back(make_pair(j.first, groups[j.second]));
0365         }
0366         rules = _rules;
0367         nstates = grinv.size();
0368         accept = _accept;
0369         for(size_t i = 0; i != nstates; ++i)
0370         {
0371             vector<CharSet> mapping(nstates, CharSet::emptySet());
0372             for(const auto& j : rules[i])
0373                 mapping[j.second].unite(j.first);
0374             rules[i].clear();
0375             for(size_t j = 0; j != nstates; ++j)
0376                 if(!mapping[j].empty())
0377                     rules[i].push_back(make_pair(mapping[j], j));
0378         }
0379         eliminateUnarrivableStates();
0380         eliminateInactiveStates();
0381         return *this;
0382     }
0383     NFA<CharSet> nfa() const;
0384 };
0385
0386 template<typename CS>
0387 class NFA
0388 {
0389 public:
0390     typedef CS CharSet;
0391 private:
0392     size_t nstates;
0393     vector<vector<pair<CharSet, size_t> > > rules;
0394     size_t accept;
0395     friend class DFA<CharSet>;
0396 public:
0397     void inspect()
0398     {
0399         cout << "#states = " << nstates << " accept = " << accept << " rules = {";
0400         for(size_t i = 0; i != nstates; ++i)
0401         {
0402             cout << i << " = {";
0403             for(size_t j = 0; j != rules[i].size(); ++j)
0404                 cout << rules[i][j].first << " = " << rules[i][j].second << ", ";
0405             cout << "} ";
0406         }
0407         cout << "}" << endl;
0408     }
0409     void dotOutput(QTextStream& out, const QString& name)
0410     {
0411       out << "digraph " << name << "{" << Qt::endl;
0412       for(size_t i = 0; i != nstates; ++i)
0413       {
0414         out << "s" << i << " [ label = \"" << i << "\"";
0415         if(i == 0)
0416           out << ", shape=rect, style=rounded";
0417         else
0418           out << ", shape=oval";
0419         if(i >= accept)
0420           out << ", penwidth=4";
0421         out << " ];" << Qt::endl;
0422       }
0423       for(size_t i = 0; i != nstates; ++i)
0424       {
0425         for(auto j = rules[i].begin(); j != rules[i].end(); ++j)
0426         {
0427           out << "s" << i << " -> " << "s" << j->second << " [ label = \"" << j->first << "\" ];" << Qt::endl;
0428         }
0429       }
0430       out << "}" << Qt::endl;
0431     }
0432     /**
0433      * Accepts no words.
0434      */
0435     NFA() : nstates(2), rules(2), accept(1)
0436     {}
0437     static NFA<CharSet> emptyWord()
0438     {
0439       NFA<CharSet> res;
0440       res.nstates = 1;
0441       res.rules.resize(1);
0442       res.accept = 0;
0443       return res;
0444     }
0445     NFA(const NFA<CharSet>& o) : nstates(o.nstates), rules(o.rules), accept(o.accept)
0446     {}
0447     NFA(const CharSet& set) : nstates(2), rules(2), accept(1)
0448     {
0449         rules[0].push_back(make_pair(set, 1));
0450     }
0451     explicit NFA(const std::vector<NFA<CharSet> >& list)
0452     {
0453       if(list.size() == 0)
0454       {
0455         nstates = 2;
0456         rules.resize(2);
0457         accept = 1;
0458       }
0459       else
0460       {
0461         accept = 1;
0462         for(size_t i = 0; i != list.size(); ++i)
0463           accept += list.begin()[i].nstates;
0464         nstates = accept + list.size();
0465         rules.resize(nstates);
0466         size_t start = 1;
0467         for(size_t i = 0; i != list.size(); ++i)
0468         {
0469           for(size_t j = start; j != start + list.begin()[i].nstates; ++j)
0470           {
0471               rules[j] = list.begin()[i].rules[j - start];
0472               for(size_t k = 0; k != rules[j].size(); ++k)
0473                   rules[j][k].second += start;
0474           }
0475           rules[0].push_back(make_pair(CharSet(), start));
0476           start += list.begin()[i].nstates;
0477           rules[start - 1].push_back(make_pair(CharSet(), nstates - list.size() + i));
0478         }
0479       }
0480     }
0481     void adjust(size_t startnum)
0482     {
0483         accept += startnum;
0484         nstates += startnum;
0485         rules.resize(nstates);
0486         for(size_t i = nstates; i != startnum; )
0487         {
0488             --i;
0489             rules[i] = rules[i - startnum];
0490             for(size_t j = 0; j != rules[i].size(); ++j)
0491                 rules[i][j].second += startnum;
0492         }
0493         for(size_t i = 0; i != startnum; ++i)
0494             rules[i].clear();
0495     }
0496     NFA<CharSet>& operator|=(const NFA<CharSet>& o)
0497     {
0498         adjust(1);
0499         rules.resize(o.nstates+nstates+2);
0500         for(size_t i = nstates; i != nstates + o.nstates; ++i)
0501         {
0502             rules[i] = o.rules[i - nstates];
0503             for(size_t j = 0; j != rules[i].size(); ++j)
0504                 rules[i][j].second += nstates;
0505         }
0506         rules[0].push_back(make_pair(CharSet(), 1));
0507         rules[0].push_back(make_pair(CharSet(), nstates));
0508         nstates += o.nstates;
0509         rules[accept].push_back(make_pair(CharSet(), nstates));
0510         rules[nstates-1].push_back(make_pair(CharSet(), nstates));
0511         accept = nstates;
0512         ++nstates;
0513         return *this;
0514     }
0515     NFA<CharSet>& operator<<=(const NFA<CharSet>& o)
0516     {
0517         rules.resize(nstates + o.nstates + 1);
0518         rules[accept].push_back(make_pair(CharSet(), nstates));
0519         accept = o.accept + nstates;
0520         for(size_t i = nstates; i != nstates+o.nstates; ++i)
0521         {
0522             rules[i] = o.rules[i-nstates];
0523             for(size_t j = 0; j != rules[i].size(); ++j)
0524                 rules[i][j].second += nstates;
0525         }
0526         nstates += o.nstates;
0527         return *this;
0528     }
0529     NFA<CharSet>& operator*()
0530     {
0531         rules[accept].push_back(make_pair(CharSet(), 0));
0532         rules[0].push_back(make_pair(CharSet(), nstates));
0533         accept = nstates;
0534         ++nstates;
0535         rules.resize(nstates);
0536         return *this;
0537     }
0538     UsedBitArray closure(const UsedBitArray& states)
0539     {
0540         UsedBitArray s(states);
0541         assert(s.size() == nstates);
0542         stack<size_t> todo;
0543         for(size_t i = 0; i != nstates; ++i)
0544             if(s[i])
0545                 todo.push(i);
0546         while(todo.size())
0547         {
0548             size_t x = todo.top();
0549             todo.pop();
0550             for(size_t i = 0; i != rules[x].size(); ++i)
0551             {
0552                 if(rules[x][i].first.epsilon() && !s[rules[x][i].second])
0553                 {
0554                     s[rules[x][i].second] = true;
0555                     todo.push(rules[x][i].second);
0556                 }
0557             }
0558         }
0559         return s;
0560     }
0561     UsedBitArray vecUnion(const UsedBitArray& a, const UsedBitArray& b)
0562     {
0563         UsedBitArray res(a);
0564         for(size_t i = 0; i != a.size(); ++i)
0565             res[i] = res[i] || b[i];
0566         return res;
0567     }
0568     vector< pair<CharSet, UsedBitArray > > follow(const UsedBitArray& states)
0569     {
0570         UsedBitArray s(states);
0571         vector<pair<CharSet, UsedBitArray > > pr(nstates);
0572         for(size_t i = 0; i != nstates; ++i)
0573         {
0574             if(s[i])
0575             {
0576                 for(const auto& r : rules[i])
0577                 {
0578                     UsedBitArray nxt(nstates);
0579                     nxt[r.second] = true;
0580                     pr.push_back(make_pair(r.first, nxt));
0581                 }
0582             }
0583         }
0584         bool b = true;
0585         while(b)
0586         {
0587             b = false;
0588             for(size_t i = 0; i != pr.size(); ++i)
0589             {
0590                 if(!pr[i].first.epsilon())
0591                 {
0592                     for(size_t j = 0; j != i; ++j)
0593                     {
0594                         if(!pr[j].first.epsilon())
0595                         {
0596                             CharSet inter = pr[i].first.intersection(pr[j].first);
0597                             if(!inter.empty())
0598                             {
0599                                 CharSet diff1 = pr[i].first.difference(inter);
0600                                 CharSet diff2 = pr[j].first.difference(inter);
0601                                 UsedBitArray v1 = pr[i].second, v2 = pr[j].second;
0602                                 pr[i].first = inter;
0603                                 pr[i].second = vecUnion(pr[i].second, pr[j].second);
0604                                 if(diff1.empty())
0605                                 {
0606                                     if(diff2.empty())
0607                                     {
0608                                         swap(pr[j], pr.back());
0609                                         pr.pop_back();
0610                                     }
0611                                     else
0612                                     {
0613                                         pr[j].first = diff2;
0614                                         pr[j].second = v2;
0615                                     }
0616                                 }
0617                                 else
0618                                 {
0619                                     pr[j].first = diff1;
0620                                     pr[j].second = v1;
0621                                     if(!diff2.empty())
0622                                         pr.push_back(make_pair(diff2, v2));
0623                                 }
0624                                 b = true;
0625                                 goto retry;
0626                             }
0627                         }
0628                     }
0629                 }
0630             }
0631             retry:;
0632         }
0633         return pr;
0634     }
0635     DFA<CharSet> dfa()
0636     {
0637         unordered_set<UsedBitArray> states;
0638         unordered_map<UsedBitArray, vector<pair<CharSet, UsedBitArray > > > rules;
0639         stack<UsedBitArray > todo;
0640         UsedBitArray start(nstates);
0641         start[0] = true;
0642         start = closure(start);
0643         todo.push(start);
0644         while(todo.size())
0645         {
0646             UsedBitArray x = todo.top();
0647             todo.pop();
0648             states.insert(x);
0649             rules[x];
0650             vector<pair<CharSet, UsedBitArray > > nxt = follow(x);
0651             for(const auto& nx : nxt)
0652             {
0653                 if(!nx.first.epsilon())
0654                 {
0655                     UsedBitArray n = closure(nx.second);
0656                     if(!states.count(n))
0657                         todo.push(n);
0658                     rules[x].push_back(make_pair(nx.first, n));
0659                 }
0660             }
0661         }
0662         DFA<CharSet> _;
0663         _.nstates = states.size();
0664         _.accept.resize(_.nstates);
0665         _.numActions = nstates - accept;
0666         _.actions.resize(_.numActions + 1);
0667         unordered_map<UsedBitArray, size_t> st;
0668         size_t cnt = 0;
0669         foreach(const UsedBitArray& i, states)
0670         {
0671             if(i == start && cnt)
0672             {
0673                 st[*states.begin()] = cnt;
0674                 st[i] = 0;
0675             }
0676             else
0677                 st[i] = cnt;
0678             ++cnt;
0679         }
0680         _.rules.resize(_.nstates);
0681         for(auto i = st.begin(); i != st.end(); ++i)
0682         {
0683           for(size_t j = accept; j != nstates; ++j)
0684           {
0685             if(i->first[j])
0686             {
0687               _.accept[i->second] = j - accept + 1;
0688               break;
0689             }
0690           }
0691         }
0692         for(auto i = rules.begin(); i != rules.end(); ++i)
0693         {
0694             size_t key = st[i->first];
0695             for(const auto& nx : i->second)
0696                 _.rules[key].push_back(make_pair(nx.first, st[nx.second]));
0697         }
0698         return _;
0699     }
0700     bool acceptsEpsilon() const
0701     {
0702       stack<size_t> todo;
0703       todo.push(0);
0704       UsedBitArray vis(nstates);
0705       vis[0] = true;
0706       while(!todo.empty())
0707       {
0708         size_t curr = todo.top();
0709         todo.pop();
0710         if(curr >= accept)
0711           return true;
0712         foreach(const auto& nx, rules[curr])
0713         {
0714           if(nx.first.epsilon() && !vis[nx.second])
0715           {
0716             vis[nx.second] = true;
0717             todo.push(nx.second);
0718           }
0719         }
0720       }
0721       return false;
0722     }
0723     bool isEmpty() const
0724     {
0725       stack<size_t> todo;
0726       todo.push(0);
0727       UsedBitArray vis(nstates);
0728       vis[0] = true;
0729       while(!todo.empty())
0730       {
0731         size_t curr = todo.top();
0732         todo.pop();
0733         if(curr >= accept)
0734           return false;
0735         foreach(const auto& nx, rules[curr])
0736         {
0737           if(!vis[nx.second])
0738           {
0739             vis[nx.second] =true;
0740             todo.push(nx.second);
0741           }
0742         }
0743       }
0744       return true;
0745     }
0746     int minLength() const
0747     {
0748       queue<size_t> _todo0, _todo1;
0749       queue<size_t> *todo0 = &_todo0, *todo1 = &_todo1;
0750       todo0->push(0);
0751       UsedBitArray vis(nstates);
0752       vis[0] = true;
0753       int length = 0;
0754       while(!todo0->empty())
0755       {
0756         size_t curr = todo0->front();
0757         todo0->pop();
0758         if(curr >= accept)
0759           return length;
0760         foreach(const auto& nx, rules[curr])
0761         {
0762           if(!vis[nx.second])
0763           {
0764             vis[nx.second] = true;
0765             if(nx.first.epsilon())
0766               todo0->push(nx.second);
0767             else
0768               todo1->push(nx.second);
0769           }
0770         }
0771         if(todo0->empty())
0772         {
0773           ++length;
0774           swap(todo0, todo1);
0775         }
0776       }
0777       return -1;
0778     }
0779 private:
0780     bool unboundedCheck(size_t x, UsedBitArray& vis) const
0781     {
0782       vis[x] = true;
0783       foreach(const auto& nx, rules[x])
0784       {
0785         if(vis[nx.second])
0786           return true;
0787         else
0788           if(unboundedCheck(nx.second, vis))
0789             return true;
0790       }
0791       vis[x] = false;
0792       return false;
0793     }
0794 public:
0795     bool isUnbounded() const
0796     {
0797       UsedBitArray vis(nstates);
0798       return unboundedCheck(0, vis);
0799     }
0800     int maxLength() const
0801     {
0802       // assumes that there are no circle unarrivable from start
0803       if(isUnbounded())
0804         return -2;
0805       if(isEmpty())
0806         return -1;
0807       queue<size_t> _todo0, _todo1;
0808       queue<size_t> *todo0 = &_todo0, *todo1 = &_todo1;
0809       vector<int> dist(nstates, -1);
0810       vector<vector<pair<CharSet, size_t> > > brules(nstates);
0811       for(size_t i = 0; i != nstates; ++i)
0812       {
0813         foreach(const auto& nx, rules[i])
0814         {
0815           brules[nx.second].push_back(nx);
0816           brules[nx.second].back().second = i;
0817         }
0818       }
0819       for(size_t i = accept; i != nstates; ++i)
0820       {
0821         todo0->push(i);
0822         dist[i] = 0;
0823       }
0824       int length = 0;
0825       while(!todo0->empty())
0826       {
0827         size_t curr = todo0->front();
0828         todo0->pop();
0829         foreach(const auto& nx, brules[curr])
0830         {
0831           if(nx.first.epsilon())
0832           {
0833             if(dist[nx.second] < length)
0834             {
0835               dist[nx.second] = length;
0836               todo0->push(nx.second);
0837             }
0838           }
0839           else
0840           {
0841             if(dist[nx.second] <= length)
0842             {
0843               dist[nx.second] = length + 1;
0844               todo1->push(nx.second);
0845             }
0846           }
0847         }
0848         if(todo0->empty())
0849         {
0850           ++length;
0851           swap(todo0, todo1);
0852         }
0853       }
0854       return dist[0];
0855     }
0856     NFA<CharSet>& negate()
0857     {
0858       DFA<CharSet> tmp = dfa();
0859       tmp.rules.push_back(vector< pair<CharSet, size_t> >());
0860       tmp.rules.back().push_back(make_pair(CharSet::fullSet(), tmp.nstates));
0861       ++tmp.nstates;
0862       for(auto i = tmp.accept.begin(); i != tmp.accept.end(); ++i)
0863       {
0864         if(*i == 0)
0865           *i = 1;
0866         else
0867           *i = 0;
0868       }
0869       tmp.accept.push_back(1);
0870       for(auto i = tmp.rules.begin(); i != tmp.rules.end(); ++i)
0871       {
0872         CharSet successSet = CharSet::fullSet();
0873         for(auto j = i->begin(); j != i->end(); ++j)
0874         {
0875           CharSet tmp = j->first;
0876           tmp.negate();
0877           successSet.intersect(tmp);
0878         }
0879         if(!successSet.empty())
0880           i->push_back(make_pair(successSet, tmp.nstates - 1));
0881       }
0882       tmp.eliminateUnarrivableStates();
0883       tmp.eliminateInactiveStates();
0884       return *this = tmp.nfa();
0885     }
0886     NFA<CharSet>& operator&=(const NFA<CharSet>& o)
0887     {
0888       NFA<CharSet> _o = o;
0889       _o.negate();
0890       negate();
0891       *this |= _o;
0892       negate();
0893       return *this;
0894     }
0895     NFA<CharSet>& operator^=(const NFA<CharSet>& o)
0896     {
0897       negate();
0898       return (*this |= o).negate();
0899     }
0900 };
0901
0902 template<typename CharSet>
0903 NFA< CharSet > DFA<CharSet>::nfa() const
0904 {
0905   NFA<CharSet> res;
0906   if(nstates == 0)
0907     return res;
0908   res.nstates = nstates;
0909   res.rules = rules;
0910   res.accept = nstates;
0911   for(size_t i = 0; i != nstates; ++i)
0912   {
0913     if(nstates + accept[i] > res.nstates)
0914     {
0915       res.nstates = nstates + accept[i];
0916       res.rules.resize(res.nstates);
0917     }
0918     if(accept[i] != 0)
0919       res.rules[i].push_back(make_pair(CharSet(), nstates + accept[i] - 1));
0920   }
0921   if(res.nstates == nstates)
0922   {
0923     ++res.nstates;
0924     res.rules.push_back(vector<pair<CharSet, size_t> >());
0925   }
0926   return res;
0927 }
0928
0929 // notice: utf8/8bit and utf16/ucs2 do not require different NFAs and DFAs
0930 #define EACH_TYPE(macro) \
0931 switch(GDFA::type) \
0932 { \
0933   case SAscii: { typedef SeqCharSet<Ascii> T; macro(s0); } break; \
0934   case SLatin1: case SUtf8: { typedef SeqCharSet<Latin1> T; macro(s1); } break; \
0935   case SUcs2: case SUtf16: { typedef SeqCharSet<Ucs2> T; macro(s2); } break; \
0936   case SUcs4: { typedef SeqCharSet<Ucs4> T; macro(s3); } break; \
0937   case TAscii: { typedef TableCharSet<Ascii> T; macro(t0); } break; \
0938   case TLatin1: case TUtf8: { typedef TableCharSet<Latin1> T; macro(t1); } break; \
0939   case TUcs2: case TUtf16: { typedef TableCharSet<Ucs2> T; macro(t2); } break; \
0940   default: exit(-1); \
0941 }
0942
0943 void GDFA::codegen(QTextStream& str)
0944 {
0945 #define macro(x) x->codegen(str);
0946     EACH_TYPE(macro)
0947 #undef macro
0948 }
0949
0950 GDFA& GDFA::minimize()
0951 {
0952 #define macro(x) x->minimize();
0953   EACH_TYPE(macro)
0954 #undef macro
0955   return *this;
0956 }
0957
0958 void GDFA::setActions(const std::vector< QString >& actions)
0959 {
0960 #define macro(x) x->setActions(actions);
0961   EACH_TYPE(macro)
0962 #undef macro
0963 }
0964
0965 GDFA::GDFA()
0966 {
0967 #define macro(x) x = new DFA<T>;
0968     EACH_TYPE(macro)
0969 #undef macro
0970 }
0971
0972 GDFA::GDFA(const KDevPG::GDFA& o)
0973 {
0974 #define macro(x) x = new DFA<T>(*o.x);
0975     EACH_TYPE(macro)
0976 #undef macro
0977 }
0978
0979 GDFA::~GDFA()
0980 {
0981 #define macro(x) delete x;
0982     EACH_TYPE(macro)
0983 #undef macro
0984 }
0985
0986 GDFA& GDFA::operator=(const KDevPG::GDFA& o)
0987 {
0988 #define macro(x) *x = *o.x;
0989   EACH_TYPE(macro)
0990 #undef macro
0991   return *this;
0992 }
0993
0994 void GDFA::inspect()
0995 {
0996 #define DO_INSPECT(x) x->inspect();
0997   EACH_TYPE(DO_INSPECT)
0998 #undef DO_INSPECT
0999 }
1000
1001 void GDFA::dotOutput(QTextStream& o, const QString& name)
1002 {
1003 #define DO_DOT(x) x->dotOutput(o, name);
1004   EACH_TYPE(DO_DOT)
1005 #undef DO_DOT
1006 }
1007
1008 GNFA GDFA::nfa()
1009 {
1010   GNFA res;
1011 #define DO_NFA(x) res.x = new NFA<T>(x->nfa());
1012   EACH_TYPE(DO_NFA)
1013 #undef DO_NFA
1014   return res;
1015 }
1016
1017 GNFA::GNFA()
1018 {
1019   /* no initialization */
1020 }
1021
1022 GNFA::GNFA(const KDevPG::GNFA& o)
1023 {
1024 #define macro(x) x = new NFA<T>(*o.x);
1025   EACH_TYPE(macro)
1026 #undef macro
1027 }
1028
1029 GNFA::GNFA(const std::vector< GNFA* >& init)
1030 {
1031 #define macro(x) vector< NFA<T> > vec(init.size()); for(size_t i = 0; i != init.size(); ++i) vec[i] = *(init[i]->x); x = new NFA<T>(vec);
1032   EACH_TYPE(macro)
1033 #undef macro
1034 }
1035
1036 GNFA::~GNFA()
1037 {
1038 #define macro(x) delete x;
1039   EACH_TYPE(macro)
1040 #undef macro
1041 }
1042
1043 GNFA& GNFA::operator=(const KDevPG::GNFA& o)
1044 {
1045 #define macro(x) *x = *o.x;
1046   EACH_TYPE(macro)
1047 #undef macro
1048   return *this;
1049 }
1050
1051 bool GNFA::acceptsEpsilon() const
1052 {
1053 #define DO_AE(x) return x->acceptsEpsilon();
1054   EACH_TYPE(DO_AE)
1055 #undef DO_AE
1056 }
1057
1058 bool GNFA::isEmpty() const
1059 {
1060 #define DO_IE(x) return x->isEmpty();
1061   EACH_TYPE(DO_IE)
1062 #undef DO_IE
1063 }
1064
1065 bool GNFA::isUnbounded() const
1066 {
1067 #define DO_IU(x) return x->isUnbounded();
1068   EACH_TYPE(DO_IU)
1069 #undef DO_IU
1070 }
1071
1072 int GNFA::minLength() const
1073 {
1074 #define DO_ML(x) return x->minLength();
1075   EACH_TYPE(DO_ML)
1076 #undef DO_ML
1077 }
1078
1079 int GNFA::maxLength() const
1080 {
1081   #define DO_ML(x) return x->maxLength();
1082   EACH_TYPE(DO_ML)
1083   #undef DO_ML
1084 }
1085
1086 GNFA& GNFA::operator<<=(const KDevPG::GNFA& o)
1087 {
1088 #define DO_SHIFT(x) *x <<= *o.x;
1089   EACH_TYPE(DO_SHIFT)
1090 #undef DO_SHIFT
1091   return *this;
1092 }
1093
1094 GNFA& GNFA::operator|=(const KDevPG::GNFA& o)
1095 {
1096 #define DO_OR(x) *x |= *o.x;
1097   EACH_TYPE(DO_OR)
1098 #undef DO_OR
1099   return *this;
1100 }
1101
1102 GNFA& GNFA::operator&=(const KDevPG::GNFA& o)
1103 {
1104 #define DO_AND(x) *x &= *o.x;
1105   EACH_TYPE(DO_AND)
1106 #undef DO_AND
1107   return *this;
1108 }
1109
1110 GNFA& GNFA::operator^=(const KDevPG::GNFA& o)
1111 {
1112   #define DO_BUT(x) *x ^= *o.x;
1113   EACH_TYPE(DO_BUT)
1114   #undef DO_BUT
1115   return *this;
1116 }
1117
1118 GNFA& GNFA::operator*()
1119 {
1120 #define DO_STAR(x) **x;
1121   EACH_TYPE(DO_STAR)
1122 #undef DO_STAR
1123   return *this;
1124 }
1125
1126 GNFA& GNFA::negate()
1127 {
1128 #define DO_NEGATE(x) x->negate();
1129   EACH_TYPE(DO_NEGATE)
1130 #undef DO_NEGATE
1131   return *this;
1132 }
1133
1134 GNFA& GNFA::minimize()
1135 {
1136   *this = dfa().minimize().nfa();
1137   return *this;
1138 }
1139
1140 void GNFA::inspect()
1141 {
1142 #define DO_INSPECT(x) x->inspect();
1143   EACH_TYPE(DO_INSPECT)
1144 #undef DO_INSPECT
1145 }
1146
1147 void GNFA::dotOutput(QTextStream& o, const QString& name)
1148 {
1149 #define DO_DOT(x) x->dotOutput(o, name);
1150   EACH_TYPE(DO_DOT)
1151 #undef DO_DOT
1152 }
1153
1154 GDFA GNFA::dfa()
1155 {
1156   GDFA res;
1157 #define macro(x) *res.x = x->dfa();
1158   EACH_TYPE(macro)
1159 #undef macro
1160   return res;
1161 }
1162
1163 GNFA GNFA::empty()
1164 {
1165   GNFA res;
1166 #define macro(x) res.x = new NFA<T>;
1167   EACH_TYPE(macro)
1168 #undef macro
1169   return res;
1170 }
1171
1172 GNFA GNFA::word(const QString& str)
1173 {
1174 #define macro(x) \
1175   GNFA res = GNFA::emptyWord(); \
1176   QByteArray qba(str.toUtf8()); \
1177   Codec2FromUtf8Iterator<T::codec>::Result iter(qba); \
1178   while(iter.hasNext()) \
1179   { \
1180     *res.x <<= T(iter.next()); \
1181   } \
1182   return res;
1183   EACH_TYPE(macro)
1184 #undef macro
1185 }
1186
1187 GNFA GNFA::collection(const QString& str)
1188 {
1189   GNFA res = GNFA::empty();
1190   QUtf16ToUcs4Iterator iter(str);
1191   while(iter.hasNext())
1192   {
1193     quint32 next = iter.next();
1194     res |= range(next, next+1);
1195   }
1196   return res;
1197 }
1198
1199 GNFA GNFA::anyChar()
1200 {
1201   GNFA res;
1202 #define macro(x) res.x = new NFA<T>(T::fullSet());
1203   EACH_TYPE(macro)
1204 #undef macro
1205   // remove the surrogate range
1206   if(/*GDFA::type == TUcs2 || */GDFA::type == TUtf16)
1207   {
1208     *res.t2 ^= NFA<TableCharSet<Ucs2> >(TableCharSet<Ucs2>::range(0xd800, 0xe000));
1209   }
1210   else if(/*GDFA::type == SUcs2 || */GDFA::type == SUtf16)
1211   {
1212     *res.s2 ^= NFA<SeqCharSet<Ucs2> >(SeqCharSet<Ucs2>::range(0xd800, 0xe000));
1213   }
1214   // add surrogate pairs
1215   if(GDFA::type == SUtf16)
1216   {
1217     NFA<SeqCharSet<Ucs2> > surrpairs(SeqCharSet<Ucs2>::range(0xd800, 0xdc00));
1218     surrpairs <<= NFA<SeqCharSet<Ucs2> >(SeqCharSet<Ucs2>::range(0xdc00, 0xe000));
1219   }
1220   if(GDFA::type == TUtf16)
1221   {
1222     NFA<TableCharSet<Ucs2> > surrpairs(TableCharSet<Ucs2>::range(0xd800, 0xdc00));
1223     surrpairs <<= NFA<TableCharSet<Ucs2> >(TableCharSet<Ucs2>::range(0xdc00, 0xe000));
1224   }
1225   // all valid utf-8 values
1226   if(GDFA::type == SUtf8)
1227   {
1228     NFA<SeqCharSet<Latin1> > lowerBytes(SeqCharSet<Latin1>::range(0x80, 0xc0));
1229     NFA<SeqCharSet<Latin1> > ascii(SeqCharSet<Latin1>::range(0, 0x7f));
1230     NFA<SeqCharSet<Latin1> > topOf2(SeqCharSet<Latin1>::range(0xc0, 0xe0));
1231     NFA<SeqCharSet<Latin1> > topOf3(SeqCharSet<Latin1>::range(0xe0, 0xf0));
1232     NFA<SeqCharSet<Latin1> > topOf4(SeqCharSet<Latin1>::range(0xf0, 0x100));
1233     *res.s1 = (ascii |= (topOf2 <<= lowerBytes) |= ((topOf3 <<= lowerBytes) <<= lowerBytes) |= (((topOf4 <<= lowerBytes) <<= lowerBytes) <<= lowerBytes));
1234   }
1235   if(GDFA::type == TUtf8)
1236   {
1237     NFA<TableCharSet<Latin1> > lowerBytes(TableCharSet<Latin1>::range(0x80, 0xc0));
1238     NFA<TableCharSet<Latin1> > ascii(TableCharSet<Latin1>::range(0, 0x7f));
1239     NFA<TableCharSet<Latin1> > topOf2(TableCharSet<Latin1>::range(0xc0, 0xe0));
1240     NFA<TableCharSet<Latin1> > topOf3(TableCharSet<Latin1>::range(0xe0, 0xf0));
1241     NFA<TableCharSet<Latin1> > topOf4(TableCharSet<Latin1>::range(0xf0, 0x100));
1242     *res.t1 = (ascii |= (topOf2 <<= lowerBytes) |= ((topOf3 <<= lowerBytes) <<= lowerBytes) |= (((topOf4 <<= lowerBytes) <<= lowerBytes) <<= lowerBytes));
1243   }
1244   return res;
1245 }
1246
1247 GNFA GNFA::anything()
1248 {
1249   GNFA any = anyChar();
1250   return *any;
1251 }
1252
1253 GNFA GNFA::emptyWord()
1254 {
1255   GNFA res;
1256 #define macro(x) res.x = new NFA<T>(NFA<T>::emptyWord());
1257   EACH_TYPE(macro)
1258 #undef macro
1259   return res;
1260 }
1261
1262 #include "generated-kdev-utf8-tuples.h"
1263
1264 template<template<CharEncoding> class CharSet>
1265 void addUtf8Range(NFA<CharSet<Latin1> >& res, quint32 begin, quint32 end)
1266 {
1267   typedef typename Codec2Int<Latin1>::Result Int;
1268   Int from0, from1, from2, from3, to0, to1, to2, to3;
1269   if(begin < 0x80)
1270   {
1271     if(end <= 0x80)
1272     {
1273       res |= getUtf8Tuples<CharSet>(Int(begin), Int(end));
1274       return;
1275     }
1276     res |= getUtf8Tuples<CharSet>(Int(begin), Int(0x80));
1277     begin = 0x80;
1278   }
1279   if(begin < 0x800)
1280   {
1281     from1 = (begin & 0x3f) + 0x80;
1282     from0 = (begin >> 6) + 0xc0;
1283     Int to0, to1;
1284     if(end <= 0x800)
1285     {
1286       to1 = (end & 0x3f) + 0x80;
1287       to0 = (end >> 6) + 0xc0;
1288     }
1289     else
1290     {
1291       to1 = 0x80 + (1<<6);
1292       to0 = 0xc0 + (1<<5);
1293     }
1294     res |= getUtf8Tuples<CharSet>(from0, to0, from1, to1);
1295     if(end <= 0x800)
1296       return;
1297     begin = 0x800;
1298   }
1299   if(begin < 0x10000)
1300   {
1301     from2 = (begin & 0x3f) + 0x80;
1302     from1 = ((begin >> 6) & 0x3f) + 0x80;
1303     from0 = (begin >> 12) + 0xe0;
1304     if(end <= 0x10000)
1305     {
1306       to2 = (end & 0x3f) + 0x80;
1307       to1 = ((end >> 6) & 0x3f) + 0x80;
1308       to0 = (end >> 12) + 0xe0;
1309     }
1310     else
1311     {
1312       to2 = to1 = 0x80 + (1<<6);
1313       to0 = 0xe0 + (1 << 4);
1314     }
1315     res |= getUtf8Tuples<CharSet>(from0, to0, from1, to1, from2, to2);
1316     if(end <= 0x10000)
1317       return;
1318     begin = 0x10000;
1319   }
1320   from3 = (begin & 0x3f) + 0x80;
1321   from2 = ((begin >> 6) & 0x3f) + 0x80;
1322   from1 = ((begin >> 12) & 0x3f) + 0x80;
1323   from0 = (begin >> 18) + 0xf0;
1324   to3 = (end & 0x3f) + 0x80;
1325   to2 = ((end >> 6) & 0x3f) + 0x80;
1326   to1 = ((end >> 12) & 0x3f) + 0x80;
1327   to0 = (end >> 18) + 0xf0;
1328   res |= getUtf8Tuples<CharSet>(from0, to0, from1, to1, from2, to2, from3, to3);
1329 }
1330
1331 GNFA GNFA::range(quint32 begin, quint32 end)
1332 {
1333   // Utf32: take it
1334   // Latin1, Ascii, Ucs2: cut it
1335   // Utf8, Utf32: complicate
1336   GNFA res;
1337   switch(GDFA::type)
1338   {
1339     case SAscii: { res.s0 = new NFA<SeqCharSet<Ascii> >(begin >= 0x80 ? SeqCharSet<Ascii>::emptySet() : SeqCharSet<Ascii>::range(begin, min((quint32)0x80, end))); } break;
1340     case SLatin1: { res.s1 = new NFA<SeqCharSet<Latin1> >(begin >= 0x100 ? SeqCharSet<Latin1>::emptySet() : SeqCharSet<Latin1>::range(begin, min((quint32)0x100, end))); } break;
1341     case SUtf8: {
1342       res.s1 = new NFA<SeqCharSet<Latin1> >();
1343       addUtf8Range(*res.s1, begin, end);
1344     } break;
1345     case SUcs2: {
1346       res.s2 = new NFA<SeqCharSet<Ucs2> >(begin >= 0x10000 ? SeqCharSet<Ucs2>::emptySet() : SeqCharSet<Ucs2>::range(begin, min((quint32)0x10000, end))/*.difference(SeqCharSet<Ucs2>::range(0xd800, 0xe000))*/);
1347     } break;
1348     case SUtf16: {
1349 #define UTF16_IMPL(CS, field) \
1350       if(end >= 0x110000 || begin >= 0x110000)                                          \
1351       {                                                                                 \
1352         res.field = new NFA<CS<Ucs2> >();                                               \
1353       }                                                                                 \
1354       else if(end < 0x10000)                                                            \
1355       {                                                                                 \
1356         if(begin <= 0xd800)                                                             \
1357         {                                                                               \
1358           if(end <= begin)                                                              \
1359           {                                                                             \
1360             res.field = new NFA<CS<Ucs2> >();                                           \
1361           }                                                                             \
1362           else if(end > 0xe000)                                                         \
1363           {                                                                             \
1364             res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(begin, 0xd800).getUnion(CS<Ucs2>::range(0xe000, end))); \
1365           }                                                                             \
1366           else                                                                          \
1367           {                                                                             \
1368             res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(begin, min(end, (quint32)0xd800))); \
1369           }\
1370         }\
1371         else\
1372         {\
1373           auto mbegin = max(begin, (quint32)0xe000);\
1374           if(end <= mbegin)\
1375           {\
1376             res.field = new NFA<CS<Ucs2> >();\
1377           }\
1378           else\
1379           {\
1380             res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(mbegin, end));\
1381           }\
1382         }\
1383       }\
1384       else if(begin >= 0x10000)\
1385       {\
1386         if(end <= begin)\
1387         {\
1388           res.field = new NFA<CS<Ucs2> >();\
1389         }\
1390         else\
1391         {\
1392           quint32 startHighSurrogate = (((begin - 0x10000) & 0xffc00) >> 10) + 0xd800;\
1393           quint32 startLowSurrogate = ((begin - 0x10000) & 0x3ff) + 0xdc00;\
1394           quint32 endHighSurrogate = (((end - 0x10000) & 0xffc00) >> 10) + 0xd800;\
1395           quint32 endLowSurrogate = ((end - 0x10000) & 0x3ff) + 0xdc00;\
1396           if(endLowSurrogate == 0xdc00)\
1397           {\
1398             if(startLowSurrogate == 0xdc00)\
1399             {\
1400               res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\
1401               *res.field <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(0xdc00, 0xe000));\
1402             }\
1403             else\
1404             {\
1405               auto firstPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, startHighSurrogate + 1));\
1406               firstPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\
1407               res.field = new NFA<CS<Ucs2> >(firstPairs);\
1408               if(startHighSurrogate < endHighSurrogate)\
1409               {\
1410                 auto surrPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate + 1, endHighSurrogate + 1));\
1411                 surrPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(0xdc00, 0xe000));\
1412                 *res.field |= surrPairs;\
1413               }\
1414             }\
1415           }\
1416           else\
1417           {\
1418             if(startLowSurrogate == 0xdc00)\
1419             {\
1420               res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\
1421             }\
1422             else\
1423             {\
1424               res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, startHighSurrogate + 1));\
1425               *res.field <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, (startHighSurrogate == endHighSurrogate ? endLowSurrogate : 0xe000)));\
1426               if(startHighSurrogate != endHighSurrogate)\
1427               {\
1428                 if(startHighSurrogate + 1 != endHighSurrogate)\
1429                 {\
1430                   auto midPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate + 1, endHighSurrogate));\
1431                   midPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\
1432                   *res.field |= midPairs;\
1433                 }\
1434                 auto lastPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(endHighSurrogate, endHighSurrogate + 1));\
1435                 lastPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, endLowSurrogate));\
1436                 *res.field |= lastPairs;\
1437               }\
1438             }\
1439           }\
1440         }\
1441       }\
1442       else\
1443       {\
1444         if(end <= begin)\
1445         {\
1446           res.field = new NFA<CS<Ucs2> >();\
1447         }\
1448         else\
1449         {\
1450           if(begin < 0xd800)\
1451           {\
1452             res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(begin, 0xd800).getUnion(CS<Ucs2>::range(0xe000, 0x10000)));\
1453           }\
1454           else\
1455           {\
1456             res.field = new NFA<CS<Ucs2> >(CS<Ucs2>::range(max(begin, 0xe000u), 0x10000));\
1457           }\
1458           quint32 startHighSurrogate = 0xd800;\
1459           quint32 startLowSurrogate = 0xdc00;\
1460           quint32 endHighSurrogate = (((end - 0x10000) & 0xffc00) >> 10) + 0xd800;\
1461           quint32 endLowSurrogate = ((end - 0x10000) & 0x3ff) + 0xdc00;\
1462           if(endLowSurrogate == 0xdc00)\
1463           {\
1464             auto surrPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\
1465             surrPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\
1466             *res.field |= surrPairs;\
1467           }\
1468           else\
1469           {\
1470             if(endHighSurrogate > startHighSurrogate)\
1471             {\
1472               auto midPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(startHighSurrogate, endHighSurrogate));\
1473               midPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, 0xe000));\
1474               *res.field |= midPairs;\
1475             }\
1476             auto lastPairs = NFA<CS<Ucs2> >(CS<Ucs2>::range(endHighSurrogate, endHighSurrogate + 1));\
1477             lastPairs <<= NFA<CS<Ucs2> >(CS<Ucs2>::range(startLowSurrogate, endLowSurrogate));\
1478             *res.field |= lastPairs;\
1479           }\
1480         }\
1481       }
1482       UTF16_IMPL(SeqCharSet, s2)
1483     } break;
1484     case SUcs4: { res.s3 = new NFA<SeqCharSet<Ucs4> >(SeqCharSet<Ucs4>::range(begin, end)); } break;
1485     case TAscii: { res.t0 = new NFA<TableCharSet<Ascii> >(begin >= 0x80 ? TableCharSet<Ascii>::emptySet() : TableCharSet<Ascii>::range(begin, min((quint32)0x80, end))); } break;
1486     case TLatin1: { res.t1 = new NFA<TableCharSet<Latin1> >(begin >= 0x100 ? TableCharSet<Latin1>::emptySet() : TableCharSet<Latin1>::range(begin, min((quint32)0x100, end))); } break;
1487     case TUtf8: {
1488       res.t1 = new NFA<TableCharSet<Latin1> >();
1489       addUtf8Range(*res.t1, begin, end);
1490     } break;
1491     case TUcs2: {
1492       res.t2 = new NFA<TableCharSet<Ucs2> >(begin >= 0x10000 ? TableCharSet<Ucs2>::emptySet() : TableCharSet<Ucs2>::range(begin, min((quint32)0x10000, end))/*.difference(TableCharSet<Ucs2>::range(0xd800, 0xe000))*/);
1493     }
1494     case TUtf16: {
1495       UTF16_IMPL(TableCharSet, t2)
1496       #undef UTF16_IMPL
1497     } break;
1498     default: exit(-1);
1499   }
1500   // TODO: warning if empty
1501   return res;
1502 }
1503
1504 GNFA GNFA::character(quint32 codepoint)
1505 {
1506   return range(codepoint, codepoint+1);
1507 }
1508
1509 void deleteNFA(GNFA *ptr)
1510 {
1511   delete ptr;
1512 }
1513
1514 void deleteDFA(GDFA *ptr)
1515 {
1516   delete ptr;
1517 }
1518
1519 AutomatonType GDFA::type = SUcs2;
1520
1521 template<CharEncoding enc>
1522 const typename TableCharSet<enc>::BitArray TableCharSet<enc>::nullData = bitset<TableCharSet<enc>::nChars>();
1523
1524 }