File indexing completed on 2024-04-21 04:36:18

0001 /* This file is part of kdev-pg-qt
0002    Copyright (C) 2010 Jonathan Schmidt-Dominé <devel@the-user.org>
0003 
0004    This library is free software; you can redistribute it and/or
0005    modify it under the terms of the GNU Library General Public
0006    License as published by the Free Software Foundation; either
0007    version 2 of the License, or (at your option) any later version.
0008 
0009    This library is distributed in the hope that it will be useful,
0010    but WITHOUT ANY WARRANTY; without even the implied warranty of
0011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0012    Library General Public License for more details.
0013 
0014    You should have received a copy of the GNU Library General Public License
0015    along with this library; see the file COPYING.LIB.  If not, write to
0016    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0017    Boston, MA 02110-1301, USA.
0018 */
0019 
0020 #ifndef KDEV_PG_REGEXP_HELPER
0021 #define KDEV_PG_REGEXP_HELPER
0022 
0023 #include <kdev-pg-char-sets.h>
0024 
0025 #include <iomanip>
0026 
0027 namespace KDevPG
0028 {
0029 
0030 template<CharEncoding codec>
0031 class SeqCharSet;
0032 
0033 inline void printChar(ostream& o, uint x)
0034 {
0035   auto flags = o.flags();
0036   auto width = o.width();
0037   auto fill = o.fill();
0038   if(x == '\\')
0039     o << "'\\\\'";
0040   else if(x == '\'')
0041     o << "'\\\\''";
0042   else if(x == '"')
0043     o << "'\\\"'";
0044   else if(x >= 32 && x <= 126)
0045     o << '\'' << (char)x << '\'';
0046   else
0047     o << hex << "\\" << setw(2) << setfill('0') << x;
0048   o.fill(fill);
0049   o.width(width);
0050   o.flags(flags);
0051 }
0052 
0053 inline void printChar(QTextStream& o, uint x)
0054 {
0055   auto base = o.integerBase();
0056   auto width = o.fieldWidth();
0057   auto pad = o.padChar();
0058   if(x == '\\')
0059     o << "'\\\\'";
0060   else if(x == '\'')
0061     o << "'\\\\''";
0062   else if(x == '"')
0063     o << "'\\\"'";
0064   else if(x >= 32 && x <= 126)
0065     o << '\'' << (char)x << '\'';
0066   else
0067     o << Qt::hex << "\\" << qSetFieldWidth(2) << qSetPadChar('0') << x;
0068   o.setPadChar(pad);
0069   o.setFieldWidth(width);
0070   o.setIntegerBase(base);
0071 }
0072 
0073 template<typename Stream, CharEncoding codec>
0074 Stream& operator<<(Stream&, const SeqCharSet<codec>&);
0075 
0076 template<class cs> class CharSetCondition;
0077 
0078 template<CharEncoding _codec>
0079 class SeqCharSet
0080 {
0081 public:
0082     static const CharEncoding codec = _codec;
0083 private:
0084     typedef typename Codec2Int<codec>::Result Int;
0085     typedef SeqCharSet<codec> Self;
0086     vector<pair<Int, Int> > data;
0087     bool mEpsilon: 1;
0088     template<typename Stream, CharEncoding codec> /* when using fixed codec I get linker errors (undefined reference), templates do not get recognized to be the same */
0089     friend Stream& operator<<(Stream&, const SeqCharSet<codec>&);
0090     friend class CharSetCondition< SeqCharSet<codec> >;
0091 public:
0092     SeqCharSet(const QString& s ) : mEpsilon(false)
0093     {
0094       auto str = qString2Codec<codec>(s);
0095       sort(str.begin(), str.end());
0096       Int last = -1;
0097       for(int i = 0; i != str.size(); ++i)
0098       {
0099         Int x = str[i];
0100         if(x == last)
0101           last = ++data.back().second;
0102         else
0103           data.push_back(make_pair(x, last = x+1));
0104       }
0105     }
0106     SeqCharSet(Int chr) : mEpsilon(false)
0107     {
0108       data.push_back(make_pair(chr, chr + 1));
0109     }
0110     SeqCharSet() : mEpsilon(true)
0111     {}
0112     SeqCharSet(const Self& o) : data(o.data), mEpsilon(o.mEpsilon)
0113     {
0114     }
0115     static Self fullSet()
0116     {
0117       Self ret;
0118       ret.mEpsilon = false;
0119       ret.data.push_back(make_pair(Int(0), Int(Codec2Size<codec>::value)));
0120       return ret;
0121     }
0122     static Self range(Int begin, Int end)
0123     {
0124       Self ret;
0125       ret.mEpsilon = false;
0126       if(begin < end)
0127         ret.data.push_back(make_pair(begin, end));
0128       return ret;
0129     }
0130     static Self emptySet()
0131     {
0132       Self ret;
0133       ret.mEpsilon = false;
0134       return ret;
0135     }
0136     SeqCharSet& operator=(const Self& o)
0137     {
0138       data = o.data;
0139       mEpsilon = o.mEpsilon;
0140       return *this;
0141     }
0142     bool epsilon() const
0143     {
0144         return mEpsilon;
0145     }
0146     bool empty() const
0147     {
0148         return data.empty();
0149     }
0150     bool accepts(Int x) const
0151     {
0152         int l = 0, r = data.size();
0153         while(l != r)
0154         {
0155           int m = (l + r) / 2;
0156           if(data[m].first <= x)
0157           {
0158             if(data[m].second > x)
0159               return true;
0160             l = m + 1;
0161           }
0162           else
0163             r = m;
0164         }
0165         return false;
0166     }
0167     Self intersection(const Self& o) const
0168     {
0169         Self ret;
0170         ret.mEpsilon = false;
0171         size_t tpos = 0, opos = 0;
0172         while(tpos != data.size() && opos != o.data.size())
0173         {
0174           if(data[tpos].first >= o.data[opos].second)
0175             ++opos;
0176           else if(o.data[opos].first >= data[tpos].second)
0177             ++tpos;
0178           else
0179           {
0180             ret.data.push_back(make_pair(max(o.data[opos].first, data[tpos].first), min(o.data[opos].second, data[tpos].second)));
0181             if(data[tpos].second > o.data[opos].second)
0182               ++opos;
0183             else
0184               ++tpos;
0185           }
0186         }
0187         // the end??
0188         return ret;
0189     }
0190     Self& intersect(const Self& o)
0191     {
0192         *this = this->intersection(o);
0193         return *this;
0194     }
0195     Self& negate()
0196     {
0197         if(!mEpsilon)
0198         {
0199           if(data.size() == 0)
0200           {
0201             data.push_back(make_pair(0, Int(Codec2Size<codec>::value)));
0202           }
0203           else if(data.front().first == 0)
0204           {
0205             for(size_t i = 0; i != data.size()-1; ++i)
0206             {
0207               data[i].first = data[i].second;
0208               data[i].second = data[i+1].first;
0209             }
0210             if(data.back().second == Codec2Size<codec>::value)
0211               data.pop_back();
0212             else
0213             {
0214               data.back().first = data.back().second;
0215               data.back().second = Codec2Size<codec>::value;
0216             }
0217           }
0218           else
0219           {
0220             Int tmpfirst = 0;
0221             for(size_t i = 0; i != data.size(); ++i)
0222             {
0223               Int tmptmpfirst = data[i].first;
0224               data[i].first = tmpfirst;
0225               tmpfirst = data[i].second;
0226               data[i].second = tmptmpfirst;
0227             }
0228             if(data.back().second != Codec2Size<codec>::value)
0229               data.push_back(make_pair(tmpfirst, Int(Codec2Size<codec>::value)));
0230           }
0231         }
0232         return *this;
0233     }
0234     Self difference(const Self& o) const
0235     {
0236         Self ret;
0237         ret.mEpsilon = false;
0238         size_t tpos = 0, opos = 0;
0239         bool inT = false, inO = false;
0240         size_t start = -1; // if there is a -1 we will know what happened
0241         while(tpos != data.size() && opos != o.data.size())
0242         {
0243           if(inT)
0244           {
0245             if(inO)
0246             {
0247               if(o.data[opos].second >= data[tpos].second)
0248               {
0249                 inT = false;
0250                 ++tpos;
0251               }
0252               else
0253               {
0254                 start = o.data[opos].second;
0255                 inO = false;
0256                 ++opos;
0257               }
0258             }
0259             else
0260             {
0261               if(o.data[opos].first >= data[tpos].second)
0262               {
0263                 if(start != data[tpos].second)
0264                   ret.data.push_back(make_pair(start, data[tpos].second));
0265                 ++tpos;
0266                 inT = false;
0267               }
0268               else
0269               {
0270                 if(o.data[opos].first > start)
0271                   ret.data.push_back(make_pair(start, o.data[opos].first));
0272                 inO = true;
0273               }
0274             }
0275           }
0276           else
0277           {
0278             if(inO)
0279             {
0280               if(o.data[opos].second <= data[tpos].first)
0281               {
0282                 inO = false;
0283                 ++opos;
0284               }
0285               else
0286               {
0287                 inT = true;
0288                 start = o.data[opos].second;
0289               }
0290             }
0291             else
0292             {
0293               if(data[tpos].first < o.data[opos].first)
0294               {
0295                 inT = true;
0296                 start = data[tpos].first;
0297               }
0298               else
0299               {
0300                 inO = true;
0301               }
0302             }
0303           }
0304         }
0305         if(inT)
0306           ret.data.push_back(make_pair(start, data[tpos++].second));
0307         while(tpos != data.size())
0308           ret.data.push_back(data[tpos++]);
0309         return ret;
0310     }
0311     Self getUnion(const Self& o) const
0312     {
0313         Self ret;
0314         ret.mEpsilon = false;
0315         size_t tpos = 0, opos = 0;
0316         bool inT = false, inO = false;
0317         size_t start = -1; // if there is a -1 we will know what happened
0318         while(tpos != data.size() && opos != o.data.size())
0319         {
0320           if(inT)
0321           {
0322             if(inO)
0323             {
0324               if(o.data[opos].second >= data[tpos].second)
0325               {
0326                 inT = false;
0327                 ++tpos;
0328               }
0329               else
0330               {
0331                 inO = false;
0332                 ++opos;
0333               }
0334             }
0335             else
0336             {
0337               if(o.data[opos].first > data[tpos].second)
0338               {
0339                 if(start != data[tpos].second)
0340                   ret.data.push_back(make_pair(start, data[tpos].second));
0341                 ++tpos;
0342                 inT = false;
0343               }
0344               else
0345               {
0346                 inO = true;
0347               }
0348             }
0349           }
0350           else
0351           {
0352             if(inO)
0353             {
0354               if(o.data[opos].second < data[tpos].first)
0355               {
0356                 if(start != o.data[opos].second)
0357                   ret.data.push_back(make_pair(start, o.data[opos].second));
0358                 inO = false;
0359                 ++opos;
0360               }
0361               else
0362               {
0363                 inT = true;
0364               }
0365             }
0366             else
0367             {
0368               if(data[tpos].first < o.data[opos].first)
0369               {
0370                 inT = true;
0371                 start = data[tpos].first;
0372               }
0373               else
0374               {
0375                 inO = true;
0376                 start = o.data[opos].first;
0377               }
0378             }
0379           }
0380         }
0381         if(inT)
0382           ret.data.push_back(make_pair(start, data[tpos++].second));
0383         while(tpos != data.size())
0384           ret.data.push_back(data[tpos++]);
0385         if(inO)
0386           ret.data.push_back(make_pair(start, o.data[opos++].second));
0387         while(opos != o.data.size())
0388           ret.data.push_back(o.data[opos++]);
0389         return ret;
0390     }
0391     Self& unite(const Self& o)
0392     {
0393         data = getUnion(o).data;
0394         return *this;
0395     }
0396     bool operator==(const Self& o) const
0397     {
0398         return mEpsilon == o.mEpsilon && data == o.data;
0399     }
0400 };
0401 
0402 template<typename Stream, CharEncoding codec>
0403 Stream& operator<<(Stream &o, const SeqCharSet<codec> &cs)
0404 {
0405   typedef typename Codec2Int<codec>::Result Int;
0406   if(cs.mEpsilon)
0407     o << "&epsilon;";
0408   else
0409   {
0410     for(auto i = cs.data.begin(); i != cs.data.end(); ++i)
0411     {
0412       if(i != cs.data.begin())
0413         o << ", ";
0414       const auto &p = *i;
0415       if(p.first == p.second - 1)
0416         printChar(o, p.first);
0417       else
0418       {
0419         printChar(o, p.first);
0420         o << "-";
0421         printChar(o, (uint)p.second-1);
0422       }
0423     }
0424   }
0425   return o;
0426 }
0427 
0428 template<CharEncoding codec>
0429 class TableCharSet;
0430 
0431 template<typename Stream, CharEncoding codec>
0432 Stream& operator<<(Stream&, const TableCharSet<codec>&);
0433 
0434 template<CharEncoding _codec>
0435 class TableCharSet
0436 {
0437 public:
0438   static const CharEncoding codec = _codec;
0439 private:
0440   static const size_t nChars = Codec2Size<codec>::value;
0441   typedef bitset<nChars> BitArray;
0442   static const BitArray nullData;
0443   bitset<Codec2Size<codec>::value> data;
0444   bool mEpsilon: 1;
0445   typedef TableCharSet<codec> Self;
0446   typedef typename Codec2Int<codec>::Result Int;
0447   template<typename Stream, CharEncoding codec> /* when using fixed codec I get linker errors (undefined reference), templates do not get recognized to be the same */
0448   friend Stream& KDevPG::operator<<(Stream&, const TableCharSet<codec>&);
0449 public:
0450   TableCharSet(const QString& s) : mEpsilon(false)
0451   {
0452       const auto str = qString2Codec<codec>(s);
0453       Int last = -1;
0454       for(int i = 0; i != str.size(); ++i)
0455       {
0456         Int x = str[i];
0457         data[x] = true;
0458       }
0459   }
0460   TableCharSet(Int chr) : mEpsilon(false)
0461   {
0462     data[chr] = true;
0463   }
0464   TableCharSet() : mEpsilon(true)
0465   {}
0466   static Self emptySet()
0467   {
0468     Self ret;
0469     ret.mEpsilon = false;
0470     return ret;
0471   }
0472   static Self fullSet()
0473   {
0474     Self ret;
0475     ret.data.set();
0476     ret.mEpsilon = false;
0477     return ret;
0478   }
0479   static Self range(Int begin, Int end)
0480   {
0481     Self ret;
0482     if(begin < end)
0483       for(Int i = begin; i != end; ++i)
0484         ret.data[i] = true;
0485     ret.mEpsilon = false;
0486     return ret;
0487   }
0488   Self& negate()
0489   {
0490     data.flip();
0491     return *this;
0492   }
0493   bool accepts(Int x) const
0494   {
0495     return data[x];
0496   }
0497   bool empty() const
0498   {
0499     return data == nullData;
0500   }
0501   bool epsilon() const
0502   {
0503     return mEpsilon;
0504   }
0505   Self& intersect(const Self& o)
0506   {
0507     data &= o.data;
0508     return *this;
0509   }
0510   Self intersection(const Self& o) const
0511   {
0512     Self ret(*this);
0513     ret.data &= o.data;
0514     return ret;
0515   }
0516   Self difference(const Self& o) const
0517   {
0518     Self ret(o);
0519     ret.data.flip();
0520     ret.data &= data;
0521     return ret;
0522   }
0523   Self getUnion(const Self& o) const
0524   {
0525     Self ret(*this);
0526     ret.data |= o.data;
0527     return ret;
0528   }
0529   Self& unite(const Self& o)
0530   {
0531     data |= o.data;
0532     return *this;
0533   }
0534   bool operator==(const Self& o) const
0535   {
0536     return mEpsilon == o.mEpsilon && data == o.data;
0537   }
0538 };
0539 
0540 template<typename Stream, CharEncoding codec>
0541 Stream& operator<<(Stream &o, const TableCharSet<codec> &cs)
0542 {
0543   typedef typename Codec2Int<codec>::Result Int;
0544   if(cs.mEpsilon)
0545     o << "&epsilon;";
0546   else
0547   {
0548     for(size_t i = 0; i != cs.data.size(); ++i)
0549       if(cs.data[i])
0550         printChar(o, (uint)i);
0551   }
0552   return o;
0553 }
0554 
0555 template<class CharSet>
0556 class CharSetCondition
0557 {
0558 };
0559 
0560 template<CharEncoding codec>
0561 class CharSetCondition<TableCharSet<codec> >
0562 {
0563   typedef typename Codec2Int<codec>::Result Int;
0564 public:
0565   void operator()(QTextStream& str, const vector<pair<TableCharSet<codec>, size_t> >& transition)
0566   {
0567     str << "switch(chr) { ";
0568     for(size_t j = 0; j != transition.size(); ++j)
0569     {
0570       if(!transition[j].first.empty())
0571       {
0572         for(quint64 i = 0; i != Codec2Size<codec>::value; ++i)
0573         {
0574           if(transition[j].first.accepts(i))
0575             str << "case " << quint64(i) << ":\n";
0576         }
0577         str << "goto _state_" << transition[j].second << ";";
0578       }
0579     }
0580     str << "default: goto _end;\n}\n";
0581   }
0582 };
0583 
0584 template<CharEncoding codec>
0585 class CharSetCondition<SeqCharSet<codec> >
0586 {
0587   typedef typename Codec2Int<codec>::Result Int;
0588   void codegen(QTextStream& str, const vector<pair<pair<Int, Int>, size_t> >& data, size_t l, size_t r, size_t tmin, size_t tmax)
0589   {
0590     if(l == r)
0591       str << "goto _end;";
0592     else if(r == l + 1)
0593     {
0594       if(data[l].first.first == tmin)
0595       {
0596         if(data[l].first.second == tmax)
0597           str << "goto _state_" << data[l].second << ";";
0598         else
0599         {
0600           if(data[l].first.second == data[l].first.first + 1)
0601             str << "if(chr == " << quint64(data[l].first.first) << ")";
0602           else
0603             str << "if(chr < " << quint64(data[l].first.second) << ")";
0604           str << "\ngoto _state_" << data[l].second << "; else\ngoto _end;\n";
0605         }
0606       }
0607       else
0608       {
0609         if(data[l].first.second == tmax)
0610         {
0611           if(data[l].first.second == data[l].first.first + 1)
0612             str << "if(chr == " << quint64(data[l].first.first) << ")";
0613           else
0614             str << "if(chr >= " << quint64(data[l].first.first) << ")";
0615           str << "\ngoto _state_" << data[l].second << "; else\ngoto _end;\n";
0616         }
0617         else
0618         {
0619           if(data[l].first.first == data[l].first.second - 1)
0620             str << "if(chr == " << quint64(data[l].first.first) << ")";
0621           else if(data[l].first.first == data[l].first.second - 2)
0622             str << "if(chr == " << quint64(data[l].first.first) << " || chr == " << quint64(data[l].first.first + 1) << ")";
0623           else
0624             str << "if(chr >= " << quint64(data[l].first.first) << " && chr < " << quint64(data[l].first.second) << ")";
0625           str << "\ngoto _state_" << data[l].second << "; else\ngoto _end;\n";
0626         }
0627       }
0628     }
0629     else
0630     {
0631       size_t mid = (l + r) / 2;
0632       str << "if(chr < " << quint64(data[mid].first.first) << ") {";
0633       codegen(str, data, l, mid, tmin, data[mid].first.first);
0634       str << "} else {";
0635       codegen(str, data, mid, r, data[mid].first.first, tmax);
0636       str << "}";
0637     }
0638   }
0639 public:
0640   void operator()(QTextStream& str, const vector<pair<SeqCharSet<codec>, size_t> >& transition)
0641   {
0642     vector<pair<pair<Int, Int>, size_t> > data;
0643     for(size_t i = 0; i != transition.size(); ++i)
0644     {
0645       for(size_t j = 0; j != transition[i].first.data.size(); ++j)
0646         data.push_back(make_pair(transition[i].first.data[j], transition[i].second));
0647     }
0648     sort(data.begin(), data.end());
0649     codegen(str, data, 0, data.size(), 0, Codec2Size<codec>::value);
0650   }
0651 };
0652 
0653 // just for the case QTextCodec may be needed
0654 /*
0655 class QAsciiCodec : public QTextCodec
0656 {
0657 public:
0658   QAsciiCodec() : QTextCodec()
0659   {
0660   }
0661   QList<QByteArray> aliases() const
0662   {
0663     QList<QByteArray> ret;
0664     ret << "ascii" << "us-ascii" << "usascii";
0665     return ret;
0666   }
0667   QByteArray convertFromUnicode(const QChar* in, int length, ConverterState* state) const
0668   {
0669     Q_UNUSED(state);
0670     return QString(in, length).toAscii();
0671   }
0672   virtual QString convertToUnicode(const char* in, int length, ConverterState* state) const
0673   {
0674     Q_UNUSED(state);
0675     return QString::fromAscii(in, length);
0676   }
0677   virtual int mibEnum() const
0678   {
0679     return 47110815; // pseudo random, should be unique
0680   }
0681   virtual QByteArray name() const
0682   {
0683     return "us-ascii";
0684   }
0685 } qAsciiCodec;
0686 */
0687 
0688 }
0689 
0690 #endif