File indexing completed on 2024-04-21 04:36:08

0001 /* This file is part of kdev-pg-qt
0002    Copyright (C) 2010 Jonathan Schmidt-Dominé <devel@the-user.org>
0003 
0004    This library is free software; you can redistribute it and/or
0005    modify it under the terms of the GNU Library General Public
0006    License as published by the Free Software Foundation; either
0007    version 2 of the License, or (at your option) any later version.
0008 
0009    This library is distributed in the hope that it will be useful,
0010    but WITHOUT ANY WARRANTY; without even the implied warranty of
0011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0012    Library General Public License for more details.
0013 
0014    You should have received a copy of the GNU Library General Public License
0015    along with this library; see the file COPYING.LIB.  If not, write to
0016    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0017    Boston, MA 02110-1301, USA.
0018 */
0019 
0020 //krazy:excludeall=inline
0021 #ifndef KDEV_PG_CHAR_SETS
0022 #define KDEV_PG_CHAR_SETS
0023 
0024 #include <string>
0025 #include <iostream>
0026 #include <vector>
0027 #include <set>
0028 #include <map>
0029 #include <bitset>
0030 #include <algorithm>
0031 
0032 using namespace std;
0033 
0034 #include <QString>
0035 #include <QByteArray>
0036 #include <QVector>
0037 #include <QTextStream>
0038 #include <QStringList>
0039 
/**
 * @file
 * This file implements various iterator classes providing character-set specific input streams.
 * Each iterator class has these members:
 * @li Int: The type of the values returned by the stream
 * @li InputInt: Type of the values in the underlying representation
 * @li PlainIterator: InputInt*-like type used to reference underlying positions
 * @li next(): returns the next value and advances the stream position
 * @li hasNext(): checks if there are elements left
 * @li plain(): returns the PlainIterator for the next position
 * @li operator-(other): distance in the underlying representation between two of the iterators
 * @li begin(): returns the PlainIterator at the beginning of the stream
 * @todo implement iterators for QIODevices, FILE and STL streams; this would need more abstraction: differentiate between input handling and decoding
 */
0054 
0055 namespace KDevPG
0056 {
0057 
/// Identifiers of the supported character encodings; used as the non-type
/// template argument of the Codec2* traits in this file.
enum CharEncoding
{
  Ascii = 0, // numbering starts at zero, the following enumerators count upwards
  Latin1,    // 1
  Utf8,      // 2
  Utf16,     // 3
  Ucs2,      // 4
  Ucs4       // 5
};
0067 
0068 template<CharEncoding codec>
0069 struct Codec2False
0070 {
0071   enum { value = false };
0072 };
0073 
0074 // use a bit more space than necessary, so we will be able to use ranges excluding the end
0075 template<CharEncoding codec>
0076 struct Codec2Int
0077 {
0078   typedef uchar Result;
0079 };
0080 
0081 template<>
0082 struct Codec2Int<Latin1>
0083 {
0084   typedef quint16 Result;
0085 };
0086 
0087 template<>
0088 struct Codec2Int<Utf8>
0089 {
0090   typedef quint16 Result;
0091 };
0092 
0093 template<>
0094 struct Codec2Int<Ucs2>
0095 {
0096   typedef quint32 Result;
0097 };
0098 
0099 template<>
0100 struct Codec2Int<Utf16>
0101 {
0102   typedef quint32 Result;
0103 };
0104 
0105 template<>
0106 struct Codec2Int<Ucs4>
0107 {
0108   typedef quint32 Result;
0109 };
0110 
0111 
0112 template<CharEncoding codec>
0113 struct Codec2Container
0114 {
0115   typedef QByteArray Result;
0116 };
0117 
0118 template<>
0119 struct Codec2Container<Ucs2>
0120 {
0121   typedef QVector<quint16> Result;
0122 };
0123 
0124 template<>
0125 struct Codec2Container<Utf16>
0126 {
0127   typedef QVector<quint16> Result;
0128 };
0129 
0130 template<>
0131 struct Codec2Container<Ucs4>
0132 {
0133   typedef QVector<quint32> Result;
0134 };
0135 
0136 template<CharEncoding codec>
0137 struct Codec2Size
0138 {
0139   enum { value = 256 };
0140 };
0141 
0142 template<>
0143 struct Codec2Size<Ascii>
0144 {
0145   enum { value = 128 };
0146 };
0147 
0148 template<>
0149 struct Codec2Size<Ucs2>
0150 {
0151   enum { value = 65536 }; // That is a really large table!!
0152 };
0153 
0154 template<>
0155 struct Codec2Size<Utf16>
0156 {
0157   enum { value = 65536 }; // That is a really large table!!
0158 };
0159 
0160 template<>
0161 struct Codec2Size<Ucs4>
0162 {
0163   enum { value = 0x110000 }; // You should not do this!!
0164 };
0165 
0166 template<CharEncoding codec>
0167 inline typename Codec2Container<codec>::Result qString2Codec(const QString& /*str*/)
0168 {
0169   static_assert(Codec2False<codec>::value, "Unknown codec");
0170 }
0171 
0172 /// @todo check for invalid characters
0173 template<>
0174 inline QByteArray qString2Codec<Ascii>(const QString& str)
0175 {
0176   /// FIXME: in Qt5 there is no Ascii anymore, and in Qt4 it also was something different
0177   /// as it was configurable. In Russia, e.g. Ascii was something different than in Europe etc. pp...
0178   /// See: http://qt-project.org/doc/qt-4.8/qstring.html#toAscii
0179   /// All of this code here should probably be dropped and replaced by QTextCoded or similar
0180   return str.toLatin1();
0181 }
0182 
0183 template<>
0184 inline QByteArray qString2Codec<Latin1>(const QString& str)
0185 {
0186   return str.toLatin1();
0187 }
0188 
0189 template<>
0190 inline QByteArray qString2Codec<Utf8>(const QString& str)
0191 {
0192   return str.toUtf8();
0193 }
0194 
0195 template<>
0196 inline QVector<quint32> qString2Codec<Ucs4>(const QString& str)
0197 {
0198   return str.toUcs4();
0199 }
0200 
0201 template<>
0202 inline QVector<quint16> qString2Codec<Ucs2>(const QString& str)
0203 {
0204   QVector<quint16> ret(str.size());
0205   memcpy(&ret[0], str.utf16(), 2*str.size());
0206   return ret;
0207 }
0208 
0209 template<>
0210 inline QVector<quint16> qString2Codec<Utf16>(const QString& str)
0211 {
0212   QVector<quint16> ret(str.size());
0213   memcpy(&ret[0], str.utf16(), 2*str.size());
0214   return ret;
0215 }
0216 
0217 class QStringIterator
0218 {
0219   QString::const_iterator _begin, iter, end;
0220 public:
0221   typedef quint16 Int;
0222   typedef quint16 InputInt;
0223   typedef QString::const_iterator PlainIterator;
0224   QStringIterator(const QString& str) : _begin(str.begin()), iter(str.begin()), end(str.end())
0225   {
0226     
0227   }
0228   quint16 next()
0229   {
0230     return iter++->unicode();
0231   }
0232   bool hasNext()
0233   {
0234     return iter != end;
0235   }
0236   ptrdiff_t operator-(const QStringIterator& other) const
0237   {
0238     return iter - other.iter;
0239   }
0240   PlainIterator plain()
0241   {
0242     return iter;
0243   }
0244   PlainIterator begin()
0245   {
0246     return _begin;
0247   }
0248 };
0249 
0250 template<typename String>
0251 class ByteStringIterator
0252 {
0253   typename String::const_iterator _begin, iter, end;
0254 public:
0255   typedef uchar Int;
0256   typedef uchar InputInt;
0257   typedef typename String::const_iterator PlainIterator;
0258   ByteStringIterator(const String& str) : _begin(str.begin()), iter(str.begin()), end(str.end())
0259   {
0260     
0261   }
0262   uchar next()
0263   {
0264     return *iter++;
0265   }
0266   bool hasNext()
0267   {
0268     return iter != end;
0269   }
0270   ptrdiff_t operator-(const ByteStringIterator& other) const
0271   {
0272     return iter - other.iter;
0273   }
0274   PlainIterator& plain()
0275   {
0276     return iter;
0277   }  
0278   PlainIterator& begin()
0279   {
0280     return _begin;
0281   }
0282 };
0283 
0284 typedef ByteStringIterator<QByteArray> QByteArrayIterator;
0285 typedef ByteStringIterator<string> StdStringIterator;
0286 
0287 class QUtf16ToUcs4Iterator
0288 {
0289   union { QChar const *ptr; quint16 const *raw; };
0290   quint16 const *_begin, *end;
0291 public:
0292   typedef quint32 Int;
0293   typedef quint16 InputInt;
0294   typedef InputInt const* PlainIterator;
0295   QUtf16ToUcs4Iterator(const QString& str) : raw(str.utf16()), _begin(str.utf16()), end(raw + str.size())
0296   {
0297     
0298   }
0299   quint32 next()
0300   {
0301     quint32 ret = ptr->unicode();
0302     if(QChar::isHighSurrogate(*raw))
0303       ret = QChar::surrogateToUcs4(ret, *(++raw));
0304     ++ptr;
0305     return ret;
0306   }
0307   bool hasNext()
0308   {
0309     return raw != end;
0310   }
0311   ptrdiff_t operator-(const QUtf16ToUcs4Iterator& other) const
0312   {
0313     return ptr - other.ptr;
0314   }
0315   PlainIterator& plain()
0316   {
0317     return raw;
0318   }
0319   PlainIterator& begin()
0320   {
0321     return _begin;
0322   }
0323 };
0324 
0325 template<typename String>
0326 class Utf8ToUcs4Iterator
0327 {
0328 public:
0329   typedef typename String::const_iterator PlainIterator;
0330 private:
0331   PlainIterator _begin, ptr, end;
0332 public:
0333   typedef quint32 Int;
0334   typedef uchar InputInt;
0335   Utf8ToUcs4Iterator(const String& str) : _begin(str.begin()), ptr(_begin), end(ptr + str.size())
0336   {
0337     
0338   }
0339   PlainIterator& plain()
0340   {
0341     return ptr;
0342   }
0343   quint32 next()
0344   {
0345     /*
0346     Algorithm:
0347     
0348     Start:
0349       case chr < 128
0350         use it directly
0351       case (chr & 0xe0) == 0xc0
0352         (chr & 0x1f) -> add next
0353       case (chr & 0xf0) == 0xe0
0354         (chr & 0x0f) -> add next two
0355       case (chr & 0xf8) == 0xf0
0356         (chr & 0x07) -> add next three
0357       default
0358         invalid
0359         
0360     Add:
0361       condition: (next & 0xc0) == 0x80
0362       ret = (ret << 6) | (nextChr & 0x3f)
0363       QChar::isUnicodeNonCharacter -> invalid
0364     */
0365     
0366     while(true)
0367     {
0368       retry:
0369       uchar chr = *ptr;
0370       if(chr < 128)
0371       {
0372         ++ptr;
0373         return chr;
0374       }
0375       quint32 ret;
0376       if((chr & 0xe0) == 0xc0)
0377       {
0378         ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f);
0379       }
0380       else if((chr & 0xf0) == 0xe0)
0381       {
0382         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0383         ret = (ret << 6) | ((*++ptr) & 0x3f);
0384       }
0385       else if((chr & 0xf8) == 0xf0)
0386       {
0387         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0388         ret = (ret << 6) | ((*++ptr) & 0x3f);
0389         ret = (ret << 6) | ((*++ptr) & 0x3f);
0390       }
0391       else
0392       {
0393         ++ptr;
0394         goto retry;
0395       }
0396       ++ptr;
0397       if((ret & 0xfffe) != 0xfffe && (ret - 0xfdd0U) > 15)
0398         return ret;
0399       // ignore the error, jump back :-)
0400     }
0401   }
0402   bool hasNext()
0403   {
0404     return ptr != end;
0405   }
0406   ptrdiff_t operator-(const String& other) const
0407   {
0408     return ptr - other.ptr;
0409   }
0410   PlainIterator& begin()
0411   {
0412     return _begin;
0413   }
0414 };
0415 
0416 typedef Utf8ToUcs4Iterator<QByteArray> QUtf8ToUcs4Iterator;
0417 typedef Utf8ToUcs4Iterator<string> StdStringUtf8ToUcs4Iterator;
0418 
0419 class QUtf8ToUcs2Iterator
0420 {
0421   uchar const *_begin, *ptr, *end;
0422 public:
0423   typedef quint16 Int;
0424   typedef uchar InputInt;
0425   typedef InputInt const* PlainIterator;
0426   QUtf8ToUcs2Iterator(const QByteArray& qba) : _begin(reinterpret_cast<uchar const*>(qba.data())), ptr(_begin), end(ptr + qba.size())
0427   {
0428     
0429   }
0430   PlainIterator& plain()
0431   {
0432     return ptr;
0433   }
0434   quint16 next()
0435   { 
0436     while(true)
0437     {
0438       retry:
0439       uchar chr = *ptr;
0440       if(chr < 128)
0441       {
0442         ++ptr;
0443         return chr;
0444       }
0445       quint32 ret;
0446       if((chr & 0xe0) == 0xc0)
0447       {
0448         ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f);
0449       }
0450       else if((chr & 0xf0) == 0xe0)
0451       {
0452         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0453         ret = (ret << 6) | ((*++ptr) & 0x3f);
0454       }
0455       else if((chr & 0xf8) == 0xf0)
0456       {
0457         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0458         ret = (ret << 6) | ((*++ptr) & 0x3f);
0459         ret = (ret << 6) | ((*++ptr) & 0x3f);
0460       }
0461       else
0462       {
0463         ++ptr;
0464         goto retry;
0465       }
0466       ++ptr;
0467       if(ret <= 0xffff && (ret & 0xfffe) != 0xfffe && (ret - 0xfdd0U) > 15)
0468         return ret;
0469       // ignore the error, jump back :-)
0470     }
0471   }
0472   bool hasNext()
0473   {
0474     return ptr != end;
0475   }
0476   ptrdiff_t operator-(const QUtf8ToUcs2Iterator& other) const
0477   {
0478     return ptr - other.ptr;
0479   }
0480   PlainIterator& begin()
0481   {
0482     return _begin;
0483   }
0484 };
0485 
0486 class QUtf8ToUtf16Iterator
0487 {
0488   uchar const *_begin, *ptr, *end;
0489   quint16 surrogate;
0490 public:
0491   typedef quint16 Int;
0492   typedef uchar InputInt;
0493   typedef InputInt const* PlainIterator;
0494   QUtf8ToUtf16Iterator(const QByteArray& qba) : _begin(reinterpret_cast<uchar const*>(qba.data())), ptr(_begin), end(ptr + qba.size()), surrogate(0)
0495   {
0496     
0497   }
0498   PlainIterator& plain()
0499   {
0500     return ptr;
0501   }
0502   quint16 next()
0503   {
0504     if(surrogate != 0)
0505     {
0506       Int tmp = surrogate;
0507       surrogate = 0;
0508       return tmp;
0509     }
0510     while(true)
0511     {
0512       retry:
0513       uchar chr = *ptr;
0514       if(chr < 128)
0515       {
0516         ++ptr;
0517         return chr;
0518       }
0519       quint32 ret;
0520       if((chr & 0xe0) == 0xc0)
0521       {
0522         ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f);
0523       }
0524       else if((chr & 0xf0) == 0xe0)
0525       {
0526         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0527         ret = (ret << 6) | ((*++ptr) & 0x3f);
0528       }
0529       else if((chr & 0xf8) == 0xf0)
0530       {
0531         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0532         ret = (ret << 6) | ((*++ptr) & 0x3f);
0533         ret = (ret << 6) | ((*++ptr) & 0x3f);
0534       }
0535       else
0536       {
0537         ++ptr;
0538         goto retry;
0539       }
0540       ++ptr;
0541       if(ret <= 0xffff)
0542       {
0543         if((ret & 0xfffe) != 0xfffe && (ret - 0xfdd0U) > 15)
0544           return ret;
0545         // ignoe the error ;)
0546       }
0547       else
0548       {
0549         surrogate = QChar::lowSurrogate(ret);
0550         return QChar::highSurrogate(ret);
0551       }
0552     }
0553   }
0554   bool hasNext()
0555   {
0556     return ptr != end;
0557   }
0558   ptrdiff_t operator-(const QUtf8ToUtf16Iterator& other) const
0559   {
0560     return ptr - other.ptr;
0561   }
0562   PlainIterator& begin()
0563   {
0564     return _begin;
0565   }
0566 };
0567 
0568 class QUtf8ToAsciiIterator
0569 {
0570   uchar const *_begin, *ptr, *end;
0571 public:
0572   typedef uchar Int;
0573   typedef uchar InputInt;
0574   typedef InputInt const* PlainIterator;
0575   QUtf8ToAsciiIterator(const QByteArray& qba) : _begin(reinterpret_cast<uchar const*>(qba.data())), ptr(_begin), end(ptr + qba.size())
0576   {
0577     
0578   }
0579   PlainIterator& plain()
0580   {
0581     return ptr;
0582   }
0583   Int next()
0584   { 
0585     while(true)
0586     {
0587       uchar chr = *ptr;
0588       if(chr < 128)
0589       {
0590         ++ptr;
0591         return chr;
0592       }
0593       quint32 ret;
0594       if((chr & 0xe0) == 0xc0)
0595       {
0596         ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f);
0597       }
0598       if((chr & 0xf0) == 0xe0)
0599       {
0600         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0601         ret = (ret << 6) | ((*++ptr) & 0x3f);
0602       }
0603       if((chr & 0xf8) == 0xf0)
0604       {
0605         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0606         ret = (ret << 6) | ((*++ptr) & 0x3f);
0607         ret = (ret << 6) | ((*++ptr) & 0x3f);
0608       }
0609       ++ptr;
0610       // ignore the error, jump back :-)
0611       // TODO: error handling?
0612     }
0613   }
0614   bool hasNext()
0615   {
0616     return ptr != end;
0617   }
0618   ptrdiff_t operator-(const QUtf8ToAsciiIterator& other) const
0619   {
0620     return ptr - other.ptr;
0621   }
0622   PlainIterator& begin()
0623   {
0624     return _begin;
0625   }
0626 };
0627 
0628 template<CharEncoding codec>
0629 struct Codec2FromUtf8Iterator
0630 {
0631   typedef QByteArrayIterator Result;
0632 };
0633 
0634 template<>
0635 struct Codec2FromUtf8Iterator<Ascii>
0636 {
0637   typedef QUtf8ToAsciiIterator Result;
0638 };
0639 
0640 template<>
0641 struct Codec2FromUtf8Iterator<Ucs2>
0642 {
0643   typedef QUtf8ToUcs2Iterator Result;
0644 };
0645 
0646 template<>
0647 struct Codec2FromUtf8Iterator<Utf16>
0648 {
0649   typedef QUtf8ToUtf16Iterator Result;
0650 };
0651 
0652 template<>
0653 struct Codec2FromUtf8Iterator<Ucs4>
0654 {
0655   typedef QUtf8ToUcs4Iterator Result;
0656 };
0657 
0658 }
0659 
0660 #endif