File indexing completed on 2023-10-03 03:55:29
0001 /* This file is part of kdev-pg-qt 0002 Copyright (C) 2010 Jonathan Schmidt-Dominé <devel@the-user.org> 0003 0004 This library is free software; you can redistribute it and/or 0005 modify it under the terms of the GNU Library General Public 0006 License as published by the Free Software Foundation; either 0007 version 2 of the License, or (at your option) any later version. 0008 0009 This library is distributed in the hope that it will be useful, 0010 but WITHOUT ANY WARRANTY; without even the implied warranty of 0011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0012 Library General Public License for more details. 0013 0014 You should have received a copy of the GNU Library General Public License 0015 along with this library; see the file COPYING.LIB. If not, write to 0016 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0017 Boston, MA 02110-1301, USA. 0018 */ 0019 0020 //krazy:excludeall=inline 0021 #ifndef KDEV_PG_CHAR_SETS 0022 #define KDEV_PG_CHAR_SETS 0023 0024 #include <string> 0025 #include <iostream> 0026 #include <vector> 0027 #include <set> 0028 #include <map> 0029 #include <bitset> 0030 #include <algorithm> 0031 0032 using namespace std; 0033 0034 #include <QString> 0035 #include <QByteArray> 0036 #include <QVector> 0037 #include <QTextStream> 0038 #include <QStringList> 0039 0040 /** 0041 * @FILE 0042 * This file implements various iterator-classes providing character-set specific input streams. 0043 * Each iterator-class has these members: 0044 * @li Int: The type of the values returned by the stream 0045 * @li InputInt: Type of the values in the underlying representation 0046 * @li PlainIterator: InputInt*-like type used to reference underlying positions 0047 * @li next(): returns the next value and advances the stream position 0048 * @li hasNext(): checks if there are elements left 0049 * @li plain(): returns the PlainIterator for the next position 0050 * @li operator-(other): distance in the underlying representation between two of the iterators 0051 * @li begin: returns the PlainIterator at the beginning of the stream 0052 * @todo implement iterators for QIODevices, FILE and STL-streams, would need more abstraction: differentiate between input handling and decoding 0053 */ 0054 0055 namespace KDevPG 0056 { 0057 0058 enum CharEncoding 0059 { 0060 Ascii = 0, 0061 Latin1 = 1, 0062 Utf8 = 2, 0063 Utf16 = 3, 0064 Ucs2 = 4, 0065 Ucs4 = 5 0066 }; 0067 0068 template<CharEncoding codec> 0069 struct Codec2False 0070 { 0071 enum { value = false }; 0072 }; 0073 0074 // use a bit more space than necessary, so we will be able to use ranges excluding the end 0075 template<CharEncoding codec> 0076 struct Codec2Int 0077 { 0078 typedef uchar Result; 0079 }; 0080 0081 template<> 0082 struct Codec2Int<Latin1> 0083 { 0084 typedef quint16 Result; 0085 }; 0086 0087 template<> 0088 struct Codec2Int<Utf8> 0089 { 0090 typedef quint16 Result; 0091 }; 0092 0093 template<> 0094 struct Codec2Int<Ucs2> 0095 { 0096 typedef quint32 Result; 0097 }; 0098 0099 template<> 0100 struct Codec2Int<Utf16> 0101 { 0102 typedef quint32 Result; 0103 }; 0104 0105 template<> 0106 struct Codec2Int<Ucs4> 0107 { 0108 typedef quint32 Result; 0109 }; 0110 0111 0112 template<CharEncoding codec> 0113 struct Codec2Container 0114 { 0115 typedef QByteArray Result; 0116 }; 0117 0118 template<> 0119 struct Codec2Container<Ucs2> 0120 { 0121 typedef QVector<quint16> Result; 0122 }; 0123 0124 template<> 0125 struct Codec2Container<Utf16> 0126 { 0127 typedef QVector<quint16> Result; 0128 }; 0129 0130 template<> 0131 struct Codec2Container<Ucs4> 0132 { 0133 typedef QVector<quint32> Result; 0134 }; 0135 0136 template<CharEncoding codec> 0137 struct Codec2Size 0138 { 0139 enum { value = 256 }; 0140 }; 0141 0142 template<> 0143 struct Codec2Size<Ascii> 0144 { 0145 enum { value = 128 }; 0146 }; 0147 0148 template<> 0149 struct Codec2Size<Ucs2> 0150 { 0151 enum { value = 65536 }; // That is a really large table!! 0152 }; 0153 0154 template<> 0155 struct Codec2Size<Utf16> 0156 { 0157 enum { value = 65536 }; // That is a really large table!! 0158 }; 0159 0160 template<> 0161 struct Codec2Size<Ucs4> 0162 { 0163 enum { value = 0x110000 }; // You should not do this!! 0164 }; 0165 0166 template<CharEncoding codec> 0167 inline typename Codec2Container<codec>::Result qString2Codec(const QString& /*str*/) 0168 { 0169 static_assert(Codec2False<codec>::value, "Unknown codec"); 0170 } 0171 0172 /// @todo check for invalid characters 0173 template<> 0174 inline QByteArray qString2Codec<Ascii>(const QString& str) 0175 { 0176 /// FIXME: in Qt5 there is no Ascii anymore, and in Qt4 it also was something different 0177 /// as it was configurable. In Russia, e.g. Ascii was something different than in Europe etc. pp... 0178 /// See: http://qt-project.org/doc/qt-4.8/qstring.html#toAscii 0179 /// All of this code here should probably be dropped and replaced by QTextCoded or similar 0180 return str.toLatin1(); 0181 } 0182 0183 template<> 0184 inline QByteArray qString2Codec<Latin1>(const QString& str) 0185 { 0186 return str.toLatin1(); 0187 } 0188 0189 template<> 0190 inline QByteArray qString2Codec<Utf8>(const QString& str) 0191 { 0192 return str.toUtf8(); 0193 } 0194 0195 template<> 0196 inline QVector<quint32> qString2Codec<Ucs4>(const QString& str) 0197 { 0198 return str.toUcs4(); 0199 } 0200 0201 template<> 0202 inline QVector<quint16> qString2Codec<Ucs2>(const QString& str) 0203 { 0204 QVector<quint16> ret(str.size()); 0205 memcpy(&ret[0], str.utf16(), 2*str.size()); 0206 return ret; 0207 } 0208 0209 template<> 0210 inline QVector<quint16> qString2Codec<Utf16>(const QString& str) 0211 { 0212 QVector<quint16> ret(str.size()); 0213 memcpy(&ret[0], str.utf16(), 2*str.size()); 0214 return ret; 0215 } 0216 0217 class QStringIterator 0218 { 0219 QString::const_iterator _begin, iter, end; 0220 public: 0221 typedef quint16 Int; 0222 typedef quint16 InputInt; 0223 typedef QString::const_iterator PlainIterator; 0224 QStringIterator(const QString& str) : _begin(str.begin()), iter(str.begin()), end(str.end()) 0225 { 0226 0227 } 0228 quint16 next() 0229 { 0230 return iter++->unicode(); 0231 } 0232 bool hasNext() 0233 { 0234 return iter != end; 0235 } 0236 ptrdiff_t operator-(const QStringIterator& other) const 0237 { 0238 return iter - other.iter; 0239 } 0240 PlainIterator plain() 0241 { 0242 return iter; 0243 } 0244 PlainIterator begin() 0245 { 0246 return _begin; 0247 } 0248 }; 0249 0250 template<typename String> 0251 class ByteStringIterator 0252 { 0253 typename String::const_iterator _begin, iter, end; 0254 public: 0255 typedef uchar Int; 0256 typedef uchar InputInt; 0257 typedef typename String::const_iterator PlainIterator; 0258 ByteStringIterator(const String& str) : _begin(str.begin()), iter(str.begin()), end(str.end()) 0259 { 0260 0261 } 0262 uchar next() 0263 { 0264 return *iter++; 0265 } 0266 bool hasNext() 0267 { 0268 return iter != end; 0269 } 0270 ptrdiff_t operator-(const ByteStringIterator& other) const 0271 { 0272 return iter - other.iter; 0273 } 0274 PlainIterator& plain() 0275 { 0276 return iter; 0277 } 0278 PlainIterator& begin() 0279 { 0280 return _begin; 0281 } 0282 }; 0283 0284 typedef ByteStringIterator<QByteArray> QByteArrayIterator; 0285 typedef ByteStringIterator<string> StdStringIterator; 0286 0287 class QUtf16ToUcs4Iterator 0288 { 0289 union { QChar const *ptr; quint16 const *raw; }; 0290 quint16 const *_begin, *end; 0291 public: 0292 typedef quint32 Int; 0293 typedef quint16 InputInt; 0294 typedef InputInt const* PlainIterator; 0295 QUtf16ToUcs4Iterator(const QString& str) : raw(str.utf16()), _begin(str.utf16()), end(raw + str.size()) 0296 { 0297 0298 } 0299 quint32 next() 0300 { 0301 quint32 ret = ptr->unicode(); 0302 if(QChar::isHighSurrogate(*raw)) 0303 ret = QChar::surrogateToUcs4(ret, *(++raw)); 0304 ++ptr; 0305 return ret; 0306 } 0307 bool hasNext() 0308 { 0309 return raw != end; 0310 } 0311 ptrdiff_t operator-(const QUtf16ToUcs4Iterator& other) const 0312 { 0313 return ptr - other.ptr; 0314 } 0315 PlainIterator& plain() 0316 { 0317 return raw; 0318 } 0319 PlainIterator& begin() 0320 { 0321 return _begin; 0322 } 0323 }; 0324 0325 template<typename String> 0326 class Utf8ToUcs4Iterator 0327 { 0328 public: 0329 typedef typename String::const_iterator PlainIterator; 0330 private: 0331 PlainIterator _begin, ptr, end; 0332 public: 0333 typedef quint32 Int; 0334 typedef uchar InputInt; 0335 Utf8ToUcs4Iterator(const String& str) : _begin(str.begin()), ptr(_begin), end(ptr + str.size()) 0336 { 0337 0338 } 0339 PlainIterator& plain() 0340 { 0341 return ptr; 0342 } 0343 quint32 next() 0344 { 0345 /* 0346 Algorithm: 0347 0348 Start: 0349 case chr < 128 0350 use it directly 0351 case (chr & 0xe0) == 0xc0 0352 (chr & 0x1f) -> add next 0353 case (chr & 0xf0) == 0xe0 0354 (chr & 0x0f) -> add next two 0355 case (chr & 0xf8) == 0xf0 0356 (chr & 0x07) -> add next three 0357 default 0358 invalid 0359 0360 Add: 0361 condition: (next & 0xc0) == 0x80 0362 ret = (ret << 6) | (nextChr & 0x3f) 0363 QChar::isUnicodeNonCharacter -> invalid 0364 */ 0365 0366 while(true) 0367 { 0368 retry: 0369 uchar chr = *ptr; 0370 if(chr < 128) 0371 { 0372 ++ptr; 0373 return chr; 0374 } 0375 quint32 ret; 0376 if((chr & 0xe0) == 0xc0) 0377 { 0378 ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f); 0379 } 0380 else if((chr & 0xf0) == 0xe0) 0381 { 0382 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0383 ret = (ret << 6) | ((*++ptr) & 0x3f); 0384 } 0385 else if((chr & 0xf8) == 0xf0) 0386 { 0387 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0388 ret = (ret << 6) | ((*++ptr) & 0x3f); 0389 ret = (ret << 6) | ((*++ptr) & 0x3f); 0390 } 0391 else 0392 { 0393 ++ptr; 0394 goto retry; 0395 } 0396 ++ptr; 0397 if((ret & 0xfffe) != 0xfffe && (ret - 0xfdd0U) > 15) 0398 return ret; 0399 // ignore the error, jump back :-) 0400 } 0401 } 0402 bool hasNext() 0403 { 0404 return ptr != end; 0405 } 0406 ptrdiff_t operator-(const String& other) const 0407 { 0408 return ptr - other.ptr; 0409 } 0410 PlainIterator& begin() 0411 { 0412 return _begin; 0413 } 0414 }; 0415 0416 typedef Utf8ToUcs4Iterator<QByteArray> QUtf8ToUcs4Iterator; 0417 typedef Utf8ToUcs4Iterator<string> StdStringUtf8ToUcs4Iterator; 0418 0419 class QUtf8ToUcs2Iterator 0420 { 0421 uchar const *_begin, *ptr, *end; 0422 public: 0423 typedef quint16 Int; 0424 typedef uchar InputInt; 0425 typedef InputInt const* PlainIterator; 0426 QUtf8ToUcs2Iterator(const QByteArray& qba) : _begin(reinterpret_cast<uchar const*>(qba.data())), ptr(_begin), end(ptr + qba.size()) 0427 { 0428 0429 } 0430 PlainIterator& plain() 0431 { 0432 return ptr; 0433 } 0434 quint16 next() 0435 { 0436 while(true) 0437 { 0438 retry: 0439 uchar chr = *ptr; 0440 if(chr < 128) 0441 { 0442 ++ptr; 0443 return chr; 0444 } 0445 quint32 ret; 0446 if((chr & 0xe0) == 0xc0) 0447 { 0448 ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f); 0449 } 0450 else if((chr & 0xf0) == 0xe0) 0451 { 0452 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0453 ret = (ret << 6) | ((*++ptr) & 0x3f); 0454 } 0455 else if((chr & 0xf8) == 0xf0) 0456 { 0457 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0458 ret = (ret << 6) | ((*++ptr) & 0x3f); 0459 ret = (ret << 6) | ((*++ptr) & 0x3f); 0460 } 0461 else 0462 { 0463 ++ptr; 0464 goto retry; 0465 } 0466 ++ptr; 0467 if(ret <= 0xffff && (ret & 0xfffe) != 0xfffe && (ret - 0xfdd0U) > 15) 0468 return ret; 0469 // ignore the error, jump back :-) 0470 } 0471 } 0472 bool hasNext() 0473 { 0474 return ptr != end; 0475 } 0476 ptrdiff_t operator-(const QUtf8ToUcs2Iterator& other) const 0477 { 0478 return ptr - other.ptr; 0479 } 0480 PlainIterator& begin() 0481 { 0482 return _begin; 0483 } 0484 }; 0485 0486 class QUtf8ToUtf16Iterator 0487 { 0488 uchar const *_begin, *ptr, *end; 0489 quint16 surrogate; 0490 public: 0491 typedef quint16 Int; 0492 typedef uchar InputInt; 0493 typedef InputInt const* PlainIterator; 0494 QUtf8ToUtf16Iterator(const QByteArray& qba) : _begin(reinterpret_cast<uchar const*>(qba.data())), ptr(_begin), end(ptr + qba.size()), surrogate(0) 0495 { 0496 0497 } 0498 PlainIterator& plain() 0499 { 0500 return ptr; 0501 } 0502 quint16 next() 0503 { 0504 if(surrogate != 0) 0505 { 0506 Int tmp = surrogate; 0507 surrogate = 0; 0508 return tmp; 0509 } 0510 while(true) 0511 { 0512 retry: 0513 uchar chr = *ptr; 0514 if(chr < 128) 0515 { 0516 ++ptr; 0517 return chr; 0518 } 0519 quint32 ret; 0520 if((chr & 0xe0) == 0xc0) 0521 { 0522 ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f); 0523 } 0524 else if((chr & 0xf0) == 0xe0) 0525 { 0526 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0527 ret = (ret << 6) | ((*++ptr) & 0x3f); 0528 } 0529 else if((chr & 0xf8) == 0xf0) 0530 { 0531 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0532 ret = (ret << 6) | ((*++ptr) & 0x3f); 0533 ret = (ret << 6) | ((*++ptr) & 0x3f); 0534 } 0535 else 0536 { 0537 ++ptr; 0538 goto retry; 0539 } 0540 ++ptr; 0541 if(ret <= 0xffff) 0542 { 0543 if((ret & 0xfffe) != 0xfffe && (ret - 0xfdd0U) > 15) 0544 return ret; 0545 // ignoe the error ;) 0546 } 0547 else 0548 { 0549 surrogate = QChar::lowSurrogate(ret); 0550 return QChar::highSurrogate(ret); 0551 } 0552 } 0553 } 0554 bool hasNext() 0555 { 0556 return ptr != end; 0557 } 0558 ptrdiff_t operator-(const QUtf8ToUtf16Iterator& other) const 0559 { 0560 return ptr - other.ptr; 0561 } 0562 PlainIterator& begin() 0563 { 0564 return _begin; 0565 } 0566 }; 0567 0568 class QUtf8ToAsciiIterator 0569 { 0570 uchar const *_begin, *ptr, *end; 0571 public: 0572 typedef uchar Int; 0573 typedef uchar InputInt; 0574 typedef InputInt const* PlainIterator; 0575 QUtf8ToAsciiIterator(const QByteArray& qba) : _begin(reinterpret_cast<uchar const*>(qba.data())), ptr(_begin), end(ptr + qba.size()) 0576 { 0577 0578 } 0579 PlainIterator& plain() 0580 { 0581 return ptr; 0582 } 0583 Int next() 0584 { 0585 while(true) 0586 { 0587 uchar chr = *ptr; 0588 if(chr < 128) 0589 { 0590 ++ptr; 0591 return chr; 0592 } 0593 quint32 ret; 0594 if((chr & 0xe0) == 0xc0) 0595 { 0596 ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f); 0597 } 0598 if((chr & 0xf0) == 0xe0) 0599 { 0600 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0601 ret = (ret << 6) | ((*++ptr) & 0x3f); 0602 } 0603 if((chr & 0xf8) == 0xf0) 0604 { 0605 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0606 ret = (ret << 6) | ((*++ptr) & 0x3f); 0607 ret = (ret << 6) | ((*++ptr) & 0x3f); 0608 } 0609 ++ptr; 0610 // ignore the error, jump back :-) 0611 // TODO: error handling? 0612 } 0613 } 0614 bool hasNext() 0615 { 0616 return ptr != end; 0617 } 0618 ptrdiff_t operator-(const QUtf8ToAsciiIterator& other) const 0619 { 0620 return ptr - other.ptr; 0621 } 0622 PlainIterator& begin() 0623 { 0624 return _begin; 0625 } 0626 }; 0627 0628 template<CharEncoding codec> 0629 struct Codec2FromUtf8Iterator 0630 { 0631 typedef QByteArrayIterator Result; 0632 }; 0633 0634 template<> 0635 struct Codec2FromUtf8Iterator<Ascii> 0636 { 0637 typedef QUtf8ToAsciiIterator Result; 0638 }; 0639 0640 template<> 0641 struct Codec2FromUtf8Iterator<Ucs2> 0642 { 0643 typedef QUtf8ToUcs2Iterator Result; 0644 }; 0645 0646 template<> 0647 struct Codec2FromUtf8Iterator<Utf16> 0648 { 0649 typedef QUtf8ToUtf16Iterator Result; 0650 }; 0651 0652 template<> 0653 struct Codec2FromUtf8Iterator<Ucs4> 0654 { 0655 typedef QUtf8ToUcs4Iterator Result; 0656 }; 0657 0658 } 0659 0660 #endif