File indexing completed on 2024-05-05 13:02:15
0001 #include <iostream> 0002 #include <QString> 0003 0004 class QUtf16Iterator 0005 { 0006 union { QChar const *ptr; quint16 const *raw; }; 0007 public: 0008 QUtf16Iterator(const QString& str) : raw(str.utf16()) 0009 { 0010 0011 } 0012 QUtf16Iterator& operator++() 0013 { 0014 if(QChar::isHighSurrogate(*raw)) 0015 ++ptr; 0016 ++ptr; 0017 return *this; 0018 } 0019 quint32 operator*() const 0020 { 0021 // big endian 0022 quint32 ret = ptr->unicode(); 0023 if(QChar::isHighSurrogate(*raw)) 0024 return QChar::surrogateToUcs4(ret, raw[1]); 0025 return ret; 0026 } 0027 quint32 next() 0028 { 0029 quint32 ret = ptr->unicode(); 0030 if(QChar::isHighSurrogate(*raw)) 0031 ret = QChar::surrogateToUcs4(ret, *(++raw)); 0032 ++ptr; 0033 return ret; 0034 } 0035 bool atEnd(const QString& str) 0036 { 0037 return raw - str.utf16() == str.size(); 0038 } 0039 }; 0040 0041 class QUtf8Iterator 0042 { 0043 uchar const *ptr; 0044 public: 0045 QUtf8Iterator(const QByteArray& qba) : ptr(reinterpret_cast<uchar const*>(qba.data())) 0046 { 0047 0048 } 0049 QUtf8Iterator& operator++() 0050 { 0051 qFatal("not implemented"); 0052 return *this; 0053 } 0054 quint32 operator*() const 0055 { 0056 qFatal("not implemented"); 0057 } 0058 quint32 next() 0059 { 0060 /* 0061 Algorithm: 0062 0063 Start: 0064 case chr < 128 0065 use it directly 0066 case (chr & 0xe0) == 0xc0 0067 (chr & 0x1f) -> add next 0068 case (chr & 0xf0) == 0xe0 0069 (chr & 0x0f) -> add next two 0070 case (chr & 0xf8) == 0xf0 0071 (chr & 0x07) -> add next three 0072 default 0073 invalid 0074 0075 Add: 0076 condition: (next & 0xc0) == 0x80 0077 ret = (ret << 6) | (nextChr & 0x3f) 0078 QChar::isUnicodeNonCharacter -> invalid 0079 */ 0080 0081 while(true) 0082 { 0083 uchar chr = *ptr; 0084 if(chr < 128) 0085 { 0086 ++ptr; 0087 return chr; 0088 } 0089 quint32 ret; 0090 if((chr & 0xe0) == 0xc0) 0091 { 0092 ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f); 0093 } 0094 if((chr & 0xf0) == 0xe0) 0095 { 0096 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0097 ret = (ret << 6) | ((*++ptr) & 0x3f); 0098 } 0099 if((chr & 0xf8) == 0xf0) 0100 { 0101 ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f); 0102 ret = (ret << 6) | ((*++ptr) & 0x3f); 0103 ret = (ret << 6) | ((*++ptr) & 0x3f); 0104 } 0105 ++ptr; 0106 if((ret & 0xfffe) != 0xfffe && (ret - 0xfdd0U) > 15) 0107 return ret; 0108 // ignore the error, jump back :-) 0109 } 0110 } 0111 bool atEnd(const QByteArray& str) 0112 { 0113 return ptr - reinterpret_cast<uchar const*>(str.data()) == str.size(); 0114 } 0115 }; 0116 0117 int main() 0118 { 0119 QByteArray utf8("§utf8ärrayßようこそ中華民族𐀃bla"); // linear b character 0120 QUtf8Iterator utf8i(utf8); 0121 size_t s = 0; 0122 while(!utf8i.atEnd(utf8)) 0123 { 0124 ++s; 0125 quint32 u = utf8i.next(); 0126 std::cout << u << " " << QString(u).toUtf8().data() << std::endl; 0127 } 0128 std::cout << "utf8 size: " << utf8.size() << " real size: " << s << std::endl; 0129 QString utf16 = QString::fromUtf8("$utf16ärrayßようこそ中華民族𐀃bla"); 0130 QUtf16Iterator utf16i(utf16); 0131 s = 0; 0132 while(!utf16i.atEnd(utf16)) 0133 { 0134 ++s; 0135 quint32 u = utf16i.next(); 0136 std::cout << u << " " << QString(u).toUtf8().data() << std::endl; 0137 } 0138 std::cout << "utf16 size: " << utf16.size() << " real size: " << s << std::endl; 0139 }