File indexing completed on 2024-04-28 04:36:09

0001 #include <iostream>
0002 #include <QString>
0003 
0004 class QUtf16Iterator
0005 {
0006   union { QChar const *ptr; quint16 const *raw; };
0007 public:
0008   QUtf16Iterator(const QString& str) : raw(str.utf16())
0009   {
0010     
0011   }
0012   QUtf16Iterator& operator++()
0013   {
0014     if(QChar::isHighSurrogate(*raw))
0015       ++ptr;
0016     ++ptr;
0017     return *this;
0018   }
0019   quint32 operator*() const
0020   {
0021     // big endian
0022     quint32 ret = ptr->unicode();
0023     if(QChar::isHighSurrogate(*raw))
0024       return QChar::surrogateToUcs4(ret, raw[1]);
0025     return ret;
0026   }
0027   quint32 next()
0028   {
0029     quint32 ret = ptr->unicode();
0030     if(QChar::isHighSurrogate(*raw))
0031       ret = QChar::surrogateToUcs4(ret, *(++raw));
0032     ++ptr;
0033     return ret;
0034   }
0035   bool atEnd(const QString& str)
0036   {
0037     return raw - str.utf16() == str.size();
0038   }
0039 };
0040 
0041 class QUtf8Iterator
0042 {
0043   uchar const *ptr;
0044 public:
0045   QUtf8Iterator(const QByteArray& qba) : ptr(reinterpret_cast<uchar const*>(qba.data()))
0046   {
0047     
0048   }
0049   QUtf8Iterator& operator++()
0050   {
0051     qFatal("not implemented");
0052     return *this;
0053   }
0054   quint32 operator*() const
0055   {
0056     qFatal("not implemented");
0057   }
0058   quint32 next()
0059   {
0060     /*
0061     Algorithm:
0062     
0063     Start:
0064       case chr < 128
0065         use it directly
0066       case (chr & 0xe0) == 0xc0
0067         (chr & 0x1f) -> add next
0068       case (chr & 0xf0) == 0xe0
0069         (chr & 0x0f) -> add next two
0070       case (chr & 0xf8) == 0xf0
0071         (chr & 0x07) -> add next three
0072       default
0073         invalid
0074         
0075     Add:
0076       condition: (next & 0xc0) == 0x80
0077       ret = (ret << 6) | (nextChr & 0x3f)
0078       QChar::isUnicodeNonCharacter -> invalid
0079     */
0080     
0081     while(true)
0082     {
0083       uchar chr = *ptr;
0084       if(chr < 128)
0085       {
0086         ++ptr;
0087         return chr;
0088       }
0089       quint32 ret;
0090       if((chr & 0xe0) == 0xc0)
0091       {
0092         ret = ((chr & 0x1f) << 6) | ((*++ptr) & 0x3f);
0093       }
0094       if((chr & 0xf0) == 0xe0)
0095       {
0096         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0097         ret = (ret << 6) | ((*++ptr) & 0x3f);
0098       }
0099       if((chr & 0xf8) == 0xf0)
0100       {
0101         ret = ((chr & 0x0f) << 6) | ((*++ptr) & 0x3f);
0102         ret = (ret << 6) | ((*++ptr) & 0x3f);
0103         ret = (ret << 6) | ((*++ptr) & 0x3f);
0104       }
0105       ++ptr;
0106       if((ret & 0xfffe) != 0xfffe && (ret - 0xfdd0U) > 15)
0107         return ret;
0108       // ignore the error, jump back :-)
0109     }
0110   }
0111   bool atEnd(const QByteArray& str)
0112   {
0113     return ptr - reinterpret_cast<uchar const*>(str.data()) == str.size();
0114   }
0115 };
0116 
0117 int main()
0118 {
0119   QByteArray utf8("§utf8ärrayßようこそ中華民族𐀃bla"); // linear b character
0120   QUtf8Iterator utf8i(utf8);
0121   size_t s = 0;
0122   while(!utf8i.atEnd(utf8))
0123   {
0124     ++s;
0125     quint32 u = utf8i.next();
0126     std::cout << u << " " << QString(u).toUtf8().data() << std::endl;
0127   }
0128   std::cout << "utf8 size: " << utf8.size() << " real size: " << s << std::endl;
0129   QString utf16 = QString::fromUtf8("$utf16ärrayßようこそ中華民族𐀃bla");
0130   QUtf16Iterator utf16i(utf16);
0131   s = 0;
0132   while(!utf16i.atEnd(utf16))
0133   {
0134     ++s;
0135     quint32 u = utf16i.next();
0136     std::cout << u << " " << QString(u).toUtf8().data() << std::endl;
0137   }
0138   std::cout << "utf16 size: " << utf16.size() << " real size: " << s << std::endl;
0139 }