src/misc/kencodingdetector.cpp

0001 /*
0002     This file is part of the KDE libraries
0003
0004     Copyright (C) 1999 Lars Knoll (knoll@kde.org)
0005     Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
0006     Copyright (C) 2003 Apple Computer, Inc.
0007     Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
0008
0009     This library is free software; you can redistribute it and/or
0010     modify it under the terms of the GNU Library General Public
0011     License as published by the Free Software Foundation; either
0012     version 2 of the License, or (at your option) any later version.
0013
0014     This library is distributed in the hope that it will be useful,
0015     but WITHOUT ANY WARRANTY; without even the implied warranty of
0016     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0017     Library General Public License for more details.
0018
0019     You should have received a copy of the GNU Library General Public License
0020     along with this library; see the file COPYING.LIB.  If not, write to
0021     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0022     Boston, MA 02110-1301, USA.
0023 */
0024 //----------------------------------------------------------------------------
0025 //
0026 // decoder for input stream
0027
0028 #include "kencodingdetector.h"
0029
0030 #undef DECODE_DEBUG
0031 //#define DECODE_DEBUG
0032
0033 #define MAX_BUFFER 16*1024
0034
0035 #include <assert.h>
0036
0037 #include "guess_ja_p.h"
0038
0039 #include "khtml_debug.h"
0040 #include <QRegExp>
0041 #include <QTextCodec>
0042
0043 #include "kcharsets.h"
0044 #include <klocalizedstring.h>
0045
0046 #include <ctype.h>
0047
// MIB numbers of the character sets this file treats specially; they are
// compared against QTextCodec::mibEnum(). Listed in ascending numeric order.
enum MIB {
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
0057
0058 static bool is16Bit(QTextCodec *codec)
0059 {
0060     switch (codec->mibEnum()) {
0061     case MibUtf16:
0062     case MibUtf16BE:
0063     case MibUtf16LE:
0064     case MibUcs2:
0065         return true;
0066     default:
0067         return false;
0068     }
0069 }
0070
0071 class KEncodingDetectorPrivate
0072 {
0073 public:
0074     QTextCodec *m_codec;
0075     QTextDecoder *m_decoder; // utf16
0076     QTextCodec *m_defaultCodec;
0077     QByteArray  m_storeDecoderName;
0078
0079     KEncodingDetector::EncodingChoiceSource m_source;
0080     KEncodingDetector::AutoDetectScript m_autoDetectLanguage;
0081
0082     bool m_visualRTL : 1;
0083     bool m_seenBody : 1;
0084     bool m_writtingHappened : 1;
0085     bool m_analyzeCalled : 1; //for decode()
0086     int m_multiByte;
0087
0088     QByteArray m_bufferForDefferedEncDetection;
0089
0090     KEncodingDetectorPrivate()
0091         : m_codec(QTextCodec::codecForMib(MibLatin1))
0092         , m_decoder(m_codec->makeDecoder())
0093         , m_defaultCodec(m_codec)
0094         , m_source(KEncodingDetector::DefaultEncoding)
0095         , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection)
0096         , m_visualRTL(false)
0097         , m_seenBody(false)
0098         , m_writtingHappened(false)
0099         , m_analyzeCalled(false)
0100         , m_multiByte(0)
0101     {
0102     }
0103
0104     KEncodingDetectorPrivate(QTextCodec *codec, KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script)
0105         : m_codec(codec)
0106         , m_decoder(m_codec->makeDecoder())
0107         , m_defaultCodec(m_codec)
0108         , m_source(source)
0109         , m_autoDetectLanguage(script)
0110         , m_visualRTL(false)
0111         , m_seenBody(false)
0112         , m_writtingHappened(false)
0113         , m_analyzeCalled(false)
0114         , m_multiByte(0)
0115     {
0116     }
0117
0118     ~KEncodingDetectorPrivate()
0119     {
0120         delete m_decoder;
0121     }
0122
0123     // Returns true if the encoding was explicitly specified someplace.
0124     bool isExplicitlySpecifiedEncoding()
0125     {
0126         return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding;
0127     }
0128 };
0129
0130 static QByteArray automaticDetectionForArabic(const unsigned char *ptr, int size)
0131 {
0132     for (int i = 0; i < size; ++i) {
0133         if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
0134                 || (ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB) || (ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA)
0135                 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
0136                 || (ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF) || (ptr[ i ] >= 0xF3)) {
0137             return "cp1256";
0138         }
0139     }
0140
0141     return "iso-8859-6";
0142 }
0143
0144 static QByteArray automaticDetectionForBaltic(const unsigned char *ptr, int size)
0145 {
0146     for (int i = 0; i < size; ++i) {
0147         if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E)) {
0148             return "cp1257";
0149         }
0150
0151         if (ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5) {
0152             return "iso-8859-13";
0153         }
0154     }
0155
0156     return "iso-8859-13";
0157 }
0158
0159 static QByteArray automaticDetectionForCentralEuropean(const unsigned char *ptr, int size)
0160 {
0161     QByteArray charset = QByteArray();
0162     for (int i = 0; i < size; ++i) {
0163         if (ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) {
0164             if (ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98) {
0165                 return "ibm852";
0166             }
0167
0168             if (i + 1 > size) {
0169                 return "cp1250";
0170             } else { // maybe ibm852 ?
0171                 charset = "cp1250";
0172                 continue;
0173             }
0174         }
0175         if (ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0) {
0176             if (i + 1 > size) {
0177                 return "iso-8859-2";
0178             } else { // maybe ibm852 ?
0179                 if (charset.isNull()) {
0180                     charset = "iso-8859-2";
0181                 }
0182                 continue;
0183             }
0184         }
0185     }
0186
0187     if (charset.isNull()) {
0188         charset = "iso-8859-3";
0189     }
0190
0191     return charset.data();
0192 }
0193
0194 static QByteArray automaticDetectionForCyrillic(const unsigned char *ptr, int size)
0195 {
0196 #ifdef DECODE_DEBUG
0197     qCWarning(KHTML_LOG) << "KEncodingDetector: Cyr heuristics";
0198 #endif
0199
0200 //     if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
0201 //         return "utf8";
0202     int utf8_mark = 0;
0203     int koi_score = 0;
0204     int cp1251_score = 0;
0205
0206     int koi_st = 0;
0207     int cp1251_st = 0;
0208
0209 //     int koi_na=0;
0210 //     int cp1251_na=0;
0211
0212     int koi_o_capital = 0;
0213     int koi_o = 0;
0214     int cp1251_o_capital = 0;
0215     int cp1251_o = 0;
0216
0217     int koi_a_capital = 0;
0218     int koi_a = 0;
0219     int cp1251_a_capital = 0;
0220     int cp1251_a = 0;
0221
0222     int koi_s_capital = 0;
0223     int koi_s = 0;
0224     int cp1251_s_capital = 0;
0225     int cp1251_s = 0;
0226
0227     int koi_i_capital = 0;
0228     int koi_i = 0;
0229     int cp1251_i_capital = 0;
0230     int cp1251_i = 0;
0231
0232     int cp1251_small_range = 0;
0233     int koi_small_range = 0;
0234     int ibm866_small_range = 0;
0235
0236     int i;
0237     for (i = 1; (i < size) && (cp1251_small_range + koi_small_range < 1000); ++i) {
0238         if (ptr[i] > 0xdf) {
0239             ++cp1251_small_range;
0240
0241             if (ptr[i] == 0xee) { //small o
0242                 ++cp1251_o;
0243             } else if (ptr[i] == 0xe0) { //small a
0244                 ++cp1251_a;
0245             } else if (ptr[i] == 0xe8) { //small i
0246                 ++cp1251_i;
0247             } else if (ptr[i] == 0xf1) { //small s
0248                 ++cp1251_s;
0249             } else if (ptr[i] == 0xf2 && ptr[i - 1] == 0xf1) { //small st
0250                 ++cp1251_st;
0251             }
0252
0253             else if (ptr[i] == 0xef) {
0254                 ++koi_o_capital;
0255             } else if (ptr[i] == 0xe1) {
0256                 ++koi_a_capital;
0257             } else if (ptr[i] == 0xe9) {
0258                 ++koi_i_capital;
0259             } else if (ptr[i] == 0xf3) {
0260                 ++koi_s_capital;
0261             }
0262
0263         } else if (ptr[i] > 0xbf) {
0264             ++koi_small_range;
0265
0266             if (ptr[i] == 0xd0 || ptr[i] == 0xd1) { //small o
0267                 ++utf8_mark;
0268             } else if (ptr[i] == 0xcf) { //small o
0269                 ++koi_o;
0270             } else if (ptr[i] == 0xc1) { //small a
0271                 ++koi_a;
0272             } else if (ptr[i] == 0xc9) { //small i
0273                 ++koi_i;
0274             } else if (ptr[i] == 0xd3) { //small s
0275                 ++koi_s;
0276             } else if (ptr[i] == 0xd4 && ptr[i - 1] == 0xd3) { //small st
0277                 ++koi_st;
0278             }
0279
0280             else if (ptr[i] == 0xce) {
0281                 ++cp1251_o_capital;
0282             } else if (ptr[i] == 0xc0) {
0283                 ++cp1251_a_capital;
0284             } else if (ptr[i] == 0xc8) {
0285                 ++cp1251_i_capital;
0286             } else if (ptr[i] == 0xd1) {
0287                 ++cp1251_s_capital;
0288             }
0289         } else if (ptr[i] > 0x9f && ptr[i] < 0xb0) { //first 16 letterz is 60%
0290             ++ibm866_small_range;
0291         }
0292
0293     }
0294
0295     //cannot decide?
0296     if (cp1251_small_range + koi_small_range + ibm866_small_range < 8) {
0297         return "";
0298     }
0299
0300     if (3 * utf8_mark > cp1251_small_range + koi_small_range + ibm866_small_range) {
0301 #ifdef DECODE_DEBUG
0302         qCWarning(KHTML_LOG) << "Cyr Enc Detection: UTF8";
0303 #endif
0304         return "UTF-8";
0305     }
0306
0307     if (ibm866_small_range > cp1251_small_range + koi_small_range) {
0308         return "ibm866";
0309     }
0310
0311 //     QByteArray koi_string = "koi8-u";
0312 //     QByteArray cp1251_string = "cp1251";
0313
0314     if (cp1251_st == 0 && koi_st > 1) {
0315         koi_score += 10;
0316     } else if (koi_st == 0 && cp1251_st > 1) {
0317         cp1251_score += 10;
0318     }
0319
0320     if (cp1251_st && koi_st) {
0321         if (cp1251_st / koi_st > 2) {
0322             cp1251_score += 20;
0323         } else if (koi_st / cp1251_st > 2) {
0324             koi_score += 20;
0325         }
0326     }
0327
0328     if (cp1251_a > koi_a) {
0329         cp1251_score += 10;
0330     } else if (cp1251_a || koi_a) {
0331         koi_score += 10;
0332     }
0333
0334     if (cp1251_o > koi_o) {
0335         cp1251_score += 10;
0336     } else if (cp1251_o || koi_o) {
0337         koi_score += 10;
0338     }
0339
0340     if (cp1251_i > koi_i) {
0341         cp1251_score += 10;
0342     } else if (cp1251_i || koi_i) {
0343         koi_score += 10;
0344     }
0345
0346     if (cp1251_s > koi_s) {
0347         cp1251_score += 10;
0348     } else if (cp1251_s || koi_s) {
0349         koi_score += 10;
0350     }
0351
0352     if (cp1251_a_capital > koi_a_capital) {
0353         cp1251_score += 9;
0354     } else if (cp1251_a_capital || koi_a_capital) {
0355         koi_score += 9;
0356     }
0357
0358     if (cp1251_o_capital > koi_o_capital) {
0359         cp1251_score += 9;
0360     } else if (cp1251_o_capital || koi_o_capital) {
0361         koi_score += 9;
0362     }
0363
0364     if (cp1251_i_capital > koi_i_capital) {
0365         cp1251_score += 9;
0366     } else if (cp1251_i_capital || koi_i_capital) {
0367         koi_score += 9;
0368     }
0369
0370     if (cp1251_s_capital > koi_s_capital) {
0371         cp1251_score += 9;
0372     } else if (cp1251_s_capital || koi_s_capital) {
0373         koi_score += 9;
0374     }
0375 #ifdef DECODE_DEBUG
0376     qCWarning(KHTML_LOG) << "koi_score " << koi_score << " cp1251_score " << cp1251_score;
0377 #endif
0378     if (abs(koi_score - cp1251_score) < 10) {
0379         //fallback...
0380         cp1251_score = cp1251_small_range;
0381         koi_score = koi_small_range;
0382     }
0383     if (cp1251_score > koi_score) {
0384         return "cp1251";
0385     } else {
0386         return "koi8-u";
0387     }
0388
0389 //     if (cp1251_score>koi_score)
0390 //         setEncoding("cp1251",AutoDetectedEncoding);
0391 //     else
0392 //         setEncoding("koi8-u",AutoDetectedEncoding);
0393 //     return true;
0394
0395 }
0396
0397 static QByteArray automaticDetectionForGreek(const unsigned char *ptr, int size)
0398 {
0399     for (int i = 0; i < size; ++i) {
0400         if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
0401                 || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
0402                 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE) {
0403             return "cp1253";
0404         }
0405     }
0406
0407     return "iso-8859-7";
0408 }
0409
0410 static QByteArray automaticDetectionForHebrew(const unsigned char *ptr, int size)
0411 {
0412     for (int i = 0; i < size; ++i) {
0413         if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89) || ptr[ i ] == 0x8B
0414                 || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || (ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9)
0415                 || (ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8)) {
0416             return "cp1255";
0417         }
0418
0419         if (ptr[ i ] == 0xDF) {
0420             return "iso-8859-8-i";
0421         }
0422     }
0423
0424     return "iso-8859-8-i";
0425 }
0426
0427 static QByteArray automaticDetectionForJapanese(const unsigned char *ptr, int size)
0428 {
0429     JapaneseCode kc;
0430
0431     switch (kc.guess_jp((const char *)ptr, size)) {
0432     case JapaneseCode::JIS:
0433         return "jis7";
0434     case JapaneseCode::EUC:
0435         return "eucjp";
0436     case JapaneseCode::SJIS:
0437         return "sjis";
0438     case JapaneseCode::UTF8:
0439         return "utf8";
0440     default:
0441         break;
0442     }
0443
0444     return "";
0445 }
0446
0447 static QByteArray automaticDetectionForTurkish(const unsigned char *ptr, int size)
0448 {
0449     for (int i = 0; i < size; ++i) {
0450         if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C) || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C) || ptr[ i ] == 0x9F) {
0451             return "cp1254";
0452         }
0453     }
0454
0455     return "iso-8859-9";
0456 }
0457
0458 static QByteArray automaticDetectionForWesternEuropean(const unsigned char *ptr, int size)
0459 {
0460     --size;
0461     uint nonansi_count = 0;
0462     for (int i = 0; i < size; ++i) {
0463         if (ptr[i] > 0x79) {
0464             ++nonansi_count;
0465             if (ptr[i] > 0xc1 && ptr[i] < 0xf0 && ptr[i + 1] > 0x7f && ptr[i + 1] < 0xc0) {
0466                 return "UTF-8";
0467             }
0468             if (ptr[i] >= 0x78 && ptr[i] <= 0x9F) {
0469                 return "cp1252";
0470             }
0471         }
0472
0473     }
0474
0475     if (nonansi_count > 0) {
0476         return "iso-8859-15";
0477     }
0478
0479     return "";
0480 }
0481
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// On entry @p ptr points just behind the opening "<!--"; @p pEnd is one past
// the last readable byte. On return @p ptr points just behind the end of the
// comment, or equals @p pEnd when the comment is not terminated within the
// buffer. Accepted terminators are "-->" and the incorrect "--!>"; a '>'
// directly at @p ptr (i.e. "<!-->") also ends the comment.
//
// No byte at or beyond @p pEnd is read and @p ptr is never moved past
// @p pEnd. If @p ptr is already at (or beyond) @p pEnd it is left untouched.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;
    if (p >= pEnd) {
        return;
    }

    // Allow <!-->; other browsers do.
    if (*p == '>') {
        ++p;
    } else {
        while (p != pEnd) {
            if (*p == '-') {
                const long remaining = pEnd - p; // readable bytes starting at p
                // This is the real end of comment, "-->".
                if (remaining >= 3 && p[1] == '-' && p[2] == '>') {
                    p += 3;
                    break;
                }
                // This is the incorrect end of comment that other browsers allow, "--!>".
                if (remaining >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>') {
                    p += 4;
                    break;
                }
            }
            ++p;
        }
    }
    ptr = p;
}
0509
0510 // Returns the position of the encoding string.
0511 static int findXMLEncoding(const QByteArray &str, int &encodingLength)
0512 {
0513     int len = str.length();
0514     int pos = str.indexOf("encoding");
0515     if (pos == -1) {
0516         return -1;
0517     }
0518     pos += 8;
0519
0520     // Skip spaces and stray control characters.
0521     while (pos < len && str[pos] <= ' ') {
0522         ++pos;
0523     }
0524
0525     //Bail out if nothing after
0526     // Skip equals sign.
0527     if (pos >= len || str[pos] != '=') {
0528         return -1;
0529     }
0530     ++pos;
0531
0532     // Skip spaces and stray control characters.
0533     while (pos < len && str[pos] <= ' ') {
0534         ++pos;
0535     }
0536
0537     //Bail out if nothing after
0538     if (pos >= len) {
0539         return -1;
0540     }
0541
0542     // Skip quotation mark.
0543     char quoteMark = str[pos];
0544     if (quoteMark != '"' && quoteMark != '\'') {
0545         return -1;
0546     }
0547     ++pos;
0548
0549     // Find the trailing quotation mark.
0550     int end = pos;
0551     while (end < len && str[end] != quoteMark) {
0552         ++end;
0553     }
0554
0555     if (end >= len) {
0556         return -1;
0557     }
0558
0559     encodingLength = end - pos;
0560     return pos;
0561 }
0562
0563 bool KEncodingDetector::processNull(char *data, int len)
0564 {
0565     bool bin = false;
0566     if (is16Bit(d->m_codec)) {
0567         for (int i = 1; i < len; i += 2) {
0568             if ((data[i] == '\0') && (data[i - 1] == '\0')) {
0569                 bin = true;
0570                 data[i] = ' ';
0571             }
0572         }
0573         return bin;
0574     }
0575     // replace '\0' by spaces, for buggy pages
0576     int i = len - 1;
0577     while (--i >= 0) {
0578         if (data[i] == 0) {
0579             bin = true;
0580             data[i] = ' ';
0581         }
0582     }
0583     return bin;
0584 }
0585
0586 bool KEncodingDetector::errorsIfUtf8(const char *data, int length)
0587 {
0588     if (d->m_codec->mibEnum() != MibUtf8) {
0589         return false;    //means no errors
0590     }
0591 // #define highest1Bits (unsigned char)0x80
0592 // #define highest2Bits (unsigned char)0xC0
0593 // #define highest3Bits (unsigned char)0xE0
0594 // #define highest4Bits (unsigned char)0xF0
0595 // #define highest5Bits (unsigned char)0xF8
0596     static const unsigned char highest1Bits = 0x80;
0597     static const unsigned char highest2Bits = 0xC0;
0598     static const unsigned char highest3Bits = 0xE0;
0599     static const unsigned char highest4Bits = 0xF0;
0600     static const unsigned char highest5Bits = 0xF8;
0601
0602     for (int i = 0; i < length; ++i) {
0603         unsigned char c = data[i];
0604
0605         if (d->m_multiByte > 0) {
0606             if ((c & highest2Bits) == 0x80) {
0607                 --(d->m_multiByte);
0608                 continue;
0609             }
0610 #ifdef DECODE_DEBUG
0611             qCWarning(KHTML_LOG) << "EncDetector: Broken UTF8";
0612 #endif
0613             return true;
0614         }
0615
0616         // most significant bit zero, single char
0617         if ((c & highest1Bits) == 0x00) {
0618             continue;
0619         }
0620
0621         // 110xxxxx => init 1 following bytes
0622         if ((c & highest3Bits) == 0xC0) {
0623             d->m_multiByte = 1;
0624             continue;
0625         }
0626
0627         // 1110xxxx => init 2 following bytes
0628         if ((c & highest4Bits) == 0xE0) {
0629             d->m_multiByte = 2;
0630             continue;
0631         }
0632
0633         // 11110xxx => init 3 following bytes
0634         if ((c & highest5Bits) == 0xF0) {
0635             d->m_multiByte = 3;
0636             continue;
0637         }
0638 #ifdef DECODE_DEBUG
0639         qCWarning(KHTML_LOG) << "EncDetector:_Broken UTF8";
0640 #endif
0641         return true;
0642     }
0643     return false;
0644 }
0645
0646 KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate)
0647 {
0648 }
0649
0650 KEncodingDetector::KEncodingDetector(QTextCodec *codec, EncodingChoiceSource source, AutoDetectScript script) :
0651     d(new KEncodingDetectorPrivate(codec, source, script))
0652 {
0653 }
0654
0655 KEncodingDetector::~KEncodingDetector()
0656 {
0657     delete d;
0658 }
0659
0660 void KEncodingDetector::setAutoDetectLanguage(KEncodingDetector::AutoDetectScript lang)
0661 {
0662     d->m_autoDetectLanguage = lang;
0663 }
0664 KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const
0665 {
0666     return d->m_autoDetectLanguage;
0667 }
0668
0669 KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const
0670 {
0671     return d->m_source;
0672 }
0673
0674 const char *KEncodingDetector::encoding() const
0675 {
0676     d->m_storeDecoderName = d->m_codec->name();
0677     return d->m_storeDecoderName.constData();
0678 }
0679
0680 bool KEncodingDetector::visuallyOrdered() const
0681 {
0682     return d->m_visualRTL;
0683 }
0684
0685 // const QTextCodec* KEncodingDetector::codec() const
0686 // {
0687 //     return d->m_codec;
0688 // }
0689
0690 QTextDecoder *KEncodingDetector::decoder()
0691 {
0692     return d->m_decoder;
0693 }
0694
0695 void KEncodingDetector::resetDecoder()
0696 {
0697     assert(d->m_defaultCodec);
0698     d->m_bufferForDefferedEncDetection.clear();
0699     d->m_writtingHappened = false;
0700     d->m_analyzeCalled = false;
0701     d->m_multiByte = 0;
0702     delete d->m_decoder;
0703     if (!d->m_codec) {
0704         d->m_codec = d->m_defaultCodec;
0705     }
0706     d->m_decoder = d->m_codec->makeDecoder();
0707 }
0708
0709 bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
0710 {
0711     QTextCodec *codec;
0712     QByteArray enc(_encoding);
0713     if (/*enc.isNull() || */enc.isEmpty()) {
0714         if (type == DefaultEncoding) {
0715             codec = d->m_defaultCodec;
0716         } else {
0717             return false;
0718         }
0719     } else {
0720         //QString->QTextCodec
0721
0722         enc = enc.toLower();
0723         // hebrew visually ordered
0724         if (enc == "visual") {
0725             enc = "iso8859-8";
0726         }
0727         bool b;
0728         codec = KCharsets::charsets()->codecForName(QLatin1String(enc.data()), b);
0729         if (!b) {
0730             return false;
0731         }
0732     }
0733
0734     if (d->m_codec->mibEnum() == codec->mibEnum()) {
0735         // We already have the codec, but we still want to re-set the type,
0736         // as we may have overwritten a default with a detected
0737         d->m_source = type;
0738         return true;
0739     }
0740
0741     if ((type == EncodingFromMetaTag || type == EncodingFromXMLHeader) && is16Bit(codec)) {
0742         //Sometimes the codec specified is absurd, i.e. UTF-16 despite
0743         //us decoding a meta tag as ASCII. In that case, ignore it.
0744         return false;
0745     }
0746
0747     if (codec->mibEnum() == Mib8859_8) {
0748         //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
0749         codec = QTextCodec::codecForName("iso8859-8-i");
0750
0751         // visually ordered unless one of the following
0752         if (!(enc == "iso-8859-8-i" || enc == "iso_8859-8-i" || enc == "csiso88598i" || enc == "logical")) {
0753             d->m_visualRTL = true;
0754         }
0755     }
0756
0757     d->m_codec = codec;
0758     d->m_source = type;
0759     delete d->m_decoder;
0760     d->m_decoder = d->m_codec->makeDecoder();
0761 #ifdef DECODE_DEBUG
0762     qCDebug(KHTML_LOG) << "KEncodingDetector::encoding used is" << d->m_codec->name();
0763 #endif
0764     return true;
0765 }
0766
0767 QString KEncodingDetector::decode(const char *data, int len)
0768 {
0769     processNull(const_cast<char *>(data), len);
0770     if (!d->m_analyzeCalled) {
0771         analyze(data, len);
0772         d->m_analyzeCalled = true;
0773     }
0774
0775     return d->m_decoder->toUnicode(data, len);
0776 }
0777
0778 QString KEncodingDetector::decode(const QByteArray &data)
0779 {
0780     processNull(const_cast<char *>(data.data()), data.size());
0781     if (!d->m_analyzeCalled) {
0782         analyze(data.data(), data.size());
0783         d->m_analyzeCalled = true;
0784     }
0785
0786     return d->m_decoder->toUnicode(data);
0787 }
0788
0789 QString KEncodingDetector::decodeWithBuffering(const char *data, int len)
0790 {
0791 #ifdef DECODE_DEBUG
0792     qCWarning(KHTML_LOG) << "KEncodingDetector: decoding " << len << " bytes";
0793 #endif
0794     if (d->m_writtingHappened) {
0795 #ifdef DECODE_DEBUG
0796         qCWarning(KHTML_LOG) << "KEncodingDetector: d->m_writtingHappened " << d->m_codec->name();
0797 #endif
0798         processNull(const_cast<char *>(data), len);
0799         return d->m_decoder->toUnicode(data, len);
0800     } else {
0801         if (d->m_bufferForDefferedEncDetection.isEmpty()) {
0802             // If encoding detection produced something, and we either got to the body or
0803             // actually saw the encoding explicitly, we're done.
0804             if (analyze(data, len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) {
0805 #ifdef DECODE_DEBUG
0806                 qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened first time " << d->m_codec->name();
0807 #endif
0808                 processNull(const_cast<char *>(data), len);
0809                 d->m_writtingHappened = true;
0810                 return d->m_decoder->toUnicode(data, len);
0811             } else {
0812 #ifdef DECODE_DEBUG
0813                 qCWarning(KHTML_LOG) << "KEncodingDetector: begin deffer";
0814 #endif
0815                 d->m_bufferForDefferedEncDetection = data;
0816             }
0817         } else {
0818             d->m_bufferForDefferedEncDetection += data;
0819             // As above, but also limit the buffer size. We must use the entire buffer here,
0820             // since the boundaries might split the meta tag, etc.
0821             bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
0822             if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
0823                     d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER) {
0824                 d->m_writtingHappened = true;
0825                 d->m_bufferForDefferedEncDetection.replace('\0', ' ');
0826                 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
0827                 d->m_bufferForDefferedEncDetection.clear();
0828 #ifdef DECODE_DEBUG
0829                 qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
0830 #endif
0831                 return result;
0832             }
0833         }
0834     }
0835
0836     return QString();
0837 }
0838
0839 bool KEncodingDetector::decodedInvalidCharacters() const
0840 {
0841     return d->m_decoder ? d->m_decoder->hasFailure() : false;
0842 }
0843
0844 QString KEncodingDetector::flush()
0845 {
0846     if (d->m_bufferForDefferedEncDetection.isEmpty()) {
0847         return QString();
0848     }
0849
0850     d->m_bufferForDefferedEncDetection.replace('\0', ' ');
0851     QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
0852     d->m_bufferForDefferedEncDetection.clear();
0853 #ifdef DECODE_DEBUG
0854     qCWarning(KHTML_LOG) << "KEncodingDetector:flush() " << d->m_bufferForDefferedEncDetection.length() << " bytes " << d->m_codec->name();
0855 #endif
0856     return result;
0857 }
0858
0859 bool KEncodingDetector::analyze(const char *data, int len)
0860 {
0861     // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
0862     // maximumBOMLength = 10
0863     // Even if the user has chosen utf16 we still need to auto-detect the endianness
0864     if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) {
0865         // Extract the first three bytes.
0866         const uchar *udata = (const uchar *)data;
0867         uchar c1 = *udata++;
0868         uchar c2 = *udata++;
0869         uchar c3 = *udata++;
0870
0871         // Check for the BOM
0872         const char *autoDetectedEncoding;
0873         if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
0874             autoDetectedEncoding = "UTF-16";
0875         } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
0876             autoDetectedEncoding = "UTF-8";
0877         } else if (c1 == 0x00 || c2 == 0x00) {
0878             uchar c4 = *udata++;
0879             uchar c5 = *udata++;
0880             uchar c6 = *udata++;
0881             uchar c7 = *udata++;
0882             uchar c8 = *udata++;
0883             uchar c9 = *udata++;
0884             uchar c10 = *udata++;
0885
0886             int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
0887             int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
0888             if ((nul_count_even == 0 && nul_count_odd == 5) || (nul_count_even == 5 && nul_count_odd == 0)) {
0889                 autoDetectedEncoding = "UTF-16";
0890             } else {
0891                 autoDetectedEncoding = nullptr;
0892             }
0893         } else {
0894             autoDetectedEncoding = nullptr;
0895         }
0896
0897         // If we found a BOM, use the encoding it implies.
0898         if (autoDetectedEncoding != nullptr) {
0899             d->m_source = BOM;
0900             d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
0901             assert(d->m_codec);
0902             //enc = d->m_codec->name();
0903             delete d->m_decoder;
0904             d->m_decoder = d->m_codec->makeDecoder();
0905 #ifdef DECODE_DEBUG
0906             qCWarning(KHTML_LOG) << "Detection by BOM";
0907 #endif
0908             if (is16Bit(d->m_codec) && c2 == 0x00) {
0909                 // utf16LE, we need to put the decoder in LE mode
0910                 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
0911                 d->m_decoder->toUnicode(reverseUtf16, 2);
0912             }
0913             return true;
0914         }
0915     }
0916
0917     //exit from routine in case it was called to only detect byte order for utf-16
0918     if (d->m_source == UserChosenEncoding) {
0919 #ifdef DECODE_DEBUG
0920         qCWarning(KHTML_LOG) << "KEncodingDetector: UserChosenEncoding exit ";
0921 #endif
0922
0923         if (errorsIfUtf8(data, len)) {
0924             setEncoding("", DefaultEncoding);
0925         }
0926         return true;
0927     }
0928
0929     // HTTP header takes precedence over meta-type stuff
0930     if (d->m_source == EncodingFromHTTPHeader) {
0931         return true;
0932     }
0933
0934     if (!d->m_seenBody) {
0935         // we still don't have an encoding, and are in the head
0936         // the following tags are allowed in <head>:
0937         // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
0938         const char *ptr = data;
0939         const char *pEnd = data + len;
0940
0941         while (ptr != pEnd) {
0942             if (*ptr != '<') {
0943                 ++ptr;
0944                 continue;
0945             }
0946             ++ptr;
0947             // Handle comments.
0948             if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
0949                 ptr += 3;
0950                 skipComment(ptr, pEnd);
0951                 continue;
0952             }
0953
0954             // Handle XML header, which can have encoding in it.
0955             if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
0956                 const char *end = ptr;
0957                 while (*end != '>' && end < pEnd) {
0958                     end++;
0959                 }
0960                 if (*end == '\0' || end == pEnd) {
0961                     break;
0962                 }
0963                 QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator
0964                 int length;
0965                 int pos = findXMLEncoding(str, length);
0966                 // also handles the case when specified encoding aint correct
0967                 if (pos != -1 && setEncoding(str.mid(pos, length).data(), EncodingFromXMLHeader)) {
0968                     return true;
0969                 }
0970             }
0971
0972             //look for <meta>, stop if we reach <body>
0973             while (
0974                 !(((*ptr >= 'a') && (*ptr <= 'z')) ||
0975                   ((*ptr >= 'A') && (*ptr <= 'Z')))
0976                 && ptr < pEnd
0977             ) {
0978                 ++ptr;
0979             }
0980
0981             char tmp[5];
0982             int length = 0;
0983             const char *max = ptr + 4;
0984             if (pEnd < max) {
0985                 max = pEnd;
0986             }
0987             while (
0988                 (((*ptr >= 'a') && (*ptr <= 'z')) ||
0989                  ((*ptr >= 'A') && (*ptr <= 'Z')) ||
0990                  ((*ptr >= '0') && (*ptr <= '9')))
0991                 && ptr < max
0992             ) {
0993                 tmp[length] = tolower(*ptr);
0994                 ++ptr;
0995                 ++length;
0996             }
0997             tmp[length] = 0;
0998             if (tmp[0] == 'm' && tmp[1] == 'e' && tmp[2] == 't' && tmp[3] == 'a') {
0999                 // found a meta tag...
1000                 const char *end = ptr;
1001                 while (*end != '>' && *end != '\0' && end < pEnd) {
1002                     end++;
1003                 }
1004                 //if ( *end == '\0' ) break;
1005                 const QByteArray str = QByteArray(ptr, (end - ptr) + 1).toLower();
1006                 const int strLength = str.length();
1007                 int pos = 0;
1008                 //if( (pos = str.find("http-equiv", pos)) == -1) break;
1009                 //if( (pos = str.find("content-type", pos)) == -1) break;
1010                 if ((pos = str.indexOf("charset")) == -1) {
1011                     continue;
1012                 }
1013                 pos += 6;
1014                 // skip to '='
1015                 if ((pos = str.indexOf("=", pos)) == -1) {
1016                     continue;
1017                 }
1018
1019                 // skip '='
1020                 ++pos;
1021
1022                 // skip whitespace before encoding itself
1023                 while (pos < strLength && str[pos] <= ' ') {
1024                     ++pos;
1025                 }
1026
1027                 // there may also be an opening quote, if this is a charset= and not a http-equiv.
1028                 if (pos < strLength && (str[pos] == '"' || str[pos] == '\'')) {
1029                     ++pos;
1030                 }
1031
1032                 // skip whitespace
1033                 while (pos < strLength && str[pos] <= ' ') {
1034                     ++pos;
1035                 }
1036
1037                 if (pos == strLength) {
1038                     continue;
1039                 }
1040
1041                 int endpos = pos;
1042                 while (endpos < strLength &&
1043                         (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
1044                          && str[endpos] != ';' && str[endpos] != '>')) {
1045                     ++endpos;
1046                 }
1047 #ifdef DECODE_DEBUG
1048                 qCDebug(KHTML_LOG) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos, endpos - pos).data();
1049 #endif
1050                 if (setEncoding(str.mid(pos, endpos - pos).data(), EncodingFromMetaTag)) {
1051                     return true;
1052                 }
1053             } else if (tmp[0] == 'b' && tmp[1] == 'o' && tmp[2] == 'd' && tmp[3] == 'y') {
1054                 d->m_seenBody = true;
1055                 break;
1056             }
1057         }
1058     }
1059
1060     if (len < 20) {
1061         return false;
1062     }
1063
1064 #ifdef DECODE_DEBUG
1065     qCDebug(KHTML_LOG) << "KEncodingDetector: using heuristics (" << strlen(data) << ")";
1066 #endif
1067
1068     switch (d->m_autoDetectLanguage) {
1069     case KEncodingDetector::Arabic:
1070         return setEncoding(automaticDetectionForArabic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1071 //             break;
1072     case KEncodingDetector::Baltic:
1073         return setEncoding(automaticDetectionForBaltic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1074 //             break;
1075     case KEncodingDetector::CentralEuropean:
1076         return setEncoding(automaticDetectionForCentralEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1077 //            break;
1078     case KEncodingDetector::Cyrillic:
1079         return setEncoding(automaticDetectionForCyrillic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1080 //             break;
1081     case KEncodingDetector::Greek:
1082         return setEncoding(automaticDetectionForGreek((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1083 //             break;
1084     case KEncodingDetector::Hebrew:
1085         return setEncoding(automaticDetectionForHebrew((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1086 //             break;
1087     case KEncodingDetector::Japanese:
1088         return setEncoding(automaticDetectionForJapanese((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1089 //             break;
1090     case KEncodingDetector::Turkish:
1091         return setEncoding(automaticDetectionForTurkish((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1092 //             break;
1093     case KEncodingDetector::WesternEuropean:
1094         if (setEncoding(automaticDetectionForWesternEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding)) {
1095             return true;
1096         } else if (d->m_defaultCodec->mibEnum() == MibLatin1) { //detection for khtml
1097             return setEncoding("iso-8859-15", AutoDetectedEncoding);
1098         } else { //use default provided by eg katepart
1099             return setEncoding("", DefaultEncoding);
1100         }
1101 //             break;
1102     case KEncodingDetector::SemiautomaticDetection:
1103     case KEncodingDetector::ChineseSimplified:
1104     case KEncodingDetector::ChineseTraditional:
1105     case KEncodingDetector::Korean:
1106     case KEncodingDetector::Thai:
1107     case KEncodingDetector::Unicode:
1108     case KEncodingDetector::NorthernSaami:
1109     case KEncodingDetector::SouthEasternEurope:
1110     case KEncodingDetector::None:
1111         // huh. somethings broken in this code ### FIXME
1112         //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
1113         break;
1114     }
1115
1116     return true;
1117 }
1118
1119 KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString &lang)
1120 {
1121     if (lang.isEmpty()) {
1122         return KEncodingDetector::None;
1123     } else if (lang == i18nc("@item Text character set", "Unicode")) {
1124         return KEncodingDetector::Unicode;
1125     } else if (lang == i18nc("@item Text character set", "Cyrillic")) {
1126         return KEncodingDetector::Cyrillic;
1127     } else if (lang == i18nc("@item Text character set", "Western European")) {
1128         return KEncodingDetector::WesternEuropean;
1129     } else if (lang == i18nc("@item Text character set", "Central European")) {
1130         return KEncodingDetector::CentralEuropean;
1131     } else if (lang == i18nc("@item Text character set", "Greek")) {
1132         return KEncodingDetector::Greek;
1133     } else if (lang == i18nc("@item Text character set", "Hebrew")) {
1134         return KEncodingDetector::Hebrew;
1135     } else if (lang == i18nc("@item Text character set", "Turkish")) {
1136         return KEncodingDetector::Turkish;
1137     } else if (lang == i18nc("@item Text character set", "Japanese")) {
1138         return KEncodingDetector::Japanese;
1139     } else if (lang == i18nc("@item Text character set", "Baltic")) {
1140         return KEncodingDetector::Baltic;
1141     } else if (lang == i18nc("@item Text character set", "Arabic")) {
1142         return KEncodingDetector::Arabic;
1143     }
1144
1145     return KEncodingDetector::None;
1146 }
1147
1148 bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)
1149 {
1150     switch (script) {
1151     case KEncodingDetector::Arabic:
1152         return true;
1153     case KEncodingDetector::Baltic:
1154         return true;
1155     case KEncodingDetector::CentralEuropean:
1156         return true;
1157     case KEncodingDetector::Cyrillic:
1158         return true;
1159     case KEncodingDetector::Greek:
1160         return true;
1161     case KEncodingDetector::Hebrew:
1162         return true;
1163     case KEncodingDetector::Japanese:
1164         return true;
1165     case KEncodingDetector::Turkish:
1166         return true;
1167     case KEncodingDetector::WesternEuropean:
1168         return true;
1169     case KEncodingDetector::ChineseTraditional:
1170         return true;
1171     case KEncodingDetector::ChineseSimplified:
1172         return true;
1173     case KEncodingDetector::Unicode:
1174         return true;
1175         break;
1176     default:
1177         return false;
1178     }
1179 }
1180
1181 QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script)
1182 {
1183     switch (script) {
1184     case KEncodingDetector::Arabic:
1185         return i18nc("@item Text character set", "Arabic");
1186         break;
1187     case KEncodingDetector::Baltic:
1188         return i18nc("@item Text character set", "Baltic");
1189         break;
1190     case KEncodingDetector::CentralEuropean:
1191         return i18nc("@item Text character set", "Central European");
1192         break;
1193     case KEncodingDetector::Cyrillic:
1194         return i18nc("@item Text character set", "Cyrillic");
1195         break;
1196     case KEncodingDetector::Greek:
1197         return i18nc("@item Text character set", "Greek");
1198         break;
1199     case KEncodingDetector::Hebrew:
1200         return i18nc("@item Text character set", "Hebrew");
1201         break;
1202     case KEncodingDetector::Japanese:
1203         return i18nc("@item Text character set", "Japanese");
1204         break;
1205     case KEncodingDetector::Turkish:
1206         return i18nc("@item Text character set", "Turkish");
1207         break;
1208     case KEncodingDetector::WesternEuropean:
1209         return i18nc("@item Text character set", "Western European");
1210         break;
1211     case KEncodingDetector::ChineseTraditional:
1212         return i18nc("@item Text character set", "Chinese Traditional");
1213         break;
1214     case KEncodingDetector::ChineseSimplified:
1215         return i18nc("@item Text character set", "Chinese Simplified");
1216         break;
1217     case KEncodingDetector::Korean:
1218         return i18nc("@item Text character set", "Korean");
1219         break;
1220     case KEncodingDetector::Thai:
1221         return i18nc("@item Text character set", "Thai");
1222         break;
1223     case KEncodingDetector::Unicode:
1224         return i18nc("@item Text character set", "Unicode");
1225         break;
1226     //case KEncodingDetector::SemiautomaticDetection:
1227     default:
1228         return QString();
1229
1230     }
1231 }
1232
1233 #undef DECODE_DEBUG
1234