File indexing completed on 2024-12-01 12:33:54
0001 /* 0002 This file is part of the KDE libraries 0003 0004 Copyright (C) 1999 Lars Knoll (knoll@kde.org) 0005 Copyright (C) 2003 Dirk Mueller (mueller@kde.org) 0006 Copyright (C) 2003 Apple Computer, Inc. 0007 Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net) 0008 0009 This library is free software; you can redistribute it and/or 0010 modify it under the terms of the GNU Library General Public 0011 License as published by the Free Software Foundation; either 0012 version 2 of the License, or (at your option) any later version. 0013 0014 This library is distributed in the hope that it will be useful, 0015 but WITHOUT ANY WARRANTY; without even the implied warranty of 0016 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0017 Library General Public License for more details. 0018 0019 You should have received a copy of the GNU Library General Public License 0020 along with this library; see the file COPYING.LIB. If not, write to 0021 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0022 Boston, MA 02110-1301, USA. 0023 */ 0024 //---------------------------------------------------------------------------- 0025 // 0026 // decoder for input stream 0027 0028 #include "kencodingdetector.h" 0029 0030 #undef DECODE_DEBUG 0031 //#define DECODE_DEBUG 0032 0033 #define MAX_BUFFER 16*1024 0034 0035 #include <assert.h> 0036 0037 #include "guess_ja_p.h" 0038 0039 #include "khtml_debug.h" 0040 #include <QRegExp> 0041 #include <QTextCodec> 0042 0043 #include "kcharsets.h" 0044 #include <klocalizedstring.h> 0045 0046 #include <ctype.h> 0047 0048 enum MIB { 0049 MibLatin1 = 4, 0050 Mib8859_8 = 85, 0051 MibUtf8 = 106, 0052 MibUcs2 = 1000, 0053 MibUtf16 = 1015, 0054 MibUtf16BE = 1013, 0055 MibUtf16LE = 1014 0056 }; 0057 0058 static bool is16Bit(QTextCodec *codec) 0059 { 0060 switch (codec->mibEnum()) { 0061 case MibUtf16: 0062 case MibUtf16BE: 0063 case MibUtf16LE: 0064 case MibUcs2: 0065 return true; 0066 default: 0067 return false; 0068 } 0069 } 0070 0071 class KEncodingDetectorPrivate 0072 { 0073 public: 0074 QTextCodec *m_codec; 0075 QTextDecoder *m_decoder; // utf16 0076 QTextCodec *m_defaultCodec; 0077 QByteArray m_storeDecoderName; 0078 0079 KEncodingDetector::EncodingChoiceSource m_source; 0080 KEncodingDetector::AutoDetectScript m_autoDetectLanguage; 0081 0082 bool m_visualRTL : 1; 0083 bool m_seenBody : 1; 0084 bool m_writtingHappened : 1; 0085 bool m_analyzeCalled : 1; //for decode() 0086 int m_multiByte; 0087 0088 QByteArray m_bufferForDefferedEncDetection; 0089 0090 KEncodingDetectorPrivate() 0091 : m_codec(QTextCodec::codecForMib(MibLatin1)) 0092 , m_decoder(m_codec->makeDecoder()) 0093 , m_defaultCodec(m_codec) 0094 , m_source(KEncodingDetector::DefaultEncoding) 0095 , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection) 0096 , m_visualRTL(false) 0097 , m_seenBody(false) 0098 , m_writtingHappened(false) 0099 , m_analyzeCalled(false) 0100 , m_multiByte(0) 0101 { 0102 } 0103 0104 KEncodingDetectorPrivate(QTextCodec *codec, KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script) 0105 : m_codec(codec) 0106 , m_decoder(m_codec->makeDecoder()) 0107 , m_defaultCodec(m_codec) 0108 , m_source(source) 0109 , m_autoDetectLanguage(script) 0110 , m_visualRTL(false) 0111 , m_seenBody(false) 0112 , m_writtingHappened(false) 0113 , m_analyzeCalled(false) 0114 , m_multiByte(0) 0115 { 0116 } 0117 0118 ~KEncodingDetectorPrivate() 0119 { 0120 delete m_decoder; 0121 } 0122 0123 // Returns true if the encoding was explicitly specified someplace. 0124 bool isExplicitlySpecifiedEncoding() 0125 { 0126 return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding; 0127 } 0128 }; 0129 0130 static QByteArray automaticDetectionForArabic(const unsigned char *ptr, int size) 0131 { 0132 for (int i = 0; i < size; ++i) { 0133 if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3 0134 || (ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB) || (ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA) 0135 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0 0136 || (ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF) || (ptr[ i ] >= 0xF3)) { 0137 return "cp1256"; 0138 } 0139 } 0140 0141 return "iso-8859-6"; 0142 } 0143 0144 static QByteArray automaticDetectionForBaltic(const unsigned char *ptr, int size) 0145 { 0146 for (int i = 0; i < size; ++i) { 0147 if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E)) { 0148 return "cp1257"; 0149 } 0150 0151 if (ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5) { 0152 return "iso-8859-13"; 0153 } 0154 } 0155 0156 return "iso-8859-13"; 0157 } 0158 0159 static QByteArray automaticDetectionForCentralEuropean(const unsigned char *ptr, int size) 0160 { 0161 QByteArray charset = QByteArray(); 0162 for (int i = 0; i < size; ++i) { 0163 if (ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) { 0164 if (ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98) { 0165 return "ibm852"; 0166 } 0167 0168 if (i + 1 > size) { 0169 return "cp1250"; 0170 } else { // maybe ibm852 ? 0171 charset = "cp1250"; 0172 continue; 0173 } 0174 } 0175 if (ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0) { 0176 if (i + 1 > size) { 0177 return "iso-8859-2"; 0178 } else { // maybe ibm852 ? 0179 if (charset.isNull()) { 0180 charset = "iso-8859-2"; 0181 } 0182 continue; 0183 } 0184 } 0185 } 0186 0187 if (charset.isNull()) { 0188 charset = "iso-8859-3"; 0189 } 0190 0191 return charset.data(); 0192 } 0193 0194 static QByteArray automaticDetectionForCyrillic(const unsigned char *ptr, int size) 0195 { 0196 #ifdef DECODE_DEBUG 0197 qCWarning(KHTML_LOG) << "KEncodingDetector: Cyr heuristics"; 0198 #endif 0199 0200 // if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf) 0201 // return "utf8"; 0202 int utf8_mark = 0; 0203 int koi_score = 0; 0204 int cp1251_score = 0; 0205 0206 int koi_st = 0; 0207 int cp1251_st = 0; 0208 0209 // int koi_na=0; 0210 // int cp1251_na=0; 0211 0212 int koi_o_capital = 0; 0213 int koi_o = 0; 0214 int cp1251_o_capital = 0; 0215 int cp1251_o = 0; 0216 0217 int koi_a_capital = 0; 0218 int koi_a = 0; 0219 int cp1251_a_capital = 0; 0220 int cp1251_a = 0; 0221 0222 int koi_s_capital = 0; 0223 int koi_s = 0; 0224 int cp1251_s_capital = 0; 0225 int cp1251_s = 0; 0226 0227 int koi_i_capital = 0; 0228 int koi_i = 0; 0229 int cp1251_i_capital = 0; 0230 int cp1251_i = 0; 0231 0232 int cp1251_small_range = 0; 0233 int koi_small_range = 0; 0234 int ibm866_small_range = 0; 0235 0236 int i; 0237 for (i = 1; (i < size) && (cp1251_small_range + koi_small_range < 1000); ++i) { 0238 if (ptr[i] > 0xdf) { 0239 ++cp1251_small_range; 0240 0241 if (ptr[i] == 0xee) { //small o 0242 ++cp1251_o; 0243 } else if (ptr[i] == 0xe0) { //small a 0244 ++cp1251_a; 0245 } else if (ptr[i] == 0xe8) { //small i 0246 ++cp1251_i; 0247 } else if (ptr[i] == 0xf1) { //small s 0248 ++cp1251_s; 0249 } else if (ptr[i] == 0xf2 && ptr[i - 1] == 0xf1) { //small st 0250 ++cp1251_st; 0251 } 0252 0253 else if (ptr[i] == 0xef) { 0254 ++koi_o_capital; 0255 } else if (ptr[i] == 0xe1) { 0256 ++koi_a_capital; 0257 } else if (ptr[i] == 0xe9) { 0258 ++koi_i_capital; 0259 } else if (ptr[i] == 0xf3) { 0260 ++koi_s_capital; 0261 } 0262 0263 } else if (ptr[i] > 0xbf) { 0264 ++koi_small_range; 0265 0266 if (ptr[i] == 0xd0 || ptr[i] == 0xd1) { //small o 0267 ++utf8_mark; 0268 } else if (ptr[i] == 0xcf) { //small o 0269 ++koi_o; 0270 } else if (ptr[i] == 0xc1) { //small a 0271 ++koi_a; 0272 } else if (ptr[i] == 0xc9) { //small i 0273 ++koi_i; 0274 } else if (ptr[i] == 0xd3) { //small s 0275 ++koi_s; 0276 } else if (ptr[i] == 0xd4 && ptr[i - 1] == 0xd3) { //small st 0277 ++koi_st; 0278 } 0279 0280 else if (ptr[i] == 0xce) { 0281 ++cp1251_o_capital; 0282 } else if (ptr[i] == 0xc0) { 0283 ++cp1251_a_capital; 0284 } else if (ptr[i] == 0xc8) { 0285 ++cp1251_i_capital; 0286 } else if (ptr[i] == 0xd1) { 0287 ++cp1251_s_capital; 0288 } 0289 } else if (ptr[i] > 0x9f && ptr[i] < 0xb0) { //first 16 letterz is 60% 0290 ++ibm866_small_range; 0291 } 0292 0293 } 0294 0295 //cannot decide? 0296 if (cp1251_small_range + koi_small_range + ibm866_small_range < 8) { 0297 return ""; 0298 } 0299 0300 if (3 * utf8_mark > cp1251_small_range + koi_small_range + ibm866_small_range) { 0301 #ifdef DECODE_DEBUG 0302 qCWarning(KHTML_LOG) << "Cyr Enc Detection: UTF8"; 0303 #endif 0304 return "UTF-8"; 0305 } 0306 0307 if (ibm866_small_range > cp1251_small_range + koi_small_range) { 0308 return "ibm866"; 0309 } 0310 0311 // QByteArray koi_string = "koi8-u"; 0312 // QByteArray cp1251_string = "cp1251"; 0313 0314 if (cp1251_st == 0 && koi_st > 1) { 0315 koi_score += 10; 0316 } else if (koi_st == 0 && cp1251_st > 1) { 0317 cp1251_score += 10; 0318 } 0319 0320 if (cp1251_st && koi_st) { 0321 if (cp1251_st / koi_st > 2) { 0322 cp1251_score += 20; 0323 } else if (koi_st / cp1251_st > 2) { 0324 koi_score += 20; 0325 } 0326 } 0327 0328 if (cp1251_a > koi_a) { 0329 cp1251_score += 10; 0330 } else if (cp1251_a || koi_a) { 0331 koi_score += 10; 0332 } 0333 0334 if (cp1251_o > koi_o) { 0335 cp1251_score += 10; 0336 } else if (cp1251_o || koi_o) { 0337 koi_score += 10; 0338 } 0339 0340 if (cp1251_i > koi_i) { 0341 cp1251_score += 10; 0342 } else if (cp1251_i || koi_i) { 0343 koi_score += 10; 0344 } 0345 0346 if (cp1251_s > koi_s) { 0347 cp1251_score += 10; 0348 } else if (cp1251_s || koi_s) { 0349 koi_score += 10; 0350 } 0351 0352 if (cp1251_a_capital > koi_a_capital) { 0353 cp1251_score += 9; 0354 } else if (cp1251_a_capital || koi_a_capital) { 0355 koi_score += 9; 0356 } 0357 0358 if (cp1251_o_capital > koi_o_capital) { 0359 cp1251_score += 9; 0360 } else if (cp1251_o_capital || koi_o_capital) { 0361 koi_score += 9; 0362 } 0363 0364 if (cp1251_i_capital > koi_i_capital) { 0365 cp1251_score += 9; 0366 } else if (cp1251_i_capital || koi_i_capital) { 0367 koi_score += 9; 0368 } 0369 0370 if (cp1251_s_capital > koi_s_capital) { 0371 cp1251_score += 9; 0372 } else if (cp1251_s_capital || koi_s_capital) { 0373 koi_score += 9; 0374 } 0375 #ifdef DECODE_DEBUG 0376 qCWarning(KHTML_LOG) << "koi_score " << koi_score << " cp1251_score " << cp1251_score; 0377 #endif 0378 if (abs(koi_score - cp1251_score) < 10) { 0379 //fallback... 0380 cp1251_score = cp1251_small_range; 0381 koi_score = koi_small_range; 0382 } 0383 if (cp1251_score > koi_score) { 0384 return "cp1251"; 0385 } else { 0386 return "koi8-u"; 0387 } 0388 0389 // if (cp1251_score>koi_score) 0390 // setEncoding("cp1251",AutoDetectedEncoding); 0391 // else 0392 // setEncoding("koi8-u",AutoDetectedEncoding); 0393 // return true; 0394 0395 } 0396 0397 static QByteArray automaticDetectionForGreek(const unsigned char *ptr, int size) 0398 { 0399 for (int i = 0; i < size; ++i) { 0400 if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B 0401 || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4 0402 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE) { 0403 return "cp1253"; 0404 } 0405 } 0406 0407 return "iso-8859-7"; 0408 } 0409 0410 static QByteArray automaticDetectionForHebrew(const unsigned char *ptr, int size) 0411 { 0412 for (int i = 0; i < size; ++i) { 0413 if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89) || ptr[ i ] == 0x8B 0414 || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || (ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9) 0415 || (ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8)) { 0416 return "cp1255"; 0417 } 0418 0419 if (ptr[ i ] == 0xDF) { 0420 return "iso-8859-8-i"; 0421 } 0422 } 0423 0424 return "iso-8859-8-i"; 0425 } 0426 0427 static QByteArray automaticDetectionForJapanese(const unsigned char *ptr, int size) 0428 { 0429 JapaneseCode kc; 0430 0431 switch (kc.guess_jp((const char *)ptr, size)) { 0432 case JapaneseCode::JIS: 0433 return "jis7"; 0434 case JapaneseCode::EUC: 0435 return "eucjp"; 0436 case JapaneseCode::SJIS: 0437 return "sjis"; 0438 case JapaneseCode::UTF8: 0439 return "utf8"; 0440 default: 0441 break; 0442 } 0443 0444 return ""; 0445 } 0446 0447 static QByteArray automaticDetectionForTurkish(const unsigned char *ptr, int size) 0448 { 0449 for (int i = 0; i < size; ++i) { 0450 if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C) || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C) || ptr[ i ] == 0x9F) { 0451 return "cp1254"; 0452 } 0453 } 0454 0455 return "iso-8859-9"; 0456 } 0457 0458 static QByteArray automaticDetectionForWesternEuropean(const unsigned char *ptr, int size) 0459 { 0460 --size; 0461 uint nonansi_count = 0; 0462 for (int i = 0; i < size; ++i) { 0463 if (ptr[i] > 0x79) { 0464 ++nonansi_count; 0465 if (ptr[i] > 0xc1 && ptr[i] < 0xf0 && ptr[i + 1] > 0x7f && ptr[i + 1] < 0xc0) { 0466 return "UTF-8"; 0467 } 0468 if (ptr[i] >= 0x78 && ptr[i] <= 0x9F) { 0469 return "cp1252"; 0470 } 0471 } 0472 0473 } 0474 0475 if (nonansi_count > 0) { 0476 return "iso-8859-15"; 0477 } 0478 0479 return ""; 0480 } 0481 0482 // Other browsers allow comments in the head section, so we need to also. 0483 // It's important not to look for tags inside the comments. 0484 static void skipComment(const char *&ptr, const char *pEnd) 0485 { 0486 const char *p = ptr; 0487 // Allow <!-->; other browsers do. 0488 if (*p == '>') { 0489 p++; 0490 } else { 0491 while (p != pEnd) { 0492 if (*p == '-') { 0493 // This is the real end of comment, "-->". 0494 if (p[1] == '-' && p[2] == '>') { 0495 p += 3; 0496 break; 0497 } 0498 // This is the incorrect end of comment that other browsers allow, "--!>". 0499 if (p[1] == '-' && p[2] == '!' && p[3] == '>') { 0500 p += 4; 0501 break; 0502 } 0503 } 0504 p++; 0505 } 0506 } 0507 ptr = p; 0508 } 0509 0510 // Returns the position of the encoding string. 0511 static int findXMLEncoding(const QByteArray &str, int &encodingLength) 0512 { 0513 int len = str.length(); 0514 int pos = str.indexOf("encoding"); 0515 if (pos == -1) { 0516 return -1; 0517 } 0518 pos += 8; 0519 0520 // Skip spaces and stray control characters. 0521 while (pos < len && str[pos] <= ' ') { 0522 ++pos; 0523 } 0524 0525 //Bail out if nothing after 0526 // Skip equals sign. 0527 if (pos >= len || str[pos] != '=') { 0528 return -1; 0529 } 0530 ++pos; 0531 0532 // Skip spaces and stray control characters. 0533 while (pos < len && str[pos] <= ' ') { 0534 ++pos; 0535 } 0536 0537 //Bail out if nothing after 0538 if (pos >= len) { 0539 return -1; 0540 } 0541 0542 // Skip quotation mark. 0543 char quoteMark = str[pos]; 0544 if (quoteMark != '"' && quoteMark != '\'') { 0545 return -1; 0546 } 0547 ++pos; 0548 0549 // Find the trailing quotation mark. 0550 int end = pos; 0551 while (end < len && str[end] != quoteMark) { 0552 ++end; 0553 } 0554 0555 if (end >= len) { 0556 return -1; 0557 } 0558 0559 encodingLength = end - pos; 0560 return pos; 0561 } 0562 0563 bool KEncodingDetector::processNull(char *data, int len) 0564 { 0565 bool bin = false; 0566 if (is16Bit(d->m_codec)) { 0567 for (int i = 1; i < len; i += 2) { 0568 if ((data[i] == '\0') && (data[i - 1] == '\0')) { 0569 bin = true; 0570 data[i] = ' '; 0571 } 0572 } 0573 return bin; 0574 } 0575 // replace '\0' by spaces, for buggy pages 0576 int i = len - 1; 0577 while (--i >= 0) { 0578 if (data[i] == 0) { 0579 bin = true; 0580 data[i] = ' '; 0581 } 0582 } 0583 return bin; 0584 } 0585 0586 bool KEncodingDetector::errorsIfUtf8(const char *data, int length) 0587 { 0588 if (d->m_codec->mibEnum() != MibUtf8) { 0589 return false; //means no errors 0590 } 0591 // #define highest1Bits (unsigned char)0x80 0592 // #define highest2Bits (unsigned char)0xC0 0593 // #define highest3Bits (unsigned char)0xE0 0594 // #define highest4Bits (unsigned char)0xF0 0595 // #define highest5Bits (unsigned char)0xF8 0596 static const unsigned char highest1Bits = 0x80; 0597 static const unsigned char highest2Bits = 0xC0; 0598 static const unsigned char highest3Bits = 0xE0; 0599 static const unsigned char highest4Bits = 0xF0; 0600 static const unsigned char highest5Bits = 0xF8; 0601 0602 for (int i = 0; i < length; ++i) { 0603 unsigned char c = data[i]; 0604 0605 if (d->m_multiByte > 0) { 0606 if ((c & highest2Bits) == 0x80) { 0607 --(d->m_multiByte); 0608 continue; 0609 } 0610 #ifdef DECODE_DEBUG 0611 qCWarning(KHTML_LOG) << "EncDetector: Broken UTF8"; 0612 #endif 0613 return true; 0614 } 0615 0616 // most significant bit zero, single char 0617 if ((c & highest1Bits) == 0x00) { 0618 continue; 0619 } 0620 0621 // 110xxxxx => init 1 following bytes 0622 if ((c & highest3Bits) == 0xC0) { 0623 d->m_multiByte = 1; 0624 continue; 0625 } 0626 0627 // 1110xxxx => init 2 following bytes 0628 if ((c & highest4Bits) == 0xE0) { 0629 d->m_multiByte = 2; 0630 continue; 0631 } 0632 0633 // 11110xxx => init 3 following bytes 0634 if ((c & highest5Bits) == 0xF0) { 0635 d->m_multiByte = 3; 0636 continue; 0637 } 0638 #ifdef DECODE_DEBUG 0639 qCWarning(KHTML_LOG) << "EncDetector:_Broken UTF8"; 0640 #endif 0641 return true; 0642 } 0643 return false; 0644 } 0645 0646 KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate) 0647 { 0648 } 0649 0650 KEncodingDetector::KEncodingDetector(QTextCodec *codec, EncodingChoiceSource source, AutoDetectScript script) : 0651 d(new KEncodingDetectorPrivate(codec, source, script)) 0652 { 0653 } 0654 0655 KEncodingDetector::~KEncodingDetector() 0656 { 0657 delete d; 0658 } 0659 0660 void KEncodingDetector::setAutoDetectLanguage(KEncodingDetector::AutoDetectScript lang) 0661 { 0662 d->m_autoDetectLanguage = lang; 0663 } 0664 KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const 0665 { 0666 return d->m_autoDetectLanguage; 0667 } 0668 0669 KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const 0670 { 0671 return d->m_source; 0672 } 0673 0674 const char *KEncodingDetector::encoding() const 0675 { 0676 d->m_storeDecoderName = d->m_codec->name(); 0677 return d->m_storeDecoderName.constData(); 0678 } 0679 0680 bool KEncodingDetector::visuallyOrdered() const 0681 { 0682 return d->m_visualRTL; 0683 } 0684 0685 // const QTextCodec* KEncodingDetector::codec() const 0686 // { 0687 // return d->m_codec; 0688 // } 0689 0690 QTextDecoder *KEncodingDetector::decoder() 0691 { 0692 return d->m_decoder; 0693 } 0694 0695 void KEncodingDetector::resetDecoder() 0696 { 0697 assert(d->m_defaultCodec); 0698 d->m_bufferForDefferedEncDetection.clear(); 0699 d->m_writtingHappened = false; 0700 d->m_analyzeCalled = false; 0701 d->m_multiByte = 0; 0702 delete d->m_decoder; 0703 if (!d->m_codec) { 0704 d->m_codec = d->m_defaultCodec; 0705 } 0706 d->m_decoder = d->m_codec->makeDecoder(); 0707 } 0708 0709 bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type) 0710 { 0711 QTextCodec *codec; 0712 QByteArray enc(_encoding); 0713 if (/*enc.isNull() || */enc.isEmpty()) { 0714 if (type == DefaultEncoding) { 0715 codec = d->m_defaultCodec; 0716 } else { 0717 return false; 0718 } 0719 } else { 0720 //QString->QTextCodec 0721 0722 enc = enc.toLower(); 0723 // hebrew visually ordered 0724 if (enc == "visual") { 0725 enc = "iso8859-8"; 0726 } 0727 bool b; 0728 codec = KCharsets::charsets()->codecForName(QLatin1String(enc.data()), b); 0729 if (!b) { 0730 return false; 0731 } 0732 } 0733 0734 if (d->m_codec->mibEnum() == codec->mibEnum()) { 0735 // We already have the codec, but we still want to re-set the type, 0736 // as we may have overwritten a default with a detected 0737 d->m_source = type; 0738 return true; 0739 } 0740 0741 if ((type == EncodingFromMetaTag || type == EncodingFromXMLHeader) && is16Bit(codec)) { 0742 //Sometimes the codec specified is absurd, i.e. UTF-16 despite 0743 //us decoding a meta tag as ASCII. In that case, ignore it. 0744 return false; 0745 } 0746 0747 if (codec->mibEnum() == Mib8859_8) { 0748 //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself. 0749 codec = QTextCodec::codecForName("iso8859-8-i"); 0750 0751 // visually ordered unless one of the following 0752 if (!(enc == "iso-8859-8-i" || enc == "iso_8859-8-i" || enc == "csiso88598i" || enc == "logical")) { 0753 d->m_visualRTL = true; 0754 } 0755 } 0756 0757 d->m_codec = codec; 0758 d->m_source = type; 0759 delete d->m_decoder; 0760 d->m_decoder = d->m_codec->makeDecoder(); 0761 #ifdef DECODE_DEBUG 0762 qCDebug(KHTML_LOG) << "KEncodingDetector::encoding used is" << d->m_codec->name(); 0763 #endif 0764 return true; 0765 } 0766 0767 QString KEncodingDetector::decode(const char *data, int len) 0768 { 0769 processNull(const_cast<char *>(data), len); 0770 if (!d->m_analyzeCalled) { 0771 analyze(data, len); 0772 d->m_analyzeCalled = true; 0773 } 0774 0775 return d->m_decoder->toUnicode(data, len); 0776 } 0777 0778 QString KEncodingDetector::decode(const QByteArray &data) 0779 { 0780 processNull(const_cast<char *>(data.data()), data.size()); 0781 if (!d->m_analyzeCalled) { 0782 analyze(data.data(), data.size()); 0783 d->m_analyzeCalled = true; 0784 } 0785 0786 return d->m_decoder->toUnicode(data); 0787 } 0788 0789 QString KEncodingDetector::decodeWithBuffering(const char *data, int len) 0790 { 0791 #ifdef DECODE_DEBUG 0792 qCWarning(KHTML_LOG) << "KEncodingDetector: decoding " << len << " bytes"; 0793 #endif 0794 if (d->m_writtingHappened) { 0795 #ifdef DECODE_DEBUG 0796 qCWarning(KHTML_LOG) << "KEncodingDetector: d->m_writtingHappened " << d->m_codec->name(); 0797 #endif 0798 processNull(const_cast<char *>(data), len); 0799 return d->m_decoder->toUnicode(data, len); 0800 } else { 0801 if (d->m_bufferForDefferedEncDetection.isEmpty()) { 0802 // If encoding detection produced something, and we either got to the body or 0803 // actually saw the encoding explicitly, we're done. 0804 if (analyze(data, len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) { 0805 #ifdef DECODE_DEBUG 0806 qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened first time " << d->m_codec->name(); 0807 #endif 0808 processNull(const_cast<char *>(data), len); 0809 d->m_writtingHappened = true; 0810 return d->m_decoder->toUnicode(data, len); 0811 } else { 0812 #ifdef DECODE_DEBUG 0813 qCWarning(KHTML_LOG) << "KEncodingDetector: begin deffer"; 0814 #endif 0815 d->m_bufferForDefferedEncDetection = data; 0816 } 0817 } else { 0818 d->m_bufferForDefferedEncDetection += data; 0819 // As above, but also limit the buffer size. We must use the entire buffer here, 0820 // since the boundaries might split the meta tag, etc. 0821 bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length()); 0822 if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) || 0823 d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER) { 0824 d->m_writtingHappened = true; 0825 d->m_bufferForDefferedEncDetection.replace('\0', ' '); 0826 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection)); 0827 d->m_bufferForDefferedEncDetection.clear(); 0828 #ifdef DECODE_DEBUG 0829 qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name(); 0830 #endif 0831 return result; 0832 } 0833 } 0834 } 0835 0836 return QString(); 0837 } 0838 0839 bool KEncodingDetector::decodedInvalidCharacters() const 0840 { 0841 return d->m_decoder ? d->m_decoder->hasFailure() : false; 0842 } 0843 0844 QString KEncodingDetector::flush() 0845 { 0846 if (d->m_bufferForDefferedEncDetection.isEmpty()) { 0847 return QString(); 0848 } 0849 0850 d->m_bufferForDefferedEncDetection.replace('\0', ' '); 0851 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection)); 0852 d->m_bufferForDefferedEncDetection.clear(); 0853 #ifdef DECODE_DEBUG 0854 qCWarning(KHTML_LOG) << "KEncodingDetector:flush() " << d->m_bufferForDefferedEncDetection.length() << " bytes " << d->m_codec->name(); 0855 #endif 0856 return result; 0857 } 0858 0859 bool KEncodingDetector::analyze(const char *data, int len) 0860 { 0861 // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. 0862 // maximumBOMLength = 10 0863 // Even if the user has chosen utf16 we still need to auto-detect the endianness 0864 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) { 0865 // Extract the first three bytes. 0866 const uchar *udata = (const uchar *)data; 0867 uchar c1 = *udata++; 0868 uchar c2 = *udata++; 0869 uchar c3 = *udata++; 0870 0871 // Check for the BOM 0872 const char *autoDetectedEncoding; 0873 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) { 0874 autoDetectedEncoding = "UTF-16"; 0875 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { 0876 autoDetectedEncoding = "UTF-8"; 0877 } else if (c1 == 0x00 || c2 == 0x00) { 0878 uchar c4 = *udata++; 0879 uchar c5 = *udata++; 0880 uchar c6 = *udata++; 0881 uchar c7 = *udata++; 0882 uchar c8 = *udata++; 0883 uchar c9 = *udata++; 0884 uchar c10 = *udata++; 0885 0886 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); 0887 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); 0888 if ((nul_count_even == 0 && nul_count_odd == 5) || (nul_count_even == 5 && nul_count_odd == 0)) { 0889 autoDetectedEncoding = "UTF-16"; 0890 } else { 0891 autoDetectedEncoding = nullptr; 0892 } 0893 } else { 0894 autoDetectedEncoding = nullptr; 0895 } 0896 0897 // If we found a BOM, use the encoding it implies. 0898 if (autoDetectedEncoding != nullptr) { 0899 d->m_source = BOM; 0900 d->m_codec = QTextCodec::codecForName(autoDetectedEncoding); 0901 assert(d->m_codec); 0902 //enc = d->m_codec->name(); 0903 delete d->m_decoder; 0904 d->m_decoder = d->m_codec->makeDecoder(); 0905 #ifdef DECODE_DEBUG 0906 qCWarning(KHTML_LOG) << "Detection by BOM"; 0907 #endif 0908 if (is16Bit(d->m_codec) && c2 == 0x00) { 0909 // utf16LE, we need to put the decoder in LE mode 0910 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00}; 0911 d->m_decoder->toUnicode(reverseUtf16, 2); 0912 } 0913 return true; 0914 } 0915 } 0916 0917 //exit from routine in case it was called to only detect byte order for utf-16 0918 if (d->m_source == UserChosenEncoding) { 0919 #ifdef DECODE_DEBUG 0920 qCWarning(KHTML_LOG) << "KEncodingDetector: UserChosenEncoding exit "; 0921 #endif 0922 0923 if (errorsIfUtf8(data, len)) { 0924 setEncoding("", DefaultEncoding); 0925 } 0926 return true; 0927 } 0928 0929 // HTTP header takes precedence over meta-type stuff 0930 if (d->m_source == EncodingFromHTTPHeader) { 0931 return true; 0932 } 0933 0934 if (!d->m_seenBody) { 0935 // we still don't have an encoding, and are in the head 0936 // the following tags are allowed in <head>: 0937 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE 0938 const char *ptr = data; 0939 const char *pEnd = data + len; 0940 0941 while (ptr != pEnd) { 0942 if (*ptr != '<') { 0943 ++ptr; 0944 continue; 0945 } 0946 ++ptr; 0947 // Handle comments. 0948 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') { 0949 ptr += 3; 0950 skipComment(ptr, pEnd); 0951 continue; 0952 } 0953 0954 // Handle XML header, which can have encoding in it. 0955 if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') { 0956 const char *end = ptr; 0957 while (*end != '>' && end < pEnd) { 0958 end++; 0959 } 0960 if (*end == '\0' || end == pEnd) { 0961 break; 0962 } 0963 QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator 0964 int length; 0965 int pos = findXMLEncoding(str, length); 0966 // also handles the case when specified encoding aint correct 0967 if (pos != -1 && setEncoding(str.mid(pos, length).data(), EncodingFromXMLHeader)) { 0968 return true; 0969 } 0970 } 0971 0972 //look for <meta>, stop if we reach <body> 0973 while ( 0974 !(((*ptr >= 'a') && (*ptr <= 'z')) || 0975 ((*ptr >= 'A') && (*ptr <= 'Z'))) 0976 && ptr < pEnd 0977 ) { 0978 ++ptr; 0979 } 0980 0981 char tmp[5]; 0982 int length = 0; 0983 const char *max = ptr + 4; 0984 if (pEnd < max) { 0985 max = pEnd; 0986 } 0987 while ( 0988 (((*ptr >= 'a') && (*ptr <= 'z')) || 0989 ((*ptr >= 'A') && (*ptr <= 'Z')) || 0990 ((*ptr >= '0') && (*ptr <= '9'))) 0991 && ptr < max 0992 ) { 0993 tmp[length] = tolower(*ptr); 0994 ++ptr; 0995 ++length; 0996 } 0997 tmp[length] = 0; 0998 if (tmp[0] == 'm' && tmp[1] == 'e' && tmp[2] == 't' && tmp[3] == 'a') { 0999 // found a meta tag... 1000 const char *end = ptr; 1001 while (*end != '>' && *end != '\0' && end < pEnd) { 1002 end++; 1003 } 1004 //if ( *end == '\0' ) break; 1005 const QByteArray str = QByteArray(ptr, (end - ptr) + 1).toLower(); 1006 const int strLength = str.length(); 1007 int pos = 0; 1008 //if( (pos = str.find("http-equiv", pos)) == -1) break; 1009 //if( (pos = str.find("content-type", pos)) == -1) break; 1010 if ((pos = str.indexOf("charset")) == -1) { 1011 continue; 1012 } 1013 pos += 6; 1014 // skip to '=' 1015 if ((pos = str.indexOf("=", pos)) == -1) { 1016 continue; 1017 } 1018 1019 // skip '=' 1020 ++pos; 1021 1022 // skip whitespace before encoding itself 1023 while (pos < strLength && str[pos] <= ' ') { 1024 ++pos; 1025 } 1026 1027 // there may also be an opening quote, if this is a charset= and not a http-equiv. 1028 if (pos < strLength && (str[pos] == '"' || str[pos] == '\'')) { 1029 ++pos; 1030 } 1031 1032 // skip whitespace 1033 while (pos < strLength && str[pos] <= ' ') { 1034 ++pos; 1035 } 1036 1037 if (pos == strLength) { 1038 continue; 1039 } 1040 1041 int endpos = pos; 1042 while (endpos < strLength && 1043 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' 1044 && str[endpos] != ';' && str[endpos] != '>')) { 1045 ++endpos; 1046 } 1047 #ifdef DECODE_DEBUG 1048 qCDebug(KHTML_LOG) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos, endpos - pos).data(); 1049 #endif 1050 if (setEncoding(str.mid(pos, endpos - pos).data(), EncodingFromMetaTag)) { 1051 return true; 1052 } 1053 } else if (tmp[0] == 'b' && tmp[1] == 'o' && tmp[2] == 'd' && tmp[3] == 'y') { 1054 d->m_seenBody = true; 1055 break; 1056 } 1057 } 1058 } 1059 1060 if (len < 20) { 1061 return false; 1062 } 1063 1064 #ifdef DECODE_DEBUG 1065 qCDebug(KHTML_LOG) << "KEncodingDetector: using heuristics (" << strlen(data) << ")"; 1066 #endif 1067 1068 switch (d->m_autoDetectLanguage) { 1069 case KEncodingDetector::Arabic: 1070 return setEncoding(automaticDetectionForArabic((const unsigned char *) data, len).data(), AutoDetectedEncoding); 1071 // break; 1072 case KEncodingDetector::Baltic: 1073 return setEncoding(automaticDetectionForBaltic((const unsigned char *) data, len).data(), AutoDetectedEncoding); 1074 // break; 1075 case KEncodingDetector::CentralEuropean: 1076 return setEncoding(automaticDetectionForCentralEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding); 1077 // break; 1078 case KEncodingDetector::Cyrillic: 1079 return setEncoding(automaticDetectionForCyrillic((const unsigned char *) data, len).data(), AutoDetectedEncoding); 1080 // break; 1081 case KEncodingDetector::Greek: 1082 return setEncoding(automaticDetectionForGreek((const unsigned char *) data, len).data(), AutoDetectedEncoding); 1083 // break; 1084 case KEncodingDetector::Hebrew: 1085 return setEncoding(automaticDetectionForHebrew((const unsigned char *) data, len).data(), AutoDetectedEncoding); 1086 // break; 1087 case KEncodingDetector::Japanese: 1088 return setEncoding(automaticDetectionForJapanese((const unsigned char *) data, len).data(), AutoDetectedEncoding); 1089 // break; 1090 case KEncodingDetector::Turkish: 1091 return setEncoding(automaticDetectionForTurkish((const unsigned char *) data, len).data(), AutoDetectedEncoding); 1092 // break; 1093 case KEncodingDetector::WesternEuropean: 1094 if (setEncoding(automaticDetectionForWesternEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding)) { 1095 return true; 1096 } else if (d->m_defaultCodec->mibEnum() == MibLatin1) { //detection for khtml 1097 return setEncoding("iso-8859-15", AutoDetectedEncoding); 1098 } else { //use default provided by eg katepart 1099 return setEncoding("", DefaultEncoding); 1100 } 1101 // break; 1102 case KEncodingDetector::SemiautomaticDetection: 1103 case KEncodingDetector::ChineseSimplified: 1104 case KEncodingDetector::ChineseTraditional: 1105 case KEncodingDetector::Korean: 1106 case KEncodingDetector::Thai: 1107 case KEncodingDetector::Unicode: 1108 case KEncodingDetector::NorthernSaami: 1109 case KEncodingDetector::SouthEasternEurope: 1110 case KEncodingDetector::None: 1111 // huh. somethings broken in this code ### FIXME 1112 //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. 1113 break; 1114 } 1115 1116 return true; 1117 } 1118 1119 KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString &lang) 1120 { 1121 if (lang.isEmpty()) { 1122 return KEncodingDetector::None; 1123 } else if (lang == i18nc("@item Text character set", "Unicode")) { 1124 return KEncodingDetector::Unicode; 1125 } else if (lang == i18nc("@item Text character set", "Cyrillic")) { 1126 return KEncodingDetector::Cyrillic; 1127 } else if (lang == i18nc("@item Text character set", "Western European")) { 1128 return KEncodingDetector::WesternEuropean; 1129 } else if (lang == i18nc("@item Text character set", "Central European")) { 1130 return KEncodingDetector::CentralEuropean; 1131 } else if (lang == i18nc("@item Text character set", "Greek")) { 1132 return KEncodingDetector::Greek; 1133 } else if (lang == i18nc("@item Text character set", "Hebrew")) { 1134 return KEncodingDetector::Hebrew; 1135 } else if (lang == i18nc("@item Text character set", "Turkish")) { 1136 return KEncodingDetector::Turkish; 1137 } else if (lang == i18nc("@item Text character set", "Japanese")) { 1138 return KEncodingDetector::Japanese; 1139 } else if (lang == i18nc("@item Text character set", "Baltic")) { 1140 return KEncodingDetector::Baltic; 1141 } else if (lang == i18nc("@item Text character set", "Arabic")) { 1142 return KEncodingDetector::Arabic; 1143 } 1144 1145 return KEncodingDetector::None; 1146 } 1147 1148 bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script) 1149 { 1150 switch (script) { 1151 case KEncodingDetector::Arabic: 1152 return true; 1153 case KEncodingDetector::Baltic: 1154 return true; 1155 case KEncodingDetector::CentralEuropean: 1156 return true; 1157 case KEncodingDetector::Cyrillic: 1158 return true; 1159 case KEncodingDetector::Greek: 1160 return true; 1161 case KEncodingDetector::Hebrew: 1162 return true; 1163 case KEncodingDetector::Japanese: 1164 return true; 1165 case KEncodingDetector::Turkish: 1166 return true; 1167 case KEncodingDetector::WesternEuropean: 1168 return true; 1169 case KEncodingDetector::ChineseTraditional: 1170 return true; 1171 case KEncodingDetector::ChineseSimplified: 1172 return true; 1173 case KEncodingDetector::Unicode: 1174 return true; 1175 break; 1176 default: 1177 return false; 1178 } 1179 } 1180 1181 QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script) 1182 { 1183 switch (script) { 1184 case KEncodingDetector::Arabic: 1185 return i18nc("@item Text character set", "Arabic"); 1186 break; 1187 case KEncodingDetector::Baltic: 1188 return i18nc("@item Text character set", "Baltic"); 1189 break; 1190 case KEncodingDetector::CentralEuropean: 1191 return i18nc("@item Text character set", "Central European"); 1192 break; 1193 case KEncodingDetector::Cyrillic: 1194 return i18nc("@item Text character set", "Cyrillic"); 1195 break; 1196 case KEncodingDetector::Greek: 1197 return i18nc("@item Text character set", "Greek"); 1198 break; 1199 case KEncodingDetector::Hebrew: 1200 return i18nc("@item Text character set", "Hebrew"); 1201 break; 1202 case KEncodingDetector::Japanese: 1203 return i18nc("@item Text character set", "Japanese"); 1204 break; 1205 case KEncodingDetector::Turkish: 1206 return i18nc("@item Text character set", "Turkish"); 1207 break; 1208 case KEncodingDetector::WesternEuropean: 1209 return i18nc("@item Text character set", "Western European"); 1210 break; 1211 case KEncodingDetector::ChineseTraditional: 1212 return i18nc("@item Text character set", "Chinese Traditional"); 1213 break; 1214 case KEncodingDetector::ChineseSimplified: 1215 return i18nc("@item Text character set", "Chinese Simplified"); 1216 break; 1217 case KEncodingDetector::Korean: 1218 return i18nc("@item Text character set", "Korean"); 1219 break; 1220 case KEncodingDetector::Thai: 1221 return i18nc("@item Text character set", "Thai"); 1222 break; 1223 case KEncodingDetector::Unicode: 1224 return i18nc("@item Text character set", "Unicode"); 1225 break; 1226 //case KEncodingDetector::SemiautomaticDetection: 1227 default: 1228 return QString(); 1229 1230 } 1231 } 1232 1233 #undef DECODE_DEBUG 1234