File indexing completed on 2024-04-28 03:53:04
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "nsCodingStateMachine.h" 0008 0009 /* 0010 Modification from frank tang's original work: 0011 . 0x00 is allowed as a legal character. Since some web pages contains this char in 0012 text stream. 0013 */ 0014 0015 // BIG5 0016 0017 namespace kencodingprober 0018 { 0019 static const unsigned int BIG5_cls[256 / 8] = { 0020 // PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 0021 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 //allow 0x00 as legal value 0022 PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f 0023 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 0024 PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f 0025 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 0026 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f 0027 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 0028 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f 0029 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47 0030 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f 0031 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57 0032 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f 0033 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67 0034 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f 0035 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77 0036 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 1), // 78 - 7f 0037 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 80 - 87 0038 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 88 - 8f 0039 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 90 - 97 0040 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 98 - 9f 0041 PCK4BITS(4, 3, 3, 3, 3, 3, 3, 3), // a0 - a7 0042 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // a8 - af 0043 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // b0 - b7 0044 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // b8 - bf 0045 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // c0 - c7 0046 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // c8 - cf 0047 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // d0 - d7 0048 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // d8 - df 0049 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // e0 - e7 0050 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // e8 - ef 0051 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // f0 - f7 0052 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 0) // f8 - ff 0053 }; 0054 0055 static const unsigned int BIG5_st[3] = { 0056 PCK4BITS(eError, eStart, eStart, 3, eError, eError, eError, eError), // 00-07 0057 PCK4BITS(eError, eError, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eError), // 08-0f 0058 PCK4BITS(eError, eStart, eStart, eStart, eStart, eStart, eStart, eStart) // 10-17 0059 }; 0060 0061 static const unsigned int Big5CharLenTable[] = {0, 1, 1, 2, 0}; 0062 0063 const SMModel Big5SMModel = { 0064 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls}, 0065 5, 0066 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st}, 0067 Big5CharLenTable, 0068 "Big5", 0069 }; 0070 0071 static const unsigned int EUCJP_cls[256 / 8] = { 0072 // PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07 0073 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 00 - 07 0074 PCK4BITS(4, 4, 4, 4, 4, 4, 5, 5), // 08 - 0f 0075 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 10 - 17 0076 PCK4BITS(4, 4, 4, 5, 4, 4, 4, 4), // 18 - 1f 0077 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 20 - 27 0078 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 28 - 2f 0079 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 30 - 37 0080 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 38 - 3f 0081 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 40 - 47 0082 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 48 - 4f 0083 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 50 - 57 0084 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 58 - 5f 0085 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 60 - 67 0086 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 68 - 6f 0087 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 70 - 77 0088 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 78 - 7f 0089 PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // 80 - 87 0090 PCK4BITS(5, 5, 5, 5, 5, 5, 1, 3), // 88 - 8f 0091 PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // 90 - 97 0092 PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // 98 - 9f 0093 PCK4BITS(5, 2, 2, 2, 2, 2, 2, 2), // a0 - a7 0094 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af 0095 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7 0096 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf 0097 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7 0098 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf 0099 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7 0100 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df 0101 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7 0102 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef 0103 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7 0104 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 5) // f8 - ff 0105 }; 0106 0107 static const unsigned int EUCJP_st[5] = { 0108 PCK4BITS(3, 4, 3, 5, eStart, eError, eError, eError), // 00-07 0109 PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f 0110 PCK4BITS(eItsMe, eItsMe, eStart, eError, eStart, eError, eError, eError), // 10-17 0111 PCK4BITS(eError, eError, eStart, eError, eError, eError, 3, eError), // 18-1f 0112 PCK4BITS(3, eError, eError, eError, eStart, eStart, eStart, eStart) // 20-27 0113 }; 0114 0115 static const unsigned int EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0}; 0116 0117 const SMModel EUCJPSMModel = { 0118 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls}, 0119 6, 0120 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st}, 0121 EUCJPCharLenTable, 0122 "EUC-JP", 0123 }; 0124 0125 static const unsigned int EUCKR_cls[256 / 8] = { 0126 // PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 0127 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 0128 PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f 0129 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 0130 PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f 0131 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 0132 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f 0133 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 0134 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f 0135 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 40 - 47 0136 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 48 - 4f 0137 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 50 - 57 0138 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 58 - 5f 0139 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 60 - 67 0140 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f 0141 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77 0142 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f 0143 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87 0144 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f 0145 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97 0146 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f 0147 PCK4BITS(0, 2, 2, 2, 2, 2, 2, 2), // a0 - a7 0148 PCK4BITS(2, 2, 2, 2, 2, 3, 3, 3), // a8 - af 0149 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7 0150 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf 0151 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7 0152 PCK4BITS(2, 3, 2, 2, 2, 2, 2, 2), // c8 - cf 0153 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7 0154 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df 0155 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7 0156 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef 0157 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7 0158 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 0) // f8 - ff 0159 }; 0160 0161 static const unsigned int EUCKR_st[2] = { 0162 PCK4BITS(eError, eStart, 3, eError, eError, eError, eError, eError), // 00-07 0163 PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eError, eError, eStart, eStart) // 08-0f 0164 }; 0165 0166 static const unsigned int EUCKRCharLenTable[] = {0, 1, 2, 0}; 0167 0168 const SMModel EUCKRSMModel = { 0169 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls}, 0170 4, 0171 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st}, 0172 EUCKRCharLenTable, 0173 "EUC-KR", 0174 }; 0175 0176 /* obsolete GB2312 by gb18030 0177 static unsigned int GB2312_cls [ 256 / 8 ] = { 0178 //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 0179 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 0180 PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f 0181 PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 0182 PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f 0183 PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 0184 PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f 0185 PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 0186 PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f 0187 PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 0188 PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f 0189 PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 0190 PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f 0191 PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 0192 PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f 0193 PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 0194 PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f 0195 PCK4BITS(1,0,0,0,0,0,0,0), // 80 - 87 0196 PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f 0197 PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 0198 PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f 0199 PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7 0200 PCK4BITS(2,2,3,3,3,3,3,3), // a8 - af 0201 PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 0202 PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf 0203 PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 0204 PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf 0205 PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 0206 PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df 0207 PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 0208 PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef 0209 PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 0210 PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff 0211 }; 0212 0213 static unsigned int GB2312_st [ 2] = { 0214 PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 0215 PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f 0216 }; 0217 0218 static const unsigned int GB2312CharLenTable[] = {0, 1, 2, 0}; 0219 0220 SMModel GB2312SMModel = { 0221 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_cls }, 0222 4, 0223 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_st }, 0224 GB2312CharLenTable, 0225 "GB2312", 0226 }; 0227 */ 0228 0229 // the following state machine data was created by perl script in 0230 // intl/chardet/tools. It should be the same as in PSM detector. 0231 static const unsigned int GB18030_cls[256 / 8] = { 0232 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 0233 PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f 0234 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 0235 PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f 0236 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 0237 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f 0238 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 30 - 37 0239 PCK4BITS(3, 3, 1, 1, 1, 1, 1, 1), // 38 - 3f 0240 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47 0241 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f 0242 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57 0243 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f 0244 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67 0245 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f 0246 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77 0247 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 4), // 78 - 7f 0248 PCK4BITS(5, 6, 6, 6, 6, 6, 6, 6), // 80 - 87 0249 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // 88 - 8f 0250 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // 90 - 97 0251 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // 98 - 9f 0252 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // a0 - a7 0253 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // a8 - af 0254 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // b0 - b7 0255 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // b8 - bf 0256 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c0 - c7 0257 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c8 - cf 0258 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d0 - d7 0259 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d8 - df 0260 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // e0 - e7 0261 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // e8 - ef 0262 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // f0 - f7 0263 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 0) // f8 - ff 0264 }; 0265 0266 static const unsigned int GB18030_st[6] = { 0267 PCK4BITS(eError, eStart, eStart, eStart, eStart, eStart, 3, eError), // 00-07 0268 PCK4BITS(eError, eError, eError, eError, eError, eError, eItsMe, eItsMe), // 08-0f 0269 PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eError, eError, eStart), // 10-17 0270 PCK4BITS(4, eError, eStart, eStart, eError, eError, eError, eError), // 18-1f 0271 PCK4BITS(eError, eError, 5, eError, eError, eError, eItsMe, eError), // 20-27 0272 PCK4BITS(eError, eError, eStart, eStart, eStart, eStart, eStart, eStart) // 28-2f 0273 }; 0274 0275 // To be accurate, the length of class 6 can be either 2 or 4. 0276 // But it is not necessary to discriminate between the two since 0277 // it is used for frequency analysis only, and we are validating 0278 // each code range there as well. So it is safe to set it to be 0279 // 2 here. 0280 static const unsigned int GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2}; 0281 0282 const SMModel GB18030SMModel = { 0283 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls}, 0284 7, 0285 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st}, 0286 GB18030CharLenTable, 0287 "GB18030", 0288 }; 0289 0290 // sjis 0291 0292 static const unsigned int SJIS_cls[256 / 8] = { 0293 // PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 0294 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 0295 PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f 0296 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 0297 PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f 0298 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 0299 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f 0300 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 0301 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f 0302 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47 0303 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f 0304 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57 0305 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f 0306 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67 0307 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f 0308 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77 0309 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 1), // 78 - 7f 0310 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 80 - 87 0311 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 88 - 8f 0312 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 90 - 97 0313 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 98 - 9f 0314 // 0xa0 is illegal in sjis encoding, but some pages does 0315 // contain such byte. We need to be more error forgiven. 0316 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7 0317 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af 0318 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7 0319 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf 0320 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7 0321 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf 0322 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7 0323 PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df 0324 PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // e0 - e7 0325 PCK4BITS(3, 3, 3, 3, 3, 4, 4, 4), // e8 - ef 0326 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // f0 - f7 0327 PCK4BITS(4, 4, 4, 4, 4, 0, 0, 0) // f8 - ff 0328 }; 0329 0330 static const unsigned int SJIS_st[3] = { 0331 PCK4BITS(eError, eStart, eStart, 3, eError, eError, eError, eError), // 00-07 0332 PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f 0333 PCK4BITS(eItsMe, eItsMe, eError, eError, eStart, eStart, eStart, eStart) // 10-17 0334 }; 0335 0336 static const unsigned int SJISCharLenTable[] = {0, 1, 1, 2, 0, 0}; 0337 0338 const SMModel SJISSMModel = { 0339 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls}, 0340 6, 0341 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st}, 0342 SJISCharLenTable, 0343 "Shift_JIS", 0344 }; 0345 0346 static const unsigned int UCS2BE_cls[256 / 8] = { 0347 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 00 - 07 0348 PCK4BITS(0, 0, 1, 0, 0, 2, 0, 0), // 08 - 0f 0349 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17 0350 PCK4BITS(0, 0, 0, 3, 0, 0, 0, 0), // 18 - 1f 0351 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27 0352 PCK4BITS(0, 3, 3, 3, 3, 3, 0, 0), // 28 - 2f 0353 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37 0354 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f 0355 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 40 - 47 0356 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f 0357 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57 0358 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f 0359 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67 0360 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f 0361 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77 0362 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f 0363 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87 0364 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f 0365 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97 0366 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f 0367 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a0 - a7 0368 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a8 - af 0369 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b0 - b7 0370 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b8 - bf 0371 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c0 - c7 0372 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c8 - cf 0373 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d0 - d7 0374 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d8 - df 0375 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7 0376 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef 0377 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7 0378 PCK4BITS(0, 0, 0, 0, 0, 0, 4, 5) // f8 - ff 0379 }; 0380 0381 static const unsigned int UCS2BE_st[7] = { 0382 PCK4BITS(5, 7, 7, eError, 4, 3, eError, eError), // 00-07 0383 PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f 0384 PCK4BITS(eItsMe, eItsMe, 6, 6, 6, 6, eError, eError), // 10-17 0385 PCK4BITS(6, 6, 6, 6, 6, eItsMe, 6, 6), // 18-1f 0386 PCK4BITS(6, 6, 6, 6, 5, 7, 7, eError), // 20-27 0387 PCK4BITS(5, 8, 6, 6, eError, 6, 6, 6), // 28-2f 0388 PCK4BITS(6, 6, 6, 6, eError, eError, eStart, eStart) // 30-37 0389 }; 0390 0391 static const unsigned int UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2}; 0392 0393 const SMModel UCS2BESMModel = { 0394 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls}, 0395 6, 0396 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st}, 0397 UCS2BECharLenTable, 0398 "UTF-16BE", 0399 }; 0400 0401 static const unsigned int UCS2LE_cls[256 / 8] = { 0402 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 00 - 07 0403 PCK4BITS(0, 0, 1, 0, 0, 2, 0, 0), // 08 - 0f 0404 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17 0405 PCK4BITS(0, 0, 0, 3, 0, 0, 0, 0), // 18 - 1f 0406 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27 0407 PCK4BITS(0, 3, 3, 3, 3, 3, 0, 0), // 28 - 2f 0408 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37 0409 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f 0410 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 40 - 47 0411 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f 0412 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57 0413 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f 0414 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67 0415 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f 0416 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77 0417 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f 0418 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87 0419 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f 0420 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97 0421 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f 0422 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a0 - a7 0423 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a8 - af 0424 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b0 - b7 0425 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b8 - bf 0426 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c0 - c7 0427 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c8 - cf 0428 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d0 - d7 0429 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d8 - df 0430 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7 0431 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef 0432 PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7 0433 PCK4BITS(0, 0, 0, 0, 0, 0, 4, 5) // f8 - ff 0434 }; 0435 0436 static const unsigned int UCS2LE_st[7] = { 0437 PCK4BITS(6, 6, 7, 6, 4, 3, eError, eError), // 00-07 0438 PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f 0439 PCK4BITS(eItsMe, eItsMe, 5, 5, 5, eError, eItsMe, eError), // 10-17 0440 PCK4BITS(5, 5, 5, eError, 5, eError, 6, 6), // 18-1f 0441 PCK4BITS(7, 6, 8, 8, 5, 5, 5, eError), // 20-27 0442 PCK4BITS(5, 5, 5, eError, eError, eError, 5, 5), // 28-2f 0443 PCK4BITS(5, 5, 5, eError, 5, eError, eStart, eStart) // 30-37 0444 }; 0445 0446 static const unsigned int UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2}; 0447 0448 const SMModel UCS2LESMModel = { 0449 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls}, 0450 6, 0451 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st}, 0452 UCS2LECharLenTable, 0453 "UTF-16LE", 0454 }; 0455 0456 static const unsigned int UTF8_cls[256 / 8] = { 0457 // PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 0458 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 //allow 0x00 as a legal value 0459 PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f 0460 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 0461 PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f 0462 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 0463 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f 0464 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 0465 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f 0466 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 40 - 47 0467 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 48 - 4f 0468 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 50 - 57 0469 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 58 - 5f 0470 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 60 - 67 0471 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f 0472 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77 0473 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f 0474 PCK4BITS(2, 2, 2, 2, 3, 3, 3, 3), // 80 - 87 0475 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 88 - 8f 0476 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 90 - 97 0477 PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 98 - 9f 0478 PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // a0 - a7 0479 PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // a8 - af 0480 PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // b0 - b7 0481 PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // b8 - bf 0482 PCK4BITS(0, 0, 6, 6, 6, 6, 6, 6), // c0 - c7 0483 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c8 - cf 0484 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d0 - d7 0485 PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d8 - df 0486 PCK4BITS(7, 8, 8, 8, 8, 8, 8, 8), // e0 - e7 0487 PCK4BITS(8, 8, 8, 8, 8, 9, 8, 8), // e8 - ef 0488 PCK4BITS(10, 11, 11, 11, 11, 11, 11, 11), // f0 - f7 0489 PCK4BITS(12, 13, 13, 13, 14, 15, 0, 0) // f8 - ff 0490 }; 0491 0492 static const unsigned int UTF8_st[26] = { 0493 PCK4BITS(eError, eStart, eError, eError, eError, eError, 12, 10), // 00-07 0494 PCK4BITS(9, 11, 8, 7, 6, 5, 4, 3), // 08-0f 0495 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 10-17 0496 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 18-1f 0497 PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe), // 20-27 0498 PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe), // 28-2f 0499 PCK4BITS(eError, eError, 5, 5, 5, 5, eError, eError), // 30-37 0500 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 38-3f 0501 PCK4BITS(eError, eError, eError, 5, 5, 5, eError, eError), // 40-47 0502 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 48-4f 0503 PCK4BITS(eError, eError, 7, 7, 7, 7, eError, eError), // 50-57 0504 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 58-5f 0505 PCK4BITS(eError, eError, eError, eError, 7, 7, eError, eError), // 60-67 0506 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 68-6f 0507 PCK4BITS(eError, eError, 9, 9, 9, 9, eError, eError), // 70-77 0508 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 78-7f 0509 PCK4BITS(eError, eError, eError, eError, eError, 9, eError, eError), // 80-87 0510 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 88-8f 0511 PCK4BITS(eError, eError, 12, 12, 12, 12, eError, eError), // 90-97 0512 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 98-9f 0513 PCK4BITS(eError, eError, eError, eError, eError, 12, eError, eError), // a0-a7 0514 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // a8-af 0515 PCK4BITS(eError, eError, 12, 12, 12, eError, eError, eError), // b0-b7 0516 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // b8-bf 0517 PCK4BITS(eError, eError, eStart, eStart, eStart, eStart, eError, eError), // c0-c7 0518 PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError) // c8-cf 0519 }; 0520 0521 static const unsigned int UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6}; 0522 0523 const SMModel UTF8SMModel = { 0524 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls}, 0525 16, 0526 {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st}, 0527 UTF8CharLenTable, 0528 "UTF-8", 0529 }; 0530 }