File indexing completed on 2024-04-28 03:53:04

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003 
0004     SPDX-License-Identifier: MIT
0005 */
0006 
0007 #include "nsCodingStateMachine.h"
0008 
0009 /*
0010 Modification from frank tang's original work:
0011 . 0x00 is allowed as a legal character, since some web pages contain this char in
0012   their text stream.
0013 */
0014 
0015 // BIG5
0016 
0017 namespace kencodingprober
0018 {
0019 static const unsigned int BIG5_cls[256 / 8] = {
         // Byte-class table for Big5: one class number (0-4) for each byte value
         // 0x00-0xff, eight byte values per PCK4BITS row (range in the row comment).
         //   class 0: 0x0e, 0x0f, 0x1b, 0xff
         //   class 1: every other byte below 0x40, plus 0x7f
         //   class 2: 0x40-0x7e
         //   class 3: 0xa1-0xfe
         //   class 4: 0x80-0xa0
         // NOTE(review): PCK4BITS is defined in nsCodingStateMachine.h; presumably it
         // packs its eight 4-bit arguments into one unsigned int — confirm there.
0020     // PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07
0021     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07    //allow 0x00 as legal value
0022     PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f
0023     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17
0024     PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f
0025     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27
0026     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f
0027     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37
0028     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f
0029     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47
0030     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f
0031     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57
0032     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f
0033     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67
0034     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f
0035     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77
0036     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 1), // 78 - 7f
0037     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 80 - 87
0038     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 88 - 8f
0039     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 90 - 97
0040     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 98 - 9f
0041     PCK4BITS(4, 3, 3, 3, 3, 3, 3, 3), // a0 - a7
0042     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // a8 - af
0043     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // b0 - b7
0044     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // b8 - bf
0045     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // c0 - c7
0046     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // c8 - cf
0047     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // d0 - d7
0048     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // d8 - df
0049     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // e0 - e7
0050     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // e8 - ef
0051     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // f0 - f7
0052     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 0) // f8 - ff
0053 };
0054 
0055 static const unsigned int BIG5_st[3] = {
         // State-transition entries for the Big5 model: 3 rows x 8 = 24 entries, each
         // eStart, eError, eItsMe or the numbered state 3. Row comments give the flat
         // entry-index range in hex.
         // NOTE(review): presumably looked up as (state * class count + byte class);
         // the lookup code is not in this file — confirm in nsCodingStateMachine.h.
0056     PCK4BITS(eError, eStart, eStart, 3, eError, eError, eError, eError), // 00-07
0057     PCK4BITS(eError, eError, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eError), // 08-0f
0058     PCK4BITS(eError, eStart, eStart, eStart, eStart, eStart, eStart, eStart) // 10-17
0059 };
0060 
0061 static const unsigned int Big5CharLenTable[] = {0, 1, 1, 2, 0}; // one entry per byte class 0-4; presumably the character length in bytes for that class
0062 
0063 const SMModel Big5SMModel = {
         // Bundles the Big5 tables into one model object.
         // NOTE(review): SMModel is declared in nsCodingStateMachine.h; the field roles
         // noted below are inferred from initializer order and table names — confirm.
0064     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls}, // byte-class table with its 4-bit unpacking constants
0065     5, // class count; equals the number of entries in Big5CharLenTable
0066     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st}, // state table, packed the same way
0067     Big5CharLenTable, // per-class length table
0068     "Big5", // charset name carried by the model
0069 };
0070 
0071 static const unsigned int EUCJP_cls[256 / 8] = {
         // Byte-class table for EUC-JP: one class number (0-5) for each byte value
         // 0x00-0xff, eight byte values per PCK4BITS row (range in the row comment).
         //   class 0: 0xe0-0xfe
         //   class 1: 0x8e
         //   class 2: 0xa1-0xdf
         //   class 3: 0x8f
         //   class 4: 0x00-0x7f except 0x0e, 0x0f, 0x1b
         //   class 5: 0x0e, 0x0f, 0x1b, 0x80-0x8d, 0x90-0xa0, 0xff
         // NOTE(review): PCK4BITS is defined in nsCodingStateMachine.h; presumably it
         // packs its eight 4-bit arguments into one unsigned int — confirm there.
0072     // PCK4BITS(5,4,4,4,4,4,4,4),  // 00 - 07
0073     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 00 - 07
0074     PCK4BITS(4, 4, 4, 4, 4, 4, 5, 5), // 08 - 0f
0075     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 10 - 17
0076     PCK4BITS(4, 4, 4, 5, 4, 4, 4, 4), // 18 - 1f
0077     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 20 - 27
0078     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 28 - 2f
0079     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 30 - 37
0080     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 38 - 3f
0081     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 40 - 47
0082     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 48 - 4f
0083     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 50 - 57
0084     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 58 - 5f
0085     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 60 - 67
0086     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 68 - 6f
0087     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 70 - 77
0088     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 78 - 7f
0089     PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // 80 - 87
0090     PCK4BITS(5, 5, 5, 5, 5, 5, 1, 3), // 88 - 8f
0091     PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // 90 - 97
0092     PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // 98 - 9f
0093     PCK4BITS(5, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
0094     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
0095     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
0096     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
0097     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
0098     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
0099     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
0100     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
0101     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7
0102     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef
0103     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7
0104     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 5) // f8 - ff
0105 };
0106 
0107 static const unsigned int EUCJP_st[5] = {
         // State-transition entries for the EUC-JP model: 5 rows x 8 = 40 entries, each
         // eStart, eError, eItsMe or a numbered state (3, 4 or 5). Row comments give
         // the flat entry-index range in hex.
         // NOTE(review): presumably looked up as (state * class count + byte class);
         // the lookup code is not in this file — confirm in nsCodingStateMachine.h.
0108     PCK4BITS(3, 4, 3, 5, eStart, eError, eError, eError), // 00-07
0109     PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f
0110     PCK4BITS(eItsMe, eItsMe, eStart, eError, eStart, eError, eError, eError), // 10-17
0111     PCK4BITS(eError, eError, eStart, eError, eError, eError, 3, eError), // 18-1f
0112     PCK4BITS(3, eError, eError, eError, eStart, eStart, eStart, eStart) // 20-27
0113 };
0114 
0115 static const unsigned int EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0}; // one entry per byte class 0-5; presumably the character length in bytes for that class
0116 
0117 const SMModel EUCJPSMModel = {
         // Bundles the EUC-JP tables into one model object.
         // NOTE(review): SMModel is declared in nsCodingStateMachine.h; the field roles
         // noted below are inferred from initializer order and table names — confirm.
0118     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls}, // byte-class table with its 4-bit unpacking constants
0119     6, // class count; equals the number of entries in EUCJPCharLenTable
0120     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st}, // state table, packed the same way
0121     EUCJPCharLenTable, // per-class length table
0122     "EUC-JP", // charset name carried by the model
0123 };
0124 
0125 static const unsigned int EUCKR_cls[256 / 8] = {
         // Byte-class table for EUC-KR: one class number (0-3) for each byte value
         // 0x00-0xff, eight byte values per PCK4BITS row (range in the row comment).
         //   class 0: 0x0e, 0x0f, 0x1b, 0x80-0xa0, 0xff
         //   class 1: every other byte below 0x80
         //   class 2: 0xa1-0xfe except 0xad-0xaf and 0xc9
         //   class 3: 0xad-0xaf, 0xc9
         // NOTE(review): PCK4BITS is defined in nsCodingStateMachine.h; presumably it
         // packs its eight 4-bit arguments into one unsigned int — confirm there.
0126     // PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07
0127     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07
0128     PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f
0129     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17
0130     PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f
0131     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27
0132     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f
0133     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37
0134     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f
0135     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 40 - 47
0136     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 48 - 4f
0137     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 50 - 57
0138     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 58 - 5f
0139     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 60 - 67
0140     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f
0141     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77
0142     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f
0143     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87
0144     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f
0145     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97
0146     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f
0147     PCK4BITS(0, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
0148     PCK4BITS(2, 2, 2, 2, 2, 3, 3, 3), // a8 - af
0149     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
0150     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
0151     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
0152     PCK4BITS(2, 3, 2, 2, 2, 2, 2, 2), // c8 - cf
0153     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
0154     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
0155     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
0156     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
0157     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
0158     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 0) // f8 - ff
0159 };
0160 
0161 static const unsigned int EUCKR_st[2] = {
         // State-transition entries for the EUC-KR model: 2 rows x 8 = 16 entries, each
         // eStart, eError, eItsMe or the numbered state 3. Row comments give the flat
         // entry-index range in hex.
         // NOTE(review): presumably looked up as (state * class count + byte class);
         // the lookup code is not in this file — confirm in nsCodingStateMachine.h.
0162     PCK4BITS(eError, eStart, 3, eError, eError, eError, eError, eError), // 00-07
0163     PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eError, eError, eStart, eStart) // 08-0f
0164 };
0165 
0166 static const unsigned int EUCKRCharLenTable[] = {0, 1, 2, 0}; // one entry per byte class 0-3; presumably the character length in bytes for that class
0167 
0168 const SMModel EUCKRSMModel = {
         // Bundles the EUC-KR tables into one model object.
         // NOTE(review): SMModel is declared in nsCodingStateMachine.h; the field roles
         // noted below are inferred from initializer order and table names — confirm.
0169     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls}, // byte-class table with its 4-bit unpacking constants
0170     4, // class count; equals the number of entries in EUCKRCharLenTable
0171     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st}, // state table, packed the same way
0172     EUCKRCharLenTable, // per-class length table
0173     "EUC-KR", // charset name carried by the model
0174 };
0175 
0176 /* GB2312 model: obsolete, superseded by the GB18030 model below
0177 static unsigned int GB2312_cls [ 256 / 8 ] = {
0178 //PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07
0179 PCK4BITS(1,1,1,1,1,1,1,1),  // 00 - 07
0180 PCK4BITS(1,1,1,1,1,1,0,0),  // 08 - 0f
0181 PCK4BITS(1,1,1,1,1,1,1,1),  // 10 - 17
0182 PCK4BITS(1,1,1,0,1,1,1,1),  // 18 - 1f
0183 PCK4BITS(1,1,1,1,1,1,1,1),  // 20 - 27
0184 PCK4BITS(1,1,1,1,1,1,1,1),  // 28 - 2f
0185 PCK4BITS(1,1,1,1,1,1,1,1),  // 30 - 37
0186 PCK4BITS(1,1,1,1,1,1,1,1),  // 38 - 3f
0187 PCK4BITS(1,1,1,1,1,1,1,1),  // 40 - 47
0188 PCK4BITS(1,1,1,1,1,1,1,1),  // 48 - 4f
0189 PCK4BITS(1,1,1,1,1,1,1,1),  // 50 - 57
0190 PCK4BITS(1,1,1,1,1,1,1,1),  // 58 - 5f
0191 PCK4BITS(1,1,1,1,1,1,1,1),  // 60 - 67
0192 PCK4BITS(1,1,1,1,1,1,1,1),  // 68 - 6f
0193 PCK4BITS(1,1,1,1,1,1,1,1),  // 70 - 77
0194 PCK4BITS(1,1,1,1,1,1,1,1),  // 78 - 7f
0195 PCK4BITS(1,0,0,0,0,0,0,0),  // 80 - 87
0196 PCK4BITS(0,0,0,0,0,0,0,0),  // 88 - 8f
0197 PCK4BITS(0,0,0,0,0,0,0,0),  // 90 - 97
0198 PCK4BITS(0,0,0,0,0,0,0,0),  // 98 - 9f
0199 PCK4BITS(0,2,2,2,2,2,2,2),  // a0 - a7
0200 PCK4BITS(2,2,3,3,3,3,3,3),  // a8 - af
0201 PCK4BITS(2,2,2,2,2,2,2,2),  // b0 - b7
0202 PCK4BITS(2,2,2,2,2,2,2,2),  // b8 - bf
0203 PCK4BITS(2,2,2,2,2,2,2,2),  // c0 - c7
0204 PCK4BITS(2,2,2,2,2,2,2,2),  // c8 - cf
0205 PCK4BITS(2,2,2,2,2,2,2,2),  // d0 - d7
0206 PCK4BITS(2,2,2,2,2,2,2,2),  // d8 - df
0207 PCK4BITS(2,2,2,2,2,2,2,2),  // e0 - e7
0208 PCK4BITS(2,2,2,2,2,2,2,2),  // e8 - ef
0209 PCK4BITS(2,2,2,2,2,2,2,2),  // f0 - f7
0210 PCK4BITS(2,2,2,2,2,2,2,0)   // f8 - ff
0211 };
0212 
0213 static unsigned int GB2312_st [ 2] = {
0214 PCK4BITS(eError,eStart,     3,eError,eError,eError,eError,eError),//00-07
0215 PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
0216 };
0217 
0218 static const unsigned int GB2312CharLenTable[] = {0, 1, 2, 0};
0219 
0220 SMModel GB2312SMModel = {
0221   {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_cls },
0222    4,
0223   {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_st },
0224   GB2312CharLenTable,
0225   "GB2312",
0226 };
0227 */
0228 
0229 // the following state machine data was created by perl script in
0230 // intl/chardet/tools. It should be the same as in PSM detector.
0231 static const unsigned int GB18030_cls[256 / 8] = {
         // Byte-class table for GB18030: one class number (0-6) for each byte value
         // 0x00-0xff, eight byte values per PCK4BITS row (range in the row comment).
         //   class 0: 0x0e, 0x0f, 0x1b, 0xff
         //   class 1: every other byte below 0x30, plus 0x3a-0x3f
         //   class 2: 0x40-0x7e
         //   class 3: 0x30-0x39
         //   class 4: 0x7f
         //   class 5: 0x80
         //   class 6: 0x81-0xfe
         // NOTE(review): PCK4BITS is defined in nsCodingStateMachine.h; presumably it
         // packs its eight 4-bit arguments into one unsigned int — confirm there.
0232     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07
0233     PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f
0234     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17
0235     PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f
0236     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27
0237     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f
0238     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 30 - 37
0239     PCK4BITS(3, 3, 1, 1, 1, 1, 1, 1), // 38 - 3f
0240     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47
0241     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f
0242     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57
0243     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f
0244     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67
0245     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f
0246     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77
0247     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 4), // 78 - 7f
0248     PCK4BITS(5, 6, 6, 6, 6, 6, 6, 6), // 80 - 87
0249     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // 88 - 8f
0250     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // 90 - 97
0251     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // 98 - 9f
0252     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // a0 - a7
0253     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // a8 - af
0254     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // b0 - b7
0255     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // b8 - bf
0256     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c0 - c7
0257     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c8 - cf
0258     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d0 - d7
0259     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d8 - df
0260     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // e0 - e7
0261     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // e8 - ef
0262     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // f0 - f7
0263     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 0) // f8 - ff
0264 };
0265 
0266 static const unsigned int GB18030_st[6] = {
         // State-transition entries for the GB18030 model: 6 rows x 8 = 48 entries,
         // each eStart, eError, eItsMe or a numbered state (3, 4 or 5). Row comments
         // give the flat entry-index range in hex.
         // NOTE(review): presumably looked up as (state * class count + byte class);
         // the lookup code is not in this file — confirm in nsCodingStateMachine.h.
0267     PCK4BITS(eError, eStart, eStart, eStart, eStart, eStart, 3, eError), // 00-07
0268     PCK4BITS(eError, eError, eError, eError, eError, eError, eItsMe, eItsMe), // 08-0f
0269     PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eError, eError, eStart), // 10-17
0270     PCK4BITS(4, eError, eStart, eStart, eError, eError, eError, eError), // 18-1f
0271     PCK4BITS(eError, eError, 5, eError, eError, eError, eItsMe, eError), // 20-27
0272     PCK4BITS(eError, eError, eStart, eStart, eStart, eStart, eStart, eStart) // 28-2f
0273 };
0274 
0275 // To be accurate, the length of class 6 can be either 2 or 4.
0276 // But it is not necessary to discriminate between the two since
0277 // it is used for frequency analysis only, and we are validating
0278 // each code range there as well. So it is safe to set it to be
0279 // 2 here.
0280 static const unsigned int GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2}; // one entry per byte class 0-6; the last entry is the class-6 length
0281 
0282 const SMModel GB18030SMModel = {
         // Bundles the GB18030 tables into one model object.
         // NOTE(review): SMModel is declared in nsCodingStateMachine.h; the field roles
         // noted below are inferred from initializer order and table names — confirm.
0283     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls}, // byte-class table with its 4-bit unpacking constants
0284     7, // class count; equals the number of entries in GB18030CharLenTable
0285     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st}, // state table, packed the same way
0286     GB18030CharLenTable, // per-class length table
0287     "GB18030", // charset name carried by the model
0288 };
0289 
0290 // sjis
0291 
0292 static const unsigned int SJIS_cls[256 / 8] = {
         // Byte-class table for Shift_JIS: one class number for each byte value
         // 0x00-0xff, eight byte values per PCK4BITS row (range in the row comment).
         //   class 0: 0x0e, 0x0f, 0x1b, 0xfd-0xff
         //   class 1: every other byte below 0x40, plus 0x7f
         //   class 2: 0x40-0x7e, 0xa0-0xdf
         //   class 3: 0x80-0x9f, 0xe0-0xec
         //   class 4: 0xed-0xfc
         // Only classes 0-4 appear here, although SJISSMModel gives a class count of 6
         // and SJISCharLenTable has 6 entries.
         // NOTE(review): PCK4BITS is defined in nsCodingStateMachine.h; presumably it
         // packs its eight 4-bit arguments into one unsigned int — confirm there.
0293     // PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07
0294     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07
0295     PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f
0296     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17
0297     PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f
0298     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27
0299     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f
0300     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37
0301     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f
0302     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47
0303     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f
0304     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57
0305     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f
0306     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67
0307     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f
0308     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77
0309     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 1), // 78 - 7f
0310     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 80 - 87
0311     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 88 - 8f
0312     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 90 - 97
0313     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 98 - 9f
0314     // 0xa0 is illegal in the SJIS encoding, but some pages do
0315     // contain this byte, so we need to be more forgiving of errors.
0316     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
0317     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
0318     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
0319     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
0320     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
0321     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
0322     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
0323     PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
0324     PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // e0 - e7
0325     PCK4BITS(3, 3, 3, 3, 3, 4, 4, 4), // e8 - ef
0326     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // f0 - f7
0327     PCK4BITS(4, 4, 4, 4, 4, 0, 0, 0) // f8 - ff
0328 };
0329 
0330 static const unsigned int SJIS_st[3] = {
         // State-transition entries for the Shift_JIS model: 3 rows x 8 = 24 entries,
         // each eStart, eError, eItsMe or the numbered state 3. Row comments give the
         // flat entry-index range in hex.
         // NOTE(review): presumably looked up as (state * class count + byte class);
         // the lookup code is not in this file — confirm in nsCodingStateMachine.h.
0331     PCK4BITS(eError, eStart, eStart, 3, eError, eError, eError, eError), // 00-07
0332     PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f
0333     PCK4BITS(eItsMe, eItsMe, eError, eError, eStart, eStart, eStart, eStart) // 10-17
0334 };
0335 
0336 static const unsigned int SJISCharLenTable[] = {0, 1, 1, 2, 0, 0}; // six entries, indexed by byte class; presumably the character length in bytes for that class
0337 
0338 const SMModel SJISSMModel = {
         // Bundles the Shift_JIS tables into one model object.
         // NOTE(review): SMModel is declared in nsCodingStateMachine.h; the field roles
         // noted below are inferred from initializer order and table names — confirm.
0339     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls}, // byte-class table with its 4-bit unpacking constants
0340     6, // class count; equals the number of entries in SJISCharLenTable
0341     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st}, // state table, packed the same way
0342     SJISCharLenTable, // per-class length table
0343     "Shift_JIS", // charset name carried by the model
0344 };
0345 
0346 static const unsigned int UCS2BE_cls[256 / 8] = {
         // Byte-class table for the UTF-16BE model: one class number (0-5) for each
         // byte value 0x00-0xff, eight byte values per PCK4BITS row.
         // Every byte is class 0 except:
         //   class 1: 0x0a
         //   class 2: 0x0d
         //   class 3: 0x1b, 0x29-0x2d
         //   class 4: 0xfe
         //   class 5: 0xff
         // NOTE(review): PCK4BITS is defined in nsCodingStateMachine.h; presumably it
         // packs its eight 4-bit arguments into one unsigned int — confirm there.
0347     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
0348     PCK4BITS(0, 0, 1, 0, 0, 2, 0, 0), // 08 - 0f
0349     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
0350     PCK4BITS(0, 0, 0, 3, 0, 0, 0, 0), // 18 - 1f
0351     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27
0352     PCK4BITS(0, 3, 3, 3, 3, 3, 0, 0), // 28 - 2f
0353     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
0354     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
0355     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 40 - 47
0356     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
0357     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
0358     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
0359     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
0360     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
0361     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
0362     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
0363     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87
0364     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f
0365     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97
0366     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f
0367     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a0 - a7
0368     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a8 - af
0369     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b0 - b7
0370     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b8 - bf
0371     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c0 - c7
0372     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c8 - cf
0373     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d0 - d7
0374     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d8 - df
0375     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7
0376     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef
0377     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7
0378     PCK4BITS(0, 0, 0, 0, 0, 0, 4, 5) // f8 - ff
0379 };
0380 
0381 static const unsigned int UCS2BE_st[7] = {
         // State-transition entries for the UTF-16BE model: 7 rows x 8 = 56 entries,
         // each eStart, eError, eItsMe or a numbered state (3 through 8). Row comments
         // give the flat entry-index range in hex.
         // NOTE(review): presumably looked up as (state * class count + byte class);
         // the lookup code is not in this file — confirm in nsCodingStateMachine.h.
0382     PCK4BITS(5, 7, 7, eError, 4, 3, eError, eError), // 00-07
0383     PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f
0384     PCK4BITS(eItsMe, eItsMe, 6, 6, 6, 6, eError, eError), // 10-17
0385     PCK4BITS(6, 6, 6, 6, 6, eItsMe, 6, 6), // 18-1f
0386     PCK4BITS(6, 6, 6, 6, 5, 7, 7, eError), // 20-27
0387     PCK4BITS(5, 8, 6, 6, eError, 6, 6, 6), // 28-2f
0388     PCK4BITS(6, 6, 6, 6, eError, eError, eStart, eStart) // 30-37
0389 };
0390 
0391 static const unsigned int UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2}; // one entry per byte class 0-5; presumably the character length in bytes for that class
0392 
0393 const SMModel UCS2BESMModel = {
         // Bundles the UCS2BE tables into one model object, named "UTF-16BE".
         // NOTE(review): SMModel is declared in nsCodingStateMachine.h; the field roles
         // noted below are inferred from initializer order and table names — confirm.
0394     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls}, // byte-class table with its 4-bit unpacking constants
0395     6, // class count; equals the number of entries in UCS2BECharLenTable
0396     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st}, // state table, packed the same way
0397     UCS2BECharLenTable, // per-class length table
0398     "UTF-16BE", // charset name carried by the model
0399 };
0400 
0401 static const unsigned int UCS2LE_cls[256 / 8] = {
         // Byte-class table for the UTF-16LE model: one class number (0-5) for each
         // byte value 0x00-0xff, eight byte values per PCK4BITS row. The rows are
         // the same as in UCS2BE_cls.
         // Every byte is class 0 except:
         //   class 1: 0x0a
         //   class 2: 0x0d
         //   class 3: 0x1b, 0x29-0x2d
         //   class 4: 0xfe
         //   class 5: 0xff
         // NOTE(review): PCK4BITS is defined in nsCodingStateMachine.h; presumably it
         // packs its eight 4-bit arguments into one unsigned int — confirm there.
0402     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
0403     PCK4BITS(0, 0, 1, 0, 0, 2, 0, 0), // 08 - 0f
0404     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
0405     PCK4BITS(0, 0, 0, 3, 0, 0, 0, 0), // 18 - 1f
0406     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27
0407     PCK4BITS(0, 3, 3, 3, 3, 3, 0, 0), // 28 - 2f
0408     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
0409     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
0410     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 40 - 47
0411     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
0412     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
0413     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
0414     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
0415     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
0416     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
0417     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
0418     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87
0419     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f
0420     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97
0421     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f
0422     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a0 - a7
0423     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a8 - af
0424     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b0 - b7
0425     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b8 - bf
0426     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c0 - c7
0427     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c8 - cf
0428     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d0 - d7
0429     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d8 - df
0430     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7
0431     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef
0432     PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7
0433     PCK4BITS(0, 0, 0, 0, 0, 0, 4, 5) // f8 - ff
0434 };
0435 
0436 static const unsigned int UCS2LE_st[7] = {
         // State-transition entries for the UTF-16LE model: 7 rows x 8 = 56 entries,
         // each eStart, eError, eItsMe or a numbered state (3 through 8). Row comments
         // give the flat entry-index range in hex.
         // NOTE(review): presumably looked up as (state * class count + byte class);
         // the lookup code is not in this file — confirm in nsCodingStateMachine.h.
0437     PCK4BITS(6, 6, 7, 6, 4, 3, eError, eError), // 00-07
0438     PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f
0439     PCK4BITS(eItsMe, eItsMe, 5, 5, 5, eError, eItsMe, eError), // 10-17
0440     PCK4BITS(5, 5, 5, eError, 5, eError, 6, 6), // 18-1f
0441     PCK4BITS(7, 6, 8, 8, 5, 5, 5, eError), // 20-27
0442     PCK4BITS(5, 5, 5, eError, eError, eError, 5, 5), // 28-2f
0443     PCK4BITS(5, 5, 5, eError, 5, eError, eStart, eStart) // 30-37
0444 };
0445 
0446 static const unsigned int UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2}; // one entry per byte class 0-5; presumably the character length in bytes for that class
0447 
0448 const SMModel UCS2LESMModel = {
         // Bundles the UCS2LE tables into one model object, named "UTF-16LE".
         // NOTE(review): SMModel is declared in nsCodingStateMachine.h; the field roles
         // noted below are inferred from initializer order and table names — confirm.
0449     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls}, // byte-class table with its 4-bit unpacking constants
0450     6, // class count; equals the number of entries in UCS2LECharLenTable
0451     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st}, // state table, packed the same way
0452     UCS2LECharLenTable, // per-class length table
0453     "UTF-16LE", // charset name carried by the model
0454 };
0455 
0456 static const unsigned int UTF8_cls[256 / 8] = {
         // Byte-class table for UTF-8: one class number (0-15) for each byte value
         // 0x00-0xff, eight byte values per PCK4BITS row (range in the row comment).
         //   class 0:  0x0e, 0x0f, 0x1b, 0xc0, 0xc1, 0xfe, 0xff
         //   class 1:  every other byte below 0x80
         //   class 2:  0x80-0x83      class 3:  0x84-0x87
         //   class 4:  0x88-0x9f      class 5:  0xa0-0xbf
         //   class 6:  0xc2-0xdf      class 7:  0xe0
         //   class 8:  0xe1-0xec, 0xee, 0xef
         //   class 9:  0xed           class 10: 0xf0
         //   class 11: 0xf1-0xf7      class 12: 0xf8
         //   class 13: 0xf9-0xfb      class 14: 0xfc
         //   class 15: 0xfd
         // NOTE(review): PCK4BITS is defined in nsCodingStateMachine.h; presumably it
         // packs its eight 4-bit arguments into one unsigned int — confirm there.
0457     // PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07
0458     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07  //allow 0x00 as a legal value
0459     PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f
0460     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17
0461     PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f
0462     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27
0463     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f
0464     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37
0465     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f
0466     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 40 - 47
0467     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 48 - 4f
0468     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 50 - 57
0469     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 58 - 5f
0470     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 60 - 67
0471     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f
0472     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77
0473     PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f
0474     PCK4BITS(2, 2, 2, 2, 3, 3, 3, 3), // 80 - 87
0475     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 88 - 8f
0476     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 90 - 97
0477     PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 98 - 9f
0478     PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // a0 - a7
0479     PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // a8 - af
0480     PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // b0 - b7
0481     PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // b8 - bf
0482     PCK4BITS(0, 0, 6, 6, 6, 6, 6, 6), // c0 - c7
0483     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c8 - cf
0484     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d0 - d7
0485     PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d8 - df
0486     PCK4BITS(7, 8, 8, 8, 8, 8, 8, 8), // e0 - e7
0487     PCK4BITS(8, 8, 8, 8, 8, 9, 8, 8), // e8 - ef
0488     PCK4BITS(10, 11, 11, 11, 11, 11, 11, 11), // f0 - f7
0489     PCK4BITS(12, 13, 13, 13, 14, 15, 0, 0) // f8 - ff
0490 };
0491 
0492 static const unsigned int UTF8_st[26] = {
         // State-transition entries for the UTF-8 model: 26 rows x 8 = 208 entries,
         // each eStart, eError, eItsMe or a numbered state (3 through 12). Row comments
         // give the flat entry-index range in hex.
         // NOTE(review): presumably looked up as (state * class count + byte class);
         // with the class count of 16 given in UTF8SMModel, every two rows would form
         // one state's transitions — confirm in nsCodingStateMachine.h.
0493     PCK4BITS(eError, eStart, eError, eError, eError, eError, 12, 10), // 00-07
0494     PCK4BITS(9, 11, 8, 7, 6, 5, 4, 3), // 08-0f
0495     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 10-17
0496     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 18-1f
0497     PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe), // 20-27
0498     PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe), // 28-2f
0499     PCK4BITS(eError, eError, 5, 5, 5, 5, eError, eError), // 30-37
0500     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 38-3f
0501     PCK4BITS(eError, eError, eError, 5, 5, 5, eError, eError), // 40-47
0502     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 48-4f
0503     PCK4BITS(eError, eError, 7, 7, 7, 7, eError, eError), // 50-57
0504     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 58-5f
0505     PCK4BITS(eError, eError, eError, eError, 7, 7, eError, eError), // 60-67
0506     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 68-6f
0507     PCK4BITS(eError, eError, 9, 9, 9, 9, eError, eError), // 70-77
0508     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 78-7f
0509     PCK4BITS(eError, eError, eError, eError, eError, 9, eError, eError), // 80-87
0510     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 88-8f
0511     PCK4BITS(eError, eError, 12, 12, 12, 12, eError, eError), // 90-97
0512     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 98-9f
0513     PCK4BITS(eError, eError, eError, eError, eError, 12, eError, eError), // a0-a7
0514     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // a8-af
0515     PCK4BITS(eError, eError, 12, 12, 12, eError, eError, eError), // b0-b7
0516     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // b8-bf
0517     PCK4BITS(eError, eError, eStart, eStart, eStart, eStart, eError, eError), // c0-c7
0518     PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError) // c8-cf
0519 };
0520 
0521 static const unsigned int UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6}; // one entry per byte class 0-15; presumably the sequence length in bytes for that class
0522 
0523 const SMModel UTF8SMModel = {
         // Bundles the UTF-8 tables into one model object.
         // NOTE(review): SMModel is declared in nsCodingStateMachine.h; the field roles
         // noted below are inferred from initializer order and table names — confirm.
0524     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls}, // byte-class table with its 4-bit unpacking constants
0525     16, // class count; equals the number of entries in UTF8CharLenTable
0526     {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st}, // state table, packed the same way
0527     UTF8CharLenTable, // per-class length table
0528     "UTF-8", // charset name carried by the model
0529 };
0530 }