/* File indexing completed on 2024-05-19 04:06:42 */
0001 /* nfkc.c Unicode normalization utilities. 0002 * Copyright (C) 2002, 2003 Simon Josefsson 0003 * 0004 * This file is part of GNU Libidn. 0005 * 0006 * GNU Libidn is free software; you can redistribute it and/or 0007 * modify it under the terms of the GNU Lesser General Public 0008 * License as published by the Free Software Foundation; either 0009 * version 2.1 of the License, or (at your option) any later version. 0010 * 0011 * GNU Libidn is distributed in the hope that it will be useful, 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0014 * Lesser General Public License for more details. 0015 * 0016 * You should have received a copy of the GNU Lesser General Public 0017 * License along with GNU Libidn; if not, write to the Free Software 0018 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 0019 * 0020 */ 0021 0022 #include "internal.h" 0023 0024 /* This file contains functions from GLIB, including gutf8.c and 0025 * gunidecomp.c, all licensed under LGPL and copyright hold by: 0026 * 0027 * Copyright (C) 1999, 2000 Tom Tromey 0028 * Copyright 2000 Red Hat, Inc. 0029 */ 0030 0031 /* Hacks to make syncing with GLIB code easier. 
*/ 0032 #define gboolean int 0033 #define gchar char 0034 #define guchar unsigned char 0035 #define glong long 0036 #define gint int 0037 #define guint unsigned int 0038 #define gushort unsigned short 0039 #define gint16 my_int16_t 0040 #define guint16 my_uint16_t 0041 #define gunichar my_uint32_t 0042 #define gsize size_t 0043 #define gssize ssize_t 0044 #define g_malloc malloc 0045 #define g_free free 0046 #define GError void 0047 #define g_set_error(a,b,c,d) 0 0048 #define g_new(struct_type, n_structs) \ 0049 ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs)))) 0050 # if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus) 0051 # define G_STMT_START (void)( 0052 # define G_STMT_END ) 0053 # else 0054 # if (defined (sun) || defined (__sun__)) 0055 # define G_STMT_START if (1) 0056 # define G_STMT_END else (void)0 0057 # else 0058 # define G_STMT_START do 0059 # define G_STMT_END while (0) 0060 # endif 0061 # endif 0062 #define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END 0063 #define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0])) 0064 #define TRUE 1 0065 #define FALSE 0 0066 0067 /* Code from GLIB gunicode.h starts here. */ 0068 0069 typedef enum 0070 { 0071 G_NORMALIZE_DEFAULT, 0072 G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT, 0073 G_NORMALIZE_DEFAULT_COMPOSE, 0074 G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE, 0075 G_NORMALIZE_ALL, 0076 G_NORMALIZE_NFKD = G_NORMALIZE_ALL, 0077 G_NORMALIZE_ALL_COMPOSE, 0078 G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE 0079 } 0080 GNormalizeMode; 0081 0082 /* Code from GLIB gutf8.c starts here. 
*/ 0083 0084 #define UTF8_COMPUTE(Char, Mask, Len) \ 0085 if (Char < 128) \ 0086 { \ 0087 Len = 1; \ 0088 Mask = 0x7f; \ 0089 } \ 0090 else if ((Char & 0xe0) == 0xc0) \ 0091 { \ 0092 Len = 2; \ 0093 Mask = 0x1f; \ 0094 } \ 0095 else if ((Char & 0xf0) == 0xe0) \ 0096 { \ 0097 Len = 3; \ 0098 Mask = 0x0f; \ 0099 } \ 0100 else if ((Char & 0xf8) == 0xf0) \ 0101 { \ 0102 Len = 4; \ 0103 Mask = 0x07; \ 0104 } \ 0105 else if ((Char & 0xfc) == 0xf8) \ 0106 { \ 0107 Len = 5; \ 0108 Mask = 0x03; \ 0109 } \ 0110 else if ((Char & 0xfe) == 0xfc) \ 0111 { \ 0112 Len = 6; \ 0113 Mask = 0x01; \ 0114 } \ 0115 else \ 0116 Len = -1; 0117 0118 #define UTF8_LENGTH(Char) \ 0119 ((Char) < 0x80 ? 1 : \ 0120 ((Char) < 0x800 ? 2 : \ 0121 ((Char) < 0x10000 ? 3 : \ 0122 ((Char) < 0x200000 ? 4 : \ 0123 ((Char) < 0x4000000 ? 5 : 6))))) 0124 0125 0126 #define UTF8_GET(Result, Chars, Count, Mask, Len) \ 0127 (Result) = (Chars)[0] & (Mask); \ 0128 for ((Count) = 1; (Count) < (Len); ++(Count)) \ 0129 { \ 0130 if (((Chars)[(Count)] & 0xc0) != 0x80) \ 0131 { \ 0132 (Result) = -1; \ 0133 break; \ 0134 } \ 0135 (Result) <<= 6; \ 0136 (Result) |= ((Chars)[(Count)] & 0x3f); \ 0137 } 0138 0139 #define UNICODE_VALID(Char) \ 0140 ((Char) < 0x110000 && \ 0141 (((Char) & 0xFFFFF800) != 0xD800) && \ 0142 ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \ 0143 ((Char) & 0xFFFE) != 0xFFFE) 0144 0145 0146 static const gchar utf8_skip_data[256] = { 0147 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0148 1, 1, 1, 1, 1, 1, 1, 0149 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0150 1, 1, 1, 1, 1, 1, 1, 0151 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0152 1, 1, 1, 1, 1, 1, 1, 0153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0154 1, 1, 1, 1, 1, 1, 1, 0155 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0156 1, 1, 1, 1, 1, 1, 1, 0157 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 0158 1, 1, 1, 1, 1, 1, 1, 0159 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0160 2, 2, 2, 2, 2, 2, 2, 0161 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 0162 5, 5, 5, 6, 6, 1, 1 0163 }; 0164 0165 const gchar *const g_utf8_skip = utf8_skip_data; 0166 0167 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)]) 0168 0169 /** 0170 * g_utf8_strlen: 0171 * @p: pointer to the start of a UTF-8 encoded string. 0172 * @max: the maximum number of bytes to examine. If @max 0173 * is less than 0, then the string is assumed to be 0174 * nul-terminated. If @max is 0, @p will not be examined and 0175 * may be %NULL. 0176 * 0177 * Returns the length of the string in characters. 0178 * 0179 * Return value: the length of the string in characters 0180 **/ 0181 static glong 0182 g_utf8_strlen (const gchar * p, gssize max) 0183 { 0184 glong len = 0; 0185 const gchar *start = p; 0186 g_return_val_if_fail (p != NULL || max == 0, 0); 0187 0188 if (max < 0) 0189 { 0190 while (*p) 0191 { 0192 p = g_utf8_next_char (p); 0193 ++len; 0194 } 0195 } 0196 else 0197 { 0198 if (max == 0 || !*p) 0199 return 0; 0200 0201 p = g_utf8_next_char (p); 0202 0203 while (p - start < max && *p) 0204 { 0205 ++len; 0206 p = g_utf8_next_char (p); 0207 } 0208 0209 /* only do the last len increment if we got a complete 0210 * char (don't count partial chars) 0211 */ 0212 if (p - start == max) 0213 ++len; 0214 } 0215 0216 return len; 0217 } 0218 0219 /** 0220 * g_utf8_get_char: 0221 * @p: a pointer to Unicode character encoded as UTF-8 0222 * 0223 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. 0224 * If @p does not point to a valid UTF-8 encoded character, results are 0225 * undefined. If you are not sure that the bytes are complete 0226 * valid Unicode characters, you should use g_utf8_get_char_validated() 0227 * instead. 
0228 * 0229 * Return value: the resulting character 0230 **/ 0231 static gunichar 0232 g_utf8_get_char (const gchar * p) 0233 { 0234 int i, mask = 0, len; 0235 gunichar result; 0236 unsigned char c = (unsigned char) *p; 0237 0238 UTF8_COMPUTE (c, mask, len); 0239 if (len == -1) 0240 return (gunichar) - 1; 0241 UTF8_GET (result, p, i, mask, len); 0242 0243 return result; 0244 } 0245 0246 /** 0247 * g_unichar_to_utf8: 0248 * @c: a ISO10646 character code 0249 * @outbuf: output buffer, must have at least 6 bytes of space. 0250 * If %NULL, the length will be computed and returned 0251 * and nothing will be written to @outbuf. 0252 * 0253 * Converts a single character to UTF-8. 0254 * 0255 * Return value: number of bytes written 0256 **/ 0257 static int 0258 g_unichar_to_utf8 (gunichar c, gchar * outbuf) 0259 { 0260 guint len = 0; 0261 int first; 0262 int i; 0263 0264 if (c < 0x80) 0265 { 0266 first = 0; 0267 len = 1; 0268 } 0269 else if (c < 0x800) 0270 { 0271 first = 0xc0; 0272 len = 2; 0273 } 0274 else if (c < 0x10000) 0275 { 0276 first = 0xe0; 0277 len = 3; 0278 } 0279 else if (c < 0x200000) 0280 { 0281 first = 0xf0; 0282 len = 4; 0283 } 0284 else if (c < 0x4000000) 0285 { 0286 first = 0xf8; 0287 len = 5; 0288 } 0289 else 0290 { 0291 first = 0xfc; 0292 len = 6; 0293 } 0294 0295 if (outbuf) 0296 { 0297 for (i = len - 1; i > 0; --i) 0298 { 0299 outbuf[i] = (c & 0x3f) | 0x80; 0300 c >>= 6; 0301 } 0302 outbuf[0] = c | first; 0303 } 0304 0305 return len; 0306 } 0307 0308 /** 0309 * g_utf8_to_ucs4_fast: 0310 * @str: a UTF-8 encoded string 0311 * @len: the maximum length of @str to use. If @len < 0, then 0312 * the string is nul-terminated. 0313 * @items_written: location to store the number of characters in the 0314 * result, or %NULL. 0315 * 0316 * Convert a string from UTF-8 to a 32-bit fixed width 0317 * representation as UCS-4, assuming valid UTF-8 input. 0318 * This function is roughly twice as fast as g_utf8_to_ucs4() 0319 * but does no error checking on the input. 
0320 * 0321 * Return value: a pointer to a newly allocated UCS-4 string. 0322 * This value must be freed with g_free(). 0323 **/ 0324 static gunichar * 0325 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written) 0326 { 0327 gint j, charlen; 0328 gunichar *result; 0329 gint n_chars, i; 0330 const gchar *p; 0331 0332 g_return_val_if_fail (str != NULL, NULL); 0333 0334 p = str; 0335 n_chars = 0; 0336 if (len < 0) 0337 { 0338 while (*p) 0339 { 0340 p = g_utf8_next_char (p); 0341 ++n_chars; 0342 } 0343 } 0344 else 0345 { 0346 while (p < str + len && *p) 0347 { 0348 p = g_utf8_next_char (p); 0349 ++n_chars; 0350 } 0351 } 0352 0353 result = g_new (gunichar, n_chars + 1); 0354 0355 p = str; 0356 for (i = 0; i < n_chars; i++) 0357 { 0358 gunichar wc = ((unsigned char *) p)[0]; 0359 0360 if (wc < 0x80) 0361 { 0362 result[i] = wc; 0363 p++; 0364 } 0365 else 0366 { 0367 if (wc < 0xe0) 0368 { 0369 charlen = 2; 0370 wc &= 0x1f; 0371 } 0372 else if (wc < 0xf0) 0373 { 0374 charlen = 3; 0375 wc &= 0x0f; 0376 } 0377 else if (wc < 0xf8) 0378 { 0379 charlen = 4; 0380 wc &= 0x07; 0381 } 0382 else if (wc < 0xfc) 0383 { 0384 charlen = 5; 0385 wc &= 0x03; 0386 } 0387 else 0388 { 0389 charlen = 6; 0390 wc &= 0x01; 0391 } 0392 0393 for (j = 1; j < charlen; j++) 0394 { 0395 wc <<= 6; 0396 wc |= ((unsigned char *) p)[j] & 0x3f; 0397 } 0398 0399 result[i] = wc; 0400 p += charlen; 0401 } 0402 } 0403 result[i] = 0; 0404 0405 if (items_written) 0406 *items_written = i; 0407 0408 return result; 0409 } 0410 0411 /** 0412 * g_ucs4_to_utf8: 0413 * @str: a UCS-4 encoded string 0414 * @len: the maximum length of @str to use. If @len < 0, then 0415 * the string is terminated with a 0 character. 0416 * @items_read: location to store number of characters read read, or %NULL. 0417 * @items_written: location to store number of bytes written or %NULL. 0418 * The value here stored does not include the trailing 0 0419 * byte. 
0420 * @error: location to store the error occuring, or %NULL to ignore 0421 * errors. Any of the errors in #GConvertError other than 0422 * %G_CONVERT_ERROR_NO_CONVERSION may occur. 0423 * 0424 * Convert a string from a 32-bit fixed width representation as UCS-4. 0425 * to UTF-8. The result will be terminated with a 0 byte. 0426 * 0427 * Return value: a pointer to a newly allocated UTF-8 string. 0428 * This value must be freed with g_free(). If an 0429 * error occurs, %NULL will be returned and 0430 * @error set. 0431 **/ 0432 static gchar * 0433 g_ucs4_to_utf8 (const gunichar * str, 0434 glong len, 0435 glong * items_read, glong * items_written, GError ** error) 0436 { 0437 gint result_length; 0438 gchar *result = NULL; 0439 gchar *p; 0440 gint i; 0441 0442 result_length = 0; 0443 for (i = 0; len < 0 || i < len; i++) 0444 { 0445 if (!str[i]) 0446 break; 0447 0448 if (str[i] >= 0x80000000) 0449 { 0450 if (items_read) 0451 *items_read = i; 0452 0453 /*g_set_error (error, G_CONVERT_ERROR, 0454 G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 0455 _("Character out of range for UTF-8"));*/ 0456 goto err_out; 0457 } 0458 0459 result_length += UTF8_LENGTH (str[i]); 0460 } 0461 0462 result = g_malloc (result_length + 1); 0463 p = result; 0464 0465 i = 0; 0466 while (p < result + result_length) 0467 p += g_unichar_to_utf8 (str[i++], p); 0468 0469 *p = '\0'; 0470 0471 if (items_written) 0472 *items_written = p - result; 0473 0474 err_out: 0475 if (items_read) 0476 *items_read = i; 0477 0478 return result; 0479 } 0480 0481 /* Code from GLIB gunidecomp.c starts here. */ 0482 0483 #include "gunidecomp.h" 0484 #include "gunicomp.h" 0485 0486 #define CC_PART1(Page, Char) \ 0487 ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 0488 ? 
(combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 0489 : (cclass_data[combining_class_table_part1[Page]][Char])) 0490 0491 #define CC_PART2(Page, Char) \ 0492 ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 0493 ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 0494 : (cclass_data[combining_class_table_part2[Page]][Char])) 0495 0496 #define COMBINING_CLASS(Char) \ 0497 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \ 0498 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \ 0499 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \ 0500 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ 0501 : 0)) 0502 0503 /* constants for hangul syllable [de]composition */ 0504 #define SBase 0xAC00 0505 #define LBase 0x1100 0506 #define VBase 0x1161 0507 #define TBase 0x11A7 0508 #define LCount 19 0509 #define VCount 21 0510 #define TCount 28 0511 #define NCount (VCount * TCount) 0512 #define SCount (LCount * NCount) 0513 0514 /** 0515 * g_unicode_canonical_ordering: 0516 * @string: a UCS-4 encoded string. 0517 * @len: the maximum length of @string to use. 0518 * 0519 * Computes the canonical ordering of a string in-place. 0520 * This rearranges decomposed characters in the string 0521 * according to their combining classes. See the Unicode 0522 * manual for more information. 0523 **/ 0524 static void 0525 g_unicode_canonical_ordering (gunichar * string, gsize len) 0526 { 0527 gsize i; 0528 int swap = 1; 0529 0530 while (swap) 0531 { 0532 int last; 0533 swap = 0; 0534 last = COMBINING_CLASS (string[0]); 0535 for (i = 0; i < len - 1; ++i) 0536 { 0537 int next = COMBINING_CLASS (string[i + 1]); 0538 if (next != 0 && last > next) 0539 { 0540 gsize j; 0541 /* Percolate item leftward through string. 
*/ 0542 for (j = i + 1; j > 0; --j) 0543 { 0544 gunichar t; 0545 if (COMBINING_CLASS (string[j - 1]) <= next) 0546 break; 0547 t = string[j]; 0548 string[j] = string[j - 1]; 0549 string[j - 1] = t; 0550 swap = 1; 0551 } 0552 /* We're re-entering the loop looking at the old 0553 character again. */ 0554 next = last; 0555 } 0556 last = next; 0557 } 0558 } 0559 } 0560 0561 /* http://www.unicode.org/unicode/reports/tr15/#Hangul 0562 * r should be null or have sufficient space. Calling with r == NULL will 0563 * only calculate the result_len; however, a buffer with space for three 0564 * characters will always be big enough. */ 0565 static void 0566 decompose_hangul (gunichar s, gunichar * r, gsize * result_len) 0567 { 0568 gint SIndex = s - SBase; 0569 0570 /* not a hangul syllable */ 0571 if (SIndex < 0 || SIndex >= SCount) 0572 { 0573 if (r) 0574 r[0] = s; 0575 *result_len = 1; 0576 } 0577 else 0578 { 0579 gunichar L = LBase + SIndex / NCount; 0580 gunichar V = VBase + (SIndex % NCount) / TCount; 0581 gunichar T = TBase + SIndex % TCount; 0582 0583 if (r) 0584 { 0585 r[0] = L; 0586 r[1] = V; 0587 } 0588 0589 if (T != TBase) 0590 { 0591 if (r) 0592 r[2] = T; 0593 *result_len = 3; 0594 } 0595 else 0596 *result_len = 2; 0597 } 0598 } 0599 0600 /* returns a pointer to a null-terminated UTF-8 string */ 0601 static const gchar * 0602 find_decomposition (gunichar ch, gboolean compat) 0603 { 0604 int start = 0; 0605 int end = G_N_ELEMENTS (decomp_table); 0606 0607 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch) 0608 { 0609 while (TRUE) 0610 { 0611 int half = (start + end) / 2; 0612 if (ch == decomp_table[half].ch) 0613 { 0614 int offset; 0615 0616 if (compat) 0617 { 0618 offset = decomp_table[half].compat_offset; 0619 if (offset == G_UNICODE_NOT_PRESENT_OFFSET) 0620 offset = decomp_table[half].canon_offset; 0621 } 0622 else 0623 { 0624 offset = decomp_table[half].canon_offset; 0625 if (offset == G_UNICODE_NOT_PRESENT_OFFSET) 0626 return NULL; 0627 } 0628 
0629 return &(decomp_expansion_string[offset]); 0630 } 0631 else if (half == start) 0632 break; 0633 else if (ch > decomp_table[half].ch) 0634 start = half; 0635 else 0636 end = half; 0637 } 0638 } 0639 0640 return NULL; 0641 } 0642 0643 /* L,V => LV and LV,T => LVT */ 0644 static gboolean 0645 combine_hangul (gunichar a, gunichar b, gunichar * result) 0646 { 0647 gint LIndex = a - LBase; 0648 gint SIndex = a - SBase; 0649 0650 gint VIndex = b - VBase; 0651 gint TIndex = b - TBase; 0652 0653 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount) 0654 { 0655 *result = SBase + (LIndex * VCount + VIndex) * TCount; 0656 return TRUE; 0657 } 0658 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 0659 && 0 <= TIndex && TIndex <= TCount) 0660 { 0661 *result = a + TIndex; 0662 return TRUE; 0663 } 0664 0665 return FALSE; 0666 } 0667 0668 #define CI(Page, Char) \ 0669 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 0670 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 0671 : (compose_data[compose_table[Page]][Char])) 0672 0673 #define COMPOSE_INDEX(Char) \ 0674 ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 
0 : CI((Char) >> 8, (Char) & 0xff)) 0675 0676 static gboolean 0677 combine (gunichar a, gunichar b, gunichar * result) 0678 { 0679 gushort index_a, index_b; 0680 0681 if (combine_hangul (a, b, result)) 0682 return TRUE; 0683 0684 index_a = COMPOSE_INDEX (a); 0685 0686 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) 0687 { 0688 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) 0689 { 0690 *result = 0691 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; 0692 return TRUE; 0693 } 0694 else 0695 return FALSE; 0696 } 0697 0698 index_b = COMPOSE_INDEX (b); 0699 0700 if (index_b >= COMPOSE_SECOND_SINGLE_START) 0701 { 0702 if (a == 0703 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) 0704 { 0705 *result = 0706 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; 0707 return TRUE; 0708 } 0709 else 0710 return FALSE; 0711 } 0712 0713 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START 0714 && index_b >= COMPOSE_SECOND_START 0715 && index_b < COMPOSE_SECOND_SINGLE_START) 0716 { 0717 gunichar res = 0718 compose_array[index_a - COMPOSE_FIRST_START][index_b - 0719 COMPOSE_SECOND_START]; 0720 0721 if (res) 0722 { 0723 *result = res; 0724 return TRUE; 0725 } 0726 } 0727 0728 return FALSE; 0729 } 0730 0731 static gunichar * 0732 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode) 0733 { 0734 gsize n_wc; 0735 gunichar *wc_buffer; 0736 const char *p; 0737 gsize last_start; 0738 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD); 0739 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC); 0740 0741 n_wc = 0; 0742 p = str; 0743 while ((max_len < 0 || p < str + max_len) && *p) 0744 { 0745 const gchar *decomp; 0746 gunichar wc = g_utf8_get_char (p); 0747 0748 if (wc >= 0xac00 && wc <= 0xd7af) 0749 { 0750 gsize result_len; 0751 decompose_hangul (wc, NULL, &result_len); 0752 n_wc += result_len; 0753 } 
0754 else 0755 { 0756 decomp = find_decomposition (wc, do_compat); 0757 0758 if (decomp) 0759 n_wc += g_utf8_strlen (decomp, -1); 0760 else 0761 n_wc++; 0762 } 0763 0764 p = g_utf8_next_char (p); 0765 } 0766 0767 wc_buffer = g_new (gunichar, n_wc + 1); 0768 0769 last_start = 0; 0770 n_wc = 0; 0771 p = str; 0772 while ((max_len < 0 || p < str + max_len) && *p) 0773 { 0774 gunichar wc = g_utf8_get_char (p); 0775 const gchar *decomp; 0776 int cc; 0777 gsize old_n_wc = n_wc; 0778 0779 if (wc >= 0xac00 && wc <= 0xd7af) 0780 { 0781 gsize result_len; 0782 decompose_hangul (wc, wc_buffer + n_wc, &result_len); 0783 n_wc += result_len; 0784 } 0785 else 0786 { 0787 decomp = find_decomposition (wc, do_compat); 0788 0789 if (decomp) 0790 { 0791 const char *pd; 0792 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) 0793 wc_buffer[n_wc++] = g_utf8_get_char (pd); 0794 } 0795 else 0796 wc_buffer[n_wc++] = wc; 0797 } 0798 0799 if (n_wc > 0) 0800 { 0801 cc = COMBINING_CLASS (wc_buffer[old_n_wc]); 0802 0803 if (cc == 0) 0804 { 0805 g_unicode_canonical_ordering (wc_buffer + last_start, 0806 n_wc - last_start); 0807 last_start = old_n_wc; 0808 } 0809 } 0810 0811 p = g_utf8_next_char (p); 0812 } 0813 0814 if (n_wc > 0) 0815 { 0816 g_unicode_canonical_ordering (wc_buffer + last_start, 0817 n_wc - last_start); 0818 last_start = n_wc; 0819 } 0820 0821 wc_buffer[n_wc] = 0; 0822 0823 /* All decomposed and reordered */ 0824 0825 if (do_compose && n_wc > 0) 0826 { 0827 gsize i, j; 0828 int last_cc = 0; 0829 last_start = 0; 0830 0831 for (i = 0; i < n_wc; i++) 0832 { 0833 int cc = COMBINING_CLASS (wc_buffer[i]); 0834 0835 if (i > 0 && 0836 (last_cc == 0 || last_cc != cc) && 0837 combine (wc_buffer[last_start], wc_buffer[i], 0838 &wc_buffer[last_start])) 0839 { 0840 for (j = i + 1; j < n_wc; j++) 0841 wc_buffer[j - 1] = wc_buffer[j]; 0842 n_wc--; 0843 i--; 0844 0845 if (i == last_start) 0846 last_cc = 0; 0847 else 0848 last_cc = COMBINING_CLASS (wc_buffer[i - 1]); 0849 0850 continue; 
0851 } 0852 0853 if (cc == 0) 0854 last_start = i; 0855 0856 last_cc = cc; 0857 } 0858 } 0859 0860 wc_buffer[n_wc] = 0; 0861 0862 return wc_buffer; 0863 } 0864 0865 /** 0866 * g_utf8_normalize: 0867 * @str: a UTF-8 encoded string. 0868 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. 0869 * @mode: the type of normalization to perform. 0870 * 0871 * Converts a string into canonical form, standardizing 0872 * such issues as whether a character with an accent 0873 * is represented as a base character and combining 0874 * accent or as a single precomposed character. You 0875 * should generally call g_utf8_normalize() before 0876 * comparing two Unicode strings. 0877 * 0878 * The normalization mode %G_NORMALIZE_DEFAULT only 0879 * standardizes differences that do not affect the 0880 * text content, such as the above-mentioned accent 0881 * representation. %G_NORMALIZE_ALL also standardizes 0882 * the "compatibility" characters in Unicode, such 0883 * as SUPERSCRIPT THREE to the standard forms 0884 * (in this case DIGIT THREE). Formatting information 0885 * may be lost but for most text operations such 0886 * characters should be considered the same. 0887 * For example, g_utf8_collate() normalizes 0888 * with %G_NORMALIZE_ALL as its first step. 0889 * 0890 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE 0891 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, 0892 * but returned a result with composed forms rather 0893 * than a maximally decomposed form. This is often 0894 * useful if you intend to convert the string to 0895 * a legacy encoding or pass it to a system with 0896 * less capable Unicode handling. 0897 * 0898 * Return value: a newly allocated string, that is the 0899 * normalized form of @str. 
0900 **/ 0901 static gchar * 0902 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode) 0903 { 0904 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); 0905 gchar *result; 0906 0907 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); 0908 g_free (result_wc); 0909 0910 return result; 0911 } 0912 0913 /* Public Libidn API starts here. */ 0914 0915 /** 0916 * stringprep_utf8_to_unichar: 0917 * @p: a pointer to Unicode character encoded as UTF-8 0918 * 0919 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. 0920 * If @p does not point to a valid UTF-8 encoded character, results are 0921 * undefined. If you are not sure that the bytes are complete 0922 * valid Unicode characters, you should use g_utf8_get_char_validated() 0923 * instead. 0924 * 0925 * Return value: the resulting character 0926 **/ 0927 my_uint32_t 0928 stringprep_utf8_to_unichar (const char *p) 0929 { 0930 return g_utf8_get_char (p); 0931 } 0932 0933 /** 0934 * stringprep_unichar_to_utf8: 0935 * @c: a ISO10646 character code 0936 * @outbuf: output buffer, must have at least 6 bytes of space. 0937 * If %NULL, the length will be computed and returned 0938 * and nothing will be written to @outbuf. 0939 * 0940 * Converts a single character to UTF-8. 0941 * 0942 * Return value: number of bytes written 0943 **/ 0944 int 0945 stringprep_unichar_to_utf8 (my_uint32_t c, char *outbuf) 0946 { 0947 return g_unichar_to_utf8 (c, outbuf); 0948 } 0949 0950 /** 0951 * stringprep_utf8_to_ucs4: 0952 * @str: a UTF-8 encoded string 0953 * @len: the maximum length of @str to use. If @len < 0, then 0954 * the string is nul-terminated. 0955 * @items_written: location to store the number of characters in the 0956 * result, or %NULL. 0957 * 0958 * Convert a string from UTF-8 to a 32-bit fixed width 0959 * representation as UCS-4, assuming valid UTF-8 input. 0960 * This function does no error checking on the input. 
0961 * 0962 * Return value: a pointer to a newly allocated UCS-4 string. 0963 * This value must be freed with free(). 0964 **/ 0965 my_uint32_t * 0966 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written) 0967 { 0968 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written); 0969 } 0970 0971 /** 0972 * stringprep_ucs4_to_utf8: 0973 * @str: a UCS-4 encoded string 0974 * @len: the maximum length of @str to use. If @len < 0, then 0975 * the string is terminated with a 0 character. 0976 * @items_read: location to store number of characters read read, or %NULL. 0977 * @items_written: location to store number of bytes written or %NULL. 0978 * The value here stored does not include the trailing 0 0979 * byte. 0980 * 0981 * Convert a string from a 32-bit fixed width representation as UCS-4. 0982 * to UTF-8. The result will be terminated with a 0 byte. 0983 * 0984 * Return value: a pointer to a newly allocated UTF-8 string. 0985 * This value must be freed with free(). If an 0986 * error occurs, %NULL will be returned and 0987 * @error set. 0988 **/ 0989 char * 0990 stringprep_ucs4_to_utf8 (const my_uint32_t * str, ssize_t len, 0991 size_t * items_read, size_t * items_written) 0992 { 0993 return g_ucs4_to_utf8 (str, len, (glong *) items_read, 0994 (glong *) items_written, NULL); 0995 } 0996 0997 /** 0998 * stringprep_utf8_nfkc_normalize: 0999 * @str: a UTF-8 encoded string. 1000 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. 1001 * 1002 * Converts a string into canonical form, standardizing 1003 * such issues as whether a character with an accent 1004 * is represented as a base character and combining 1005 * accent or as a single precomposed character. 1006 * 1007 * The normalization mode is NFKC (ALL COMPOSE). It standardizes 1008 * differences that do not affect the text content, such as the 1009 * above-mentioned accent representation. 
It standardizes the 1010 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to 1011 * the standard forms (in this case DIGIT THREE). Formatting 1012 * information may be lost but for most text operations such 1013 * characters should be considered the same. It returns a result with 1014 * composed forms rather than a maximally decomposed form. 1015 * 1016 * Return value: a newly allocated string, that is the 1017 * NFKC normalized form of @str. 1018 **/ 1019 char * 1020 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len) 1021 { 1022 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC); 1023 } 1024 1025 /** 1026 * stringprep_ucs4_nfkc_normalize: 1027 * @str: a Unicode string. 1028 * @len: length of @str array, or -1 if @str is nul-terminated. 1029 * 1030 * Converts UCS4 string into UTF-8 and runs 1031 * stringprep_utf8_nfkc_normalize(). 1032 * 1033 * Return value: a newly allocated Unicode string, that is the NFKC 1034 * normalized form of @str. 1035 **/ 1036 my_uint32_t * 1037 stringprep_ucs4_nfkc_normalize (my_uint32_t * str, ssize_t len) 1038 { 1039 char *p; 1040 my_uint32_t *result_wc; 1041 1042 p = stringprep_ucs4_to_utf8 (str, len, 0, 0); 1043 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC); 1044 free (p); 1045 1046 return result_wc; 1047 }