src/libidn/nfkc.c

0001 /* nfkc.c   Unicode normalization utilities.
0002  * Copyright (C) 2002, 2003  Simon Josefsson
0003  *
0004  * This file is part of GNU Libidn.
0005  *
0006  * GNU Libidn is free software; you can redistribute it and/or
0007  * modify it under the terms of the GNU Lesser General Public
0008  * License as published by the Free Software Foundation; either
0009  * version 2.1 of the License, or (at your option) any later version.
0010  *
0011  * GNU Libidn is distributed in the hope that it will be useful,
0012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
0013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0014  * Lesser General Public License for more details.
0015  *
0016  * You should have received a copy of the GNU Lesser General Public
0017  * License along with GNU Libidn; if not, write to the Free Software
0018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
0019  *
0020  */
0021
0022 #include "internal.h"
0023
0024 /* This file contains functions from GLIB, including gutf8.c and
0025  * gunidecomp.c, all licensed under LGPL and copyright held by:
0026  *
0027  *  Copyright (C) 1999, 2000 Tom Tromey
0028  *  Copyright 2000 Red Hat, Inc.
0029  */
0030
/* Hacks to make syncing with GLIB code easier.
 *
 * The GLIB type and helper names used by the imported code below are
 * mapped onto plain C types and libc functions, so the imported
 * functions can stay textually close to their GLIB versions. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 my_int16_t
#define guint16 my_uint16_t
#define gunichar my_uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
#define GError void
/* Error reporting is compiled out: the call collapses to the constant 0. */
#define g_set_error(a,b,c,d) 0
/* Allocate uninitialized room for n_structs objects of struct_type.
 * Yields NULL when malloc fails. */
#define g_new(struct_type, n_structs)                   \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
/* G_STMT_START / G_STMT_END turn a braced block into a single statement. */
#  if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#    define G_STMT_START    (void)(
#    define G_STMT_END      )
#  else
#    if (defined (sun) || defined (__sun__))
#      define G_STMT_START  if (1)
#      define G_STMT_END    else (void)0
#    else
#      define G_STMT_START  do
#      define G_STMT_END    while (0)
#    endif
#  endif
/* Precondition check: when expr is false, return val from the enclosing
 * function.  Nothing is logged. */
#define g_return_val_if_fail(expr,val)      G_STMT_START{ if (!(expr)) return (val); }G_STMT_END
#define G_N_ELEMENTS(arr)       (sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0
0066
0067 /* Code from GLIB gunicode.h starts here. */
0068
/* Normalization modes.  Each Unicode form name (NFD, NFC, NFKD, NFKC)
 * is an alias for the GLIB-style name listed immediately before it. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
0081
0082 /* Code from GLIB gutf8.c starts here. */
0083
/* Classify the lead byte Char of a UTF-8 sequence: store the total
 * sequence length (1-6) in Len and the mask selecting the payload bits
 * of the lead byte in Mask.  For a byte that cannot start a sequence,
 * Len becomes -1 and Mask is left unchanged.  Mask and Len must be
 * lvalues; Char is evaluated several times.
 *
 * The body is wrapped in do/while (0) so the macro expands to exactly
 * one statement and can be used inside an unbraced if/else. */
#define UTF8_COMPUTE(Char, Mask, Len)       \
  do                        \
    {                       \
      if ((Char) < 128)             \
        {                   \
          (Len) = 1;                \
          (Mask) = 0x7f;            \
        }                   \
      else if (((Char) & 0xe0) == 0xc0)     \
        {                   \
          (Len) = 2;                \
          (Mask) = 0x1f;            \
        }                   \
      else if (((Char) & 0xf0) == 0xe0)     \
        {                   \
          (Len) = 3;                \
          (Mask) = 0x0f;            \
        }                   \
      else if (((Char) & 0xf8) == 0xf0)     \
        {                   \
          (Len) = 4;                \
          (Mask) = 0x07;            \
        }                   \
      else if (((Char) & 0xfc) == 0xf8)     \
        {                   \
          (Len) = 5;                \
          (Mask) = 0x03;            \
        }                   \
      else if (((Char) & 0xfe) == 0xfc)     \
        {                   \
          (Len) = 6;                \
          (Mask) = 0x01;            \
        }                   \
      else                  \
        (Len) = -1;             \
    }                       \
  while (0)
0117
/* Number of bytes (1-6) needed to encode the code value Char in UTF-8.
 * Char is evaluated more than once. */
#define UTF8_LENGTH(Char)           \
  ((Char) < 0x80 ? 1                \
   : (Char) < 0x800 ? 2             \
   : (Char) < 0x10000 ? 3           \
   : (Char) < 0x200000 ? 4          \
   : (Char) < 0x4000000 ? 5         \
   : 6)
0124
0125
/* Decode the Len-byte UTF-8 sequence starting at Chars into Result.
 * Mask selects the payload bits of the lead byte (see UTF8_COMPUTE) and
 * Count is a scratch integer lvalue used as the byte index.  If a byte
 * after the first is not of the form 10xxxxxx, Result is set to -1 and
 * decoding stops.
 *
 * The body is wrapped in do/while (0) so the macro expands to exactly
 * one statement; the break below only leaves the inner for loop. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)   \
  do                            \
    {                           \
      (Result) = (Chars)[0] & (Mask);           \
      for ((Count) = 1; (Count) < (Len); ++(Count))     \
        {                       \
          if (((Chars)[(Count)] & 0xc0) != 0x80)    \
            {                   \
              (Result) = -1;            \
              break;                \
            }                   \
          (Result) <<= 6;               \
          (Result) |= ((Chars)[(Count)] & 0x3f);    \
        }                       \
    }                           \
  while (0)
0138
/* Non-zero when Char is below 0x110000, is not in 0xD800-0xDFFF (for a
 * 32-bit value), is not in 0xFDD0-0xFDEF, and its low 16 bits are
 * neither 0xFFFE nor 0xFFFF.  Char is evaluated more than once. */
#define UNICODE_VALID(Char)                     \
  ((Char) < 0x110000                            \
   && (((Char) & 0xFFFFF800) != 0xD800)         \
   && !((Char) >= 0xFDD0 && (Char) <= 0xFDEF)   \
   && (((Char) & 0xFFFE) != 0xFFFE))
0144
0145
0146 static const gchar utf8_skip_data[256] = {
0147   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0148   1, 1, 1, 1, 1, 1, 1,
0149   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0150   1, 1, 1, 1, 1, 1, 1,
0151   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0152   1, 1, 1, 1, 1, 1, 1,
0153   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0154   1, 1, 1, 1, 1, 1, 1,
0155   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0156   1, 1, 1, 1, 1, 1, 1,
0157   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0158   1, 1, 1, 1, 1, 1, 1,
0159   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
0160   2, 2, 2, 2, 2, 2, 2,
0161   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
0162   5, 5, 5, 6, 6, 1, 1
0163 };
0164
0165 const gchar *const g_utf8_skip = utf8_skip_data;
0166
0167 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
0168
0169 /**
0170  * g_utf8_strlen:
0171  * @p: pointer to the start of a UTF-8 encoded string.
0172  * @max: the maximum number of bytes to examine. If @max
0173  *       is less than 0, then the string is assumed to be
0174  *       nul-terminated. If @max is 0, @p will not be examined and
0175  *       may be %NULL.
0176  *
0177  * Returns the length of the string in characters.
0178  *
0179  * Return value: the length of the string in characters
0180  **/
0181 static glong
0182 g_utf8_strlen (const gchar * p, gssize max)
0183 {
0184   glong len = 0;
0185   const gchar *start = p;
0186   g_return_val_if_fail (p != NULL || max == 0, 0);
0187
0188   if (max < 0)
0189     {
0190       while (*p)
0191     {
0192       p = g_utf8_next_char (p);
0193       ++len;
0194     }
0195     }
0196   else
0197     {
0198       if (max == 0 || !*p)
0199     return 0;
0200
0201       p = g_utf8_next_char (p);
0202
0203       while (p - start < max && *p)
0204     {
0205       ++len;
0206       p = g_utf8_next_char (p);
0207     }
0208
0209       /* only do the last len increment if we got a complete
0210        * char (don't count partial chars)
0211        */
0212       if (p - start == max)
0213     ++len;
0214     }
0215
0216   return len;
0217 }
0218
0219 /**
0220  * g_utf8_get_char:
0221  * @p: a pointer to Unicode character encoded as UTF-8
0222  *
0223  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
0224  * If @p does not point to a valid UTF-8 encoded character, results are
0225  * undefined. If you are not sure that the bytes are complete
0226  * valid Unicode characters, you should use g_utf8_get_char_validated()
0227  * instead.
0228  *
0229  * Return value: the resulting character
0230  **/
0231 static gunichar
0232 g_utf8_get_char (const gchar * p)
0233 {
0234   int i, mask = 0, len;
0235   gunichar result;
0236   unsigned char c = (unsigned char) *p;
0237
0238   UTF8_COMPUTE (c, mask, len);
0239   if (len == -1)
0240     return (gunichar) - 1;
0241   UTF8_GET (result, p, i, mask, len);
0242
0243   return result;
0244 }
0245
0246 /**
0247  * g_unichar_to_utf8:
0248  * @c: a ISO10646 character code
0249  * @outbuf: output buffer, must have at least 6 bytes of space.
0250  *       If %NULL, the length will be computed and returned
0251  *       and nothing will be written to @outbuf.
0252  *
0253  * Converts a single character to UTF-8.
0254  *
0255  * Return value: number of bytes written
0256  **/
0257 static int
0258 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
0259 {
0260   guint len = 0;
0261   int first;
0262   int i;
0263
0264   if (c < 0x80)
0265     {
0266       first = 0;
0267       len = 1;
0268     }
0269   else if (c < 0x800)
0270     {
0271       first = 0xc0;
0272       len = 2;
0273     }
0274   else if (c < 0x10000)
0275     {
0276       first = 0xe0;
0277       len = 3;
0278     }
0279   else if (c < 0x200000)
0280     {
0281       first = 0xf0;
0282       len = 4;
0283     }
0284   else if (c < 0x4000000)
0285     {
0286       first = 0xf8;
0287       len = 5;
0288     }
0289   else
0290     {
0291       first = 0xfc;
0292       len = 6;
0293     }
0294
0295   if (outbuf)
0296     {
0297       for (i = len - 1; i > 0; --i)
0298     {
0299       outbuf[i] = (c & 0x3f) | 0x80;
0300       c >>= 6;
0301     }
0302       outbuf[0] = c | first;
0303     }
0304
0305   return len;
0306 }
0307
0308 /**
0309  * g_utf8_to_ucs4_fast:
0310  * @str: a UTF-8 encoded string
0311  * @len: the maximum length of @str to use. If @len < 0, then
0312  *       the string is nul-terminated.
0313  * @items_written: location to store the number of characters in the
0314  *                 result, or %NULL.
0315  *
0316  * Convert a string from UTF-8 to a 32-bit fixed width
0317  * representation as UCS-4, assuming valid UTF-8 input.
0318  * This function is roughly twice as fast as g_utf8_to_ucs4()
0319  * but does no error checking on the input.
0320  *
0321  * Return value: a pointer to a newly allocated UCS-4 string.
0322  *               This value must be freed with g_free().
0323  **/
0324 static gunichar *
0325 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
0326 {
0327   gint j, charlen;
0328   gunichar *result;
0329   gint n_chars, i;
0330   const gchar *p;
0331
0332   g_return_val_if_fail (str != NULL, NULL);
0333
0334   p = str;
0335   n_chars = 0;
0336   if (len < 0)
0337     {
0338       while (*p)
0339     {
0340       p = g_utf8_next_char (p);
0341       ++n_chars;
0342     }
0343     }
0344   else
0345     {
0346       while (p < str + len && *p)
0347     {
0348       p = g_utf8_next_char (p);
0349       ++n_chars;
0350     }
0351     }
0352
0353   result = g_new (gunichar, n_chars + 1);
0354
0355   p = str;
0356   for (i = 0; i < n_chars; i++)
0357     {
0358       gunichar wc = ((unsigned char *) p)[0];
0359
0360       if (wc < 0x80)
0361     {
0362       result[i] = wc;
0363       p++;
0364     }
0365       else
0366     {
0367       if (wc < 0xe0)
0368         {
0369           charlen = 2;
0370           wc &= 0x1f;
0371         }
0372       else if (wc < 0xf0)
0373         {
0374           charlen = 3;
0375           wc &= 0x0f;
0376         }
0377       else if (wc < 0xf8)
0378         {
0379           charlen = 4;
0380           wc &= 0x07;
0381         }
0382       else if (wc < 0xfc)
0383         {
0384           charlen = 5;
0385           wc &= 0x03;
0386         }
0387       else
0388         {
0389           charlen = 6;
0390           wc &= 0x01;
0391         }
0392
0393       for (j = 1; j < charlen; j++)
0394         {
0395           wc <<= 6;
0396           wc |= ((unsigned char *) p)[j] & 0x3f;
0397         }
0398
0399       result[i] = wc;
0400       p += charlen;
0401     }
0402     }
0403   result[i] = 0;
0404
0405   if (items_written)
0406     *items_written = i;
0407
0408   return result;
0409 }
0410
0411 /**
0412  * g_ucs4_to_utf8:
0413  * @str: a UCS-4 encoded string
0414  * @len: the maximum length of @str to use. If @len < 0, then
0415  *       the string is terminated with a 0 character.
0416  * @items_read: location to store number of characters read read, or %NULL.
0417  * @items_written: location to store number of bytes written or %NULL.
0418  *                 The value here stored does not include the trailing 0
0419  *                 byte.
0420  * @error: location to store the error occuring, or %NULL to ignore
0421  *         errors. Any of the errors in #GConvertError other than
0422  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
0423  *
0424  * Convert a string from a 32-bit fixed width representation as UCS-4.
0425  * to UTF-8. The result will be terminated with a 0 byte.
0426  *
0427  * Return value: a pointer to a newly allocated UTF-8 string.
0428  *               This value must be freed with g_free(). If an
0429  *               error occurs, %NULL will be returned and
0430  *               @error set.
0431  **/
0432 static gchar *
0433 g_ucs4_to_utf8 (const gunichar * str,
0434         glong len,
0435         glong * items_read, glong * items_written, GError ** error)
0436 {
0437   gint result_length;
0438   gchar *result = NULL;
0439   gchar *p;
0440   gint i;
0441
0442   result_length = 0;
0443   for (i = 0; len < 0 || i < len; i++)
0444     {
0445       if (!str[i])
0446     break;
0447
0448       if (str[i] >= 0x80000000)
0449     {
0450       if (items_read)
0451         *items_read = i;
0452
0453       /*g_set_error (error, G_CONVERT_ERROR,
0454                G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
0455                _("Character out of range for UTF-8"));*/
0456       goto err_out;
0457     }
0458
0459       result_length += UTF8_LENGTH (str[i]);
0460     }
0461
0462   result = g_malloc (result_length + 1);
0463   p = result;
0464
0465   i = 0;
0466   while (p < result + result_length)
0467     p += g_unichar_to_utf8 (str[i++], p);
0468
0469   *p = '\0';
0470
0471   if (items_written)
0472     *items_written = p - result;
0473
0474 err_out:
0475   if (items_read)
0476     *items_read = i;
0477
0478   return result;
0479 }
0480
0481 /* Code from GLIB gunidecomp.c starts here. */
0482
0483 #include "gunidecomp.h"
0484 #include "gunicomp.h"
0485
/* One page-table lookup.  A page entry at or above
 * G_UNICODE_MAX_TABLE_INDEX holds the class shared by the whole page,
 * offset by G_UNICODE_MAX_TABLE_INDEX; any other entry selects the row
 * of cclass_data that is indexed by Char. */
#define CC_PAGE_LOOKUP(Table, Page, Char) \
  ((Table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (Table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[Table[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_PAGE_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_PAGE_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char: part 1 covers values up to
 * G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
 * G_UNICODE_LAST_CHAR, and everything else yields 0. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
0502
/* constants for hangul syllable [de]composition */
enum
{
  SBase = 0xAC00,               /* first syllable */
  LBase = 0x1100,               /* first leading consonant */
  VBase = 0x1161,               /* first vowel */
  TBase = 0x11A7,               /* one below the first trailing consonant */
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = (VCount * TCount),
  SCount = (LCount * NCount)
};
0513
0514 /**
0515  * g_unicode_canonical_ordering:
0516  * @string: a UCS-4 encoded string.
0517  * @len: the maximum length of @string to use.
0518  *
0519  * Computes the canonical ordering of a string in-place.
0520  * This rearranges decomposed characters in the string
0521  * according to their combining classes.  See the Unicode
0522  * manual for more information.
0523  **/
0524 static void
0525 g_unicode_canonical_ordering (gunichar * string, gsize len)
0526 {
0527   gsize i;
0528   int swap = 1;
0529
0530   while (swap)
0531     {
0532       int last;
0533       swap = 0;
0534       last = COMBINING_CLASS (string[0]);
0535       for (i = 0; i < len - 1; ++i)
0536     {
0537       int next = COMBINING_CLASS (string[i + 1]);
0538       if (next != 0 && last > next)
0539         {
0540           gsize j;
0541           /* Percolate item leftward through string.  */
0542           for (j = i + 1; j > 0; --j)
0543         {
0544           gunichar t;
0545           if (COMBINING_CLASS (string[j - 1]) <= next)
0546             break;
0547           t = string[j];
0548           string[j] = string[j - 1];
0549           string[j - 1] = t;
0550           swap = 1;
0551         }
0552           /* We're re-entering the loop looking at the old
0553              character again.  */
0554           next = last;
0555         }
0556       last = next;
0557     }
0558     }
0559 }
0560
0561 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
0562  * r should be null or have sufficient space. Calling with r == NULL will
0563  * only calculate the result_len; however, a buffer with space for three
0564  * characters will always be big enough. */
0565 static void
0566 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
0567 {
0568   gint SIndex = s - SBase;
0569
0570   /* not a hangul syllable */
0571   if (SIndex < 0 || SIndex >= SCount)
0572     {
0573       if (r)
0574     r[0] = s;
0575       *result_len = 1;
0576     }
0577   else
0578     {
0579       gunichar L = LBase + SIndex / NCount;
0580       gunichar V = VBase + (SIndex % NCount) / TCount;
0581       gunichar T = TBase + SIndex % TCount;
0582
0583       if (r)
0584     {
0585       r[0] = L;
0586       r[1] = V;
0587     }
0588
0589       if (T != TBase)
0590     {
0591       if (r)
0592         r[2] = T;
0593       *result_len = 3;
0594     }
0595       else
0596     *result_len = 2;
0597     }
0598 }
0599
0600 /* returns a pointer to a null-terminated UTF-8 string */
0601 static const gchar *
0602 find_decomposition (gunichar ch, gboolean compat)
0603 {
0604   int start = 0;
0605   int end = G_N_ELEMENTS (decomp_table);
0606
0607   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
0608     {
0609       while (TRUE)
0610     {
0611       int half = (start + end) / 2;
0612       if (ch == decomp_table[half].ch)
0613         {
0614           int offset;
0615
0616           if (compat)
0617         {
0618           offset = decomp_table[half].compat_offset;
0619           if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
0620             offset = decomp_table[half].canon_offset;
0621         }
0622           else
0623         {
0624           offset = decomp_table[half].canon_offset;
0625           if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
0626             return NULL;
0627         }
0628
0629           return &(decomp_expansion_string[offset]);
0630         }
0631       else if (half == start)
0632         break;
0633       else if (ch > decomp_table[half].ch)
0634         start = half;
0635       else
0636         end = half;
0637     }
0638     }
0639
0640   return NULL;
0641 }
0642
0643 /* L,V => LV and LV,T => LVT  */
0644 static gboolean
0645 combine_hangul (gunichar a, gunichar b, gunichar * result)
0646 {
0647   gint LIndex = a - LBase;
0648   gint SIndex = a - SBase;
0649
0650   gint VIndex = b - VBase;
0651   gint TIndex = b - TBase;
0652
0653   if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
0654     {
0655       *result = SBase + (LIndex * VCount + VIndex) * TCount;
0656       return TRUE;
0657     }
0658   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
0659        && 0 <= TIndex && TIndex <= TCount)
0660     {
0661       *result = a + TIndex;
0662       return TRUE;
0663     }
0664
0665   return FALSE;
0666 }
0667
/* Page-table lookup for composition: an entry of compose_table below
 * G_UNICODE_MAX_TABLE_INDEX selects a row of compose_data indexed by
 * Char; any other entry is the result for the whole page, offset by
 * G_UNICODE_MAX_TABLE_INDEX. */
#define CI(Page, Char) \
  ((compose_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_data[compose_table[Page]][Char]) \
   : (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Composition index of Char; 0 for characters whose page lies beyond
 * COMPOSE_TABLE_LAST. */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) <= (COMPOSE_TABLE_LAST)) ? CI((Char) >> 8, (Char) & 0xff) : 0)
0675
0676 static gboolean
0677 combine (gunichar a, gunichar b, gunichar * result)
0678 {
0679   gushort index_a, index_b;
0680
0681   if (combine_hangul (a, b, result))
0682     return TRUE;
0683
0684   index_a = COMPOSE_INDEX (a);
0685
0686   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
0687     {
0688       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
0689     {
0690       *result =
0691         compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
0692       return TRUE;
0693     }
0694       else
0695     return FALSE;
0696     }
0697
0698   index_b = COMPOSE_INDEX (b);
0699
0700   if (index_b >= COMPOSE_SECOND_SINGLE_START)
0701     {
0702       if (a ==
0703       compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
0704     {
0705       *result =
0706         compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
0707       return TRUE;
0708     }
0709       else
0710     return FALSE;
0711     }
0712
0713   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
0714       && index_b >= COMPOSE_SECOND_START
0715       && index_b < COMPOSE_SECOND_SINGLE_START)
0716     {
0717       gunichar res =
0718     compose_array[index_a - COMPOSE_FIRST_START][index_b -
0719                              COMPOSE_SECOND_START];
0720
0721       if (res)
0722     {
0723       *result = res;
0724       return TRUE;
0725     }
0726     }
0727
0728   return FALSE;
0729 }
0730
0731 static gunichar *
0732 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
0733 {
0734   gsize n_wc;
0735   gunichar *wc_buffer;
0736   const char *p;
0737   gsize last_start;
0738   gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
0739   gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
0740
0741   n_wc = 0;
0742   p = str;
0743   while ((max_len < 0 || p < str + max_len) && *p)
0744     {
0745       const gchar *decomp;
0746       gunichar wc = g_utf8_get_char (p);
0747
0748       if (wc >= 0xac00 && wc <= 0xd7af)
0749     {
0750       gsize result_len;
0751       decompose_hangul (wc, NULL, &result_len);
0752       n_wc += result_len;
0753     }
0754       else
0755     {
0756       decomp = find_decomposition (wc, do_compat);
0757
0758       if (decomp)
0759         n_wc += g_utf8_strlen (decomp, -1);
0760       else
0761         n_wc++;
0762     }
0763
0764       p = g_utf8_next_char (p);
0765     }
0766
0767   wc_buffer = g_new (gunichar, n_wc + 1);
0768
0769   last_start = 0;
0770   n_wc = 0;
0771   p = str;
0772   while ((max_len < 0 || p < str + max_len) && *p)
0773     {
0774       gunichar wc = g_utf8_get_char (p);
0775       const gchar *decomp;
0776       int cc;
0777       gsize old_n_wc = n_wc;
0778
0779       if (wc >= 0xac00 && wc <= 0xd7af)
0780     {
0781       gsize result_len;
0782       decompose_hangul (wc, wc_buffer + n_wc, &result_len);
0783       n_wc += result_len;
0784     }
0785       else
0786     {
0787       decomp = find_decomposition (wc, do_compat);
0788
0789       if (decomp)
0790         {
0791           const char *pd;
0792           for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
0793         wc_buffer[n_wc++] = g_utf8_get_char (pd);
0794         }
0795       else
0796         wc_buffer[n_wc++] = wc;
0797     }
0798
0799       if (n_wc > 0)
0800     {
0801       cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
0802
0803       if (cc == 0)
0804         {
0805           g_unicode_canonical_ordering (wc_buffer + last_start,
0806                         n_wc - last_start);
0807           last_start = old_n_wc;
0808         }
0809     }
0810
0811       p = g_utf8_next_char (p);
0812     }
0813
0814   if (n_wc > 0)
0815     {
0816       g_unicode_canonical_ordering (wc_buffer + last_start,
0817                     n_wc - last_start);
0818       last_start = n_wc;
0819     }
0820
0821   wc_buffer[n_wc] = 0;
0822
0823   /* All decomposed and reordered */
0824
0825   if (do_compose && n_wc > 0)
0826     {
0827       gsize i, j;
0828       int last_cc = 0;
0829       last_start = 0;
0830
0831       for (i = 0; i < n_wc; i++)
0832     {
0833       int cc = COMBINING_CLASS (wc_buffer[i]);
0834
0835       if (i > 0 &&
0836           (last_cc == 0 || last_cc != cc) &&
0837           combine (wc_buffer[last_start], wc_buffer[i],
0838                &wc_buffer[last_start]))
0839         {
0840           for (j = i + 1; j < n_wc; j++)
0841         wc_buffer[j - 1] = wc_buffer[j];
0842           n_wc--;
0843           i--;
0844
0845           if (i == last_start)
0846         last_cc = 0;
0847           else
0848         last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
0849
0850           continue;
0851         }
0852
0853       if (cc == 0)
0854         last_start = i;
0855
0856       last_cc = cc;
0857     }
0858     }
0859
0860   wc_buffer[n_wc] = 0;
0861
0862   return wc_buffer;
0863 }
0864
0865 /**
0866  * g_utf8_normalize:
0867  * @str: a UTF-8 encoded string.
0868  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
0869  * @mode: the type of normalization to perform.
0870  *
0871  * Converts a string into canonical form, standardizing
0872  * such issues as whether a character with an accent
0873  * is represented as a base character and combining
0874  * accent or as a single precomposed character. You
0875  * should generally call g_utf8_normalize() before
0876  * comparing two Unicode strings.
0877  *
0878  * The normalization mode %G_NORMALIZE_DEFAULT only
0879  * standardizes differences that do not affect the
0880  * text content, such as the above-mentioned accent
0881  * representation. %G_NORMALIZE_ALL also standardizes
0882  * the "compatibility" characters in Unicode, such
0883  * as SUPERSCRIPT THREE to the standard forms
0884  * (in this case DIGIT THREE). Formatting information
0885  * may be lost but for most text operations such
0886  * characters should be considered the same.
0887  * For example, g_utf8_collate() normalizes
0888  * with %G_NORMALIZE_ALL as its first step.
0889  *
0890  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
0891  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
0892  * but returned a result with composed forms rather
0893  * than a maximally decomposed form. This is often
0894  * useful if you intend to convert the string to
0895  * a legacy encoding or pass it to a system with
0896  * less capable Unicode handling.
0897  *
0898  * Return value: a newly allocated string, that is the
0899  *   normalized form of @str.
0900  **/
0901 static gchar *
0902 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
0903 {
0904   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
0905   gchar *result;
0906
0907   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
0908   g_free (result_wc);
0909
0910   return result;
0911 }
0912
0913 /* Public Libidn API starts here. */
0914
0915 /**
0916  * stringprep_utf8_to_unichar:
0917  * @p: a pointer to Unicode character encoded as UTF-8
0918  *
0919  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
0920  * If @p does not point to a valid UTF-8 encoded character, results are
0921  * undefined. If you are not sure that the bytes are complete
0922  * valid Unicode characters, you should use g_utf8_get_char_validated()
0923  * instead.
0924  *
0925  * Return value: the resulting character
0926  **/
0927 my_uint32_t
0928 stringprep_utf8_to_unichar (const char *p)
0929 {
0930   return g_utf8_get_char (p);
0931 }
0932
0933 /**
0934  * stringprep_unichar_to_utf8:
0935  * @c: a ISO10646 character code
0936  * @outbuf: output buffer, must have at least 6 bytes of space.
0937  *       If %NULL, the length will be computed and returned
0938  *       and nothing will be written to @outbuf.
0939  *
0940  * Converts a single character to UTF-8.
0941  *
0942  * Return value: number of bytes written
0943  **/
0944 int
0945 stringprep_unichar_to_utf8 (my_uint32_t c, char *outbuf)
0946 {
0947   return g_unichar_to_utf8 (c, outbuf);
0948 }
0949
0950 /**
0951  * stringprep_utf8_to_ucs4:
0952  * @str: a UTF-8 encoded string
0953  * @len: the maximum length of @str to use. If @len < 0, then
0954  *       the string is nul-terminated.
0955  * @items_written: location to store the number of characters in the
0956  *                 result, or %NULL.
0957  *
0958  * Convert a string from UTF-8 to a 32-bit fixed width
0959  * representation as UCS-4, assuming valid UTF-8 input.
0960  * This function does no error checking on the input.
0961  *
0962  * Return value: a pointer to a newly allocated UCS-4 string.
0963  *               This value must be freed with free().
0964  **/
0965 my_uint32_t *
0966 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
0967 {
0968   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
0969 }
0970
0971 /**
0972  * stringprep_ucs4_to_utf8:
0973  * @str: a UCS-4 encoded string
0974  * @len: the maximum length of @str to use. If @len < 0, then
0975  *       the string is terminated with a 0 character.
0976  * @items_read: location to store number of characters read read, or %NULL.
0977  * @items_written: location to store number of bytes written or %NULL.
0978  *                 The value here stored does not include the trailing 0
0979  *                 byte.
0980  *
0981  * Convert a string from a 32-bit fixed width representation as UCS-4.
0982  * to UTF-8. The result will be terminated with a 0 byte.
0983  *
0984  * Return value: a pointer to a newly allocated UTF-8 string.
0985  *               This value must be freed with free(). If an
0986  *               error occurs, %NULL will be returned and
0987  *               @error set.
0988  **/
0989 char *
0990 stringprep_ucs4_to_utf8 (const my_uint32_t * str, ssize_t len,
0991              size_t * items_read, size_t * items_written)
0992 {
0993   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
0994              (glong *) items_written, NULL);
0995 }
0996
0997 /**
0998  * stringprep_utf8_nfkc_normalize:
0999  * @str: a UTF-8 encoded string.
1000  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1001  *
1002  * Converts a string into canonical form, standardizing
1003  * such issues as whether a character with an accent
1004  * is represented as a base character and combining
1005  * accent or as a single precomposed character.
1006  *
1007  * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1008  * differences that do not affect the text content, such as the
1009  * above-mentioned accent representation. It standardizes the
1010  * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1011  * the standard forms (in this case DIGIT THREE). Formatting
1012  * information may be lost but for most text operations such
1013  * characters should be considered the same. It returns a result with
1014  * composed forms rather than a maximally decomposed form.
1015  *
1016  * Return value: a newly allocated string, that is the
1017  *   NFKC normalized form of @str.
1018  **/
1019 char *
1020 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1021 {
1022   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1023 }
1024
1025 /**
1026  * stringprep_ucs4_nfkc_normalize:
1027  * @str: a Unicode string.
1028  * @len: length of @str array, or -1 if @str is nul-terminated.
1029  *
1030  * Converts UCS4 string into UTF-8 and runs
1031  * stringprep_utf8_nfkc_normalize().
1032  *
1033  * Return value: a newly allocated Unicode string, that is the NFKC
1034  *   normalized form of @str.
1035  **/
1036 my_uint32_t *
1037 stringprep_ucs4_nfkc_normalize (my_uint32_t * str, ssize_t len)
1038 {
1039   char *p;
1040   my_uint32_t *result_wc;
1041
1042   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1043   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1044   free (p);
1045
1046   return result_wc;
1047 }