kfontinst/viewpart/generate-unicode-tables.pl

0001 #!/usr/bin/perl -w
0002 #
0003 # Note: This file is taken, and modified, from gucharmap/gen-guch-unicode-tables.pl - svn revision 1040
0004 #
0005 # $Id$
0006 #
0007 # generates in the current directory:
0008 #  - UnicodeBlocks.h
0009 #  - unicode-names.h
0010 #  - unicode-nameslist.h
0011 #  - unicode-unihan.h
0012 #  - UnicodeCategories.h
0013 #  - UnicodeScripts.h
0014 #
0015 # usage: ./gen-guch-unicode-tables.pl UNICODE-VERSION DIRECTORY
0016 # where DIRECTORY contains UnicodeData.txt Unihan.zip NamesList.txt Blocks.txt Scripts.txt
0017 #
0018
0019 use strict;
0020 use vars ('$UNZIP', '$ICONV');
0021
0022 # if these things aren't in your path you can put full paths to them here
0023 $UNZIP = 'unzip';
0024 $ICONV = 'iconv';
0025
0026 sub process_unicode_data_txt ($);
0027 sub process_unihan_zip ($);
0028 sub process_nameslist_txt ($);
0029 sub process_blocks_txt ($);
0030 sub process_scripts_txt ($);
0031
0032 $| = 1;  # flush stdout buffer
0033
0034 if (@ARGV != 2)
0035 {
0036     $0 =~ s@.*/@@;
0037     die <<EOF
0038
0039 Usage: $0 UNICODE-VERSION DIRECTORY
0040
0041 DIRECTORY should contain the following Unicode data files:
0042 UnicodeData.txt Unihan.zip NamesList.txt Blocks.txt Scripts.txt
0043
0044 which can be found at https://www.unicode.org/Public/UNIDATA/
0045
0046 EOF
0047 }
0048
0049 my ($unicodedata_txt, $unihan_zip, $nameslist_txt, $blocks_txt, $scripts_txt);
0050
0051 my $v = $ARGV[0];
0052 my $d = $ARGV[1];
0053 opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n";
0054 for my $f (readdir ($dir))
0055 {
0056     $unicodedata_txt = "$d/$f" if ($f =~ /UnicodeData.*\.txt/);
0057 #     $unihan_zip = "$d/$f" if ($f =~ /Unihan.*\.zip/);
0058 #     $nameslist_txt = "$d/$f" if ($f =~ /NamesList.*\.txt/);
0059     $blocks_txt = "$d/$f" if ($f =~ /Blocks.*\.txt/);
0060     $scripts_txt = "$d/$f" if ($f =~ /Scripts.*\.txt/);
0061 }
0062
0063 defined $unicodedata_txt or die "Did not find $d/UnicodeData.txt";
0064 # defined $unihan_zip or die "Did not find $d/Unihan.zip";
0065 # defined $nameslist_txt or die "Did not find $d/NamesList.txt";
0066 defined $blocks_txt or die "Did not find $d/Blocks.txt";
0067 defined $scripts_txt or die "Did not find $d/Scripts.txt";
0068
0069 process_unicode_data_txt ($unicodedata_txt);
0070 # process_nameslist_txt ($nameslist_txt);
0071 process_blocks_txt ($blocks_txt);
0072 process_scripts_txt ($scripts_txt);
0073 # process_unihan_zip ($unihan_zip);
0074
0075 exit;
0076
0077
0078 #------------------------#
0079
0080 sub process_unicode_data_txt ($)
0081 {
0082     my ($unicodedata_txt) = @_;
0083
0084     # part 1: names
0085
0086     open (my $unicodedata, $unicodedata_txt) or die;
0087 #     open (my $out, "> unicode-names.h") or die;
0088
0089     print "processing $unicodedata_txt...";
0090 #
0091 #     print $out "/* unicode-names.h */\n";
0092 #     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
0093 #     print $out "/* Generated by $0 */\n";
0094 #     print $out "/* Generated from UCD version $v */\n\n";
0095 #
0096 #     print $out "#pragma once\n\n";
0097 #
0098 #     print $out "#include <glib/gunicode.h>\n\n";
0099 #     print $out "#include \"gucharmap-intl.h\"\n\n";
0100 #
0101 #     my @unicode_pairs;
0102 #     my %names;
0103 #
0104 #     while (my $line = <$unicodedata>)
0105 #     {
0106 #         chomp $line;
0107 #         $line =~ /^([^;]+);([^;]+)/ or die;
0108 #
0109 #         my $hex = $1;
0110 #         my $name = $2;
0111 #
0112 #         $names{$name} = 1;
0113 #         push @unicode_pairs, [$hex, $name];
0114 #     }
0115 #
0116 #     print $out "static const char unicode_names_strings[] = \\\n";
0117 #
0118 #     my $offset = 0;
0119 #
0120 #     foreach my $name (sort keys %names) {
0121 #   print $out "  \"$name\\0\"\n";
0122 #   $names{$name} = $offset;
0123 #   $offset += length($name) + 1;
0124 #     }
0125 #
0126 #     undef $offset;
0127 #
0128 #     print $out ";\n";
0129 #
0130 #     print $out "typedef struct _UnicodeName UnicodeName;\n\n";
0131 #
0132 #     print $out "static const struct _UnicodeName\n";
0133 #     print $out "{\n";
0134 #     print $out "  gunichar index;\n";
0135 #     print $out "  guint32 name_offset;\n";
0136 #     print $out "} \n";
0137 #     print $out "unicode_names[] =\n";
0138 #     print $out "{\n";
0139 #
0140 #     my $first_line = 1;
0141 #
0142 #     foreach my $pair (@unicode_pairs) {
0143 #   if (!$first_line) {
0144 #       print $out ",\n";
0145 #   } else {
0146 #       $first_line = 0;
0147 #   }
0148 #
0149 #   my ($hex, $name) = @{$pair};
0150 #   my $offset = $names{$name};
0151 #   print $out "  {0x$hex, $offset}";
0152 #     }
0153 #
0154 #     print $out "\n};\n\n";
0155 #
0156 #     print $out <<EOT;
0157 # static inline const char * unicode_name_get_name(const UnicodeName *entry)
0158 # {
0159 #   guint32 offset = entry->name_offset;
0160 #   return unicode_names_strings + offset;
0161 # }
0162 #
0163 # EOT
0164 #
0165 #
0166 #     undef %names;
0167 #     undef @unicode_pairs;
0168 #
0169 #     close ($unicodedata);
0170 #     close ($out);
0171
0172     # part 2: categories
0173
0174     open ($unicodedata, $unicodedata_txt) or die;
0175     open (my $out, "> UnicodeCategories.h") or die;
0176
0177     # Map general category code onto symbolic name.
0178     my %mappings =
0179     (
0180         # Normative.
0181         'Lu' => "UNICODE_UPPERCASE_LETTER",
0182         'Ll' => "UNICODE_LOWERCASE_LETTER",
0183         'Lt' => "UNICODE_TITLECASE_LETTER",
0184         'Mn' => "UNICODE_NON_SPACING_MARK",
0185         'Mc' => "UNICODE_COMBINING_MARK",
0186         'Me' => "UNICODE_ENCLOSING_MARK",
0187         'Nd' => "UNICODE_DECIMAL_NUMBER",
0188         'Nl' => "UNICODE_LETTER_NUMBER",
0189         'No' => "UNICODE_OTHER_NUMBER",
0190         'Zs' => "UNICODE_SPACE_SEPARATOR",
0191         'Zl' => "UNICODE_LINE_SEPARATOR",
0192         'Zp' => "UNICODE_PARAGRAPH_SEPARATOR",
0193         'Cc' => "UNICODE_CONTROL",
0194         'Cf' => "UNICODE_FORMAT",
0195         'Cs' => "UNICODE_SURROGATE",
0196         'Co' => "UNICODE_PRIVATE_USE",
0197         'Cn' => "UNICODE_UNASSIGNED",
0198
0199         # Informative.
0200         'Lm' => "UNICODE_MODIFIER_LETTER",
0201         'Lo' => "UNICODE_OTHER_LETTER",
0202         'Pc' => "UNICODE_CONNECT_PUNCTUATION",
0203         'Pd' => "UNICODE_DASH_PUNCTUATION",
0204         'Ps' => "UNICODE_OPEN_PUNCTUATION",
0205         'Pe' => "UNICODE_CLOSE_PUNCTUATION",
0206         'Pi' => "UNICODE_INITIAL_PUNCTUATION",
0207         'Pf' => "UNICODE_FINAL_PUNCTUATION",
0208         'Po' => "UNICODE_OTHER_PUNCTUATION",
0209         'Sm' => "UNICODE_MATH_SYMBOL",
0210         'Sc' => "UNICODE_CURRENCY_SYMBOL",
0211         'Sk' => "UNICODE_MODIFIER_SYMBOL",
0212         'So' => "UNICODE_OTHER_SYMBOL"
0213     );
0214
0215     # these shouldn't be -1
0216     my ($codepoint, $last_codepoint, $start_codepoint) = (-999, -999, -999);
0217
0218     my ($category, $last_category) = ("FAKE1", "FAKE2");
0219     my ($started_range, $finished_range) = (undef, undef);
0220
0221     print $out "/* UnicodeCategories.h */\n";
0222     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
0223     print $out "/* Generated by $0 */\n";
0224     print $out "/* Generated from UCD version $v */\n\n";
0225
0226     print $out "#pragma once\n\n";
0227     print $out "#include <QtGlobal>\n\n";
0228     print $out "enum EUnicodeCategory\n";
0229     print $out "{\n";
0230     print $out "    UNICODE_UPPERCASE_LETTER,\n";
0231     print $out "    UNICODE_LOWERCASE_LETTER,\n";
0232     print $out "    UNICODE_TITLECASE_LETTER,\n";
0233     print $out "    UNICODE_NON_SPACING_MARK,\n";
0234     print $out "    UNICODE_COMBINING_MARK,\n";
0235     print $out "    UNICODE_ENCLOSING_MARK,\n";
0236     print $out "    UNICODE_DECIMAL_NUMBER,\n";
0237     print $out "    UNICODE_LETTER_NUMBER,\n";
0238     print $out "    UNICODE_OTHER_NUMBER,\n";
0239     print $out "    UNICODE_SPACE_SEPARATOR,\n";
0240     print $out "    UNICODE_LINE_SEPARATOR,\n";
0241     print $out "    UNICODE_PARAGRAPH_SEPARATOR,\n";
0242     print $out "    UNICODE_CONTROL,\n";
0243     print $out "    UNICODE_FORMAT,\n";
0244     print $out "    UNICODE_SURROGATE,\n";
0245     print $out "    UNICODE_PRIVATE_USE,\n";
0246     print $out "    UNICODE_UNASSIGNED,\n";
0247     print $out "    UNICODE_MODIFIER_LETTER,\n";
0248     print $out "    UNICODE_OTHER_LETTER,\n";
0249     print $out "    UNICODE_CONNECT_PUNCTUATION,\n";
0250     print $out "    UNICODE_DASH_PUNCTUATION,\n";
0251     print $out "    UNICODE_OPEN_PUNCTUATION,\n";
0252     print $out "    UNICODE_CLOSE_PUNCTUATION,\n";
0253     print $out "    UNICODE_INITIAL_PUNCTUATION,\n";
0254     print $out "    UNICODE_FINAL_PUNCTUATION,\n";
0255     print $out "    UNICODE_OTHER_PUNCTUATION,\n";
0256     print $out "    UNICODE_MATH_SYMBOL,\n";
0257     print $out "    UNICODE_CURRENCY_SYMBOL,\n";
0258     print $out "    UNICODE_MODIFIER_SYMBOL,\n";
0259     print $out "    UNICODE_OTHER_SYMBOL,\n";
0260     print $out "\n";
0261     print $out "    UNICODE_INVALID\n";
0262     print $out "};\n\n";
0263     print $out "struct TUnicodeCategory\n";
0264     print $out "{\n";
0265     print $out "    quint32 start;\n";
0266     print $out "    quint32 end;\n";
0267     print $out "    EUnicodeCategory category;\n";
0268     print $out "};\n\n";
0269     print $out "static const TUnicodeCategory constUnicodeCategoryList[] =\n";
0270     print $out "{\n";
0271
0272     while (my $line = <$unicodedata>)
0273     {
0274         $line =~ /^([0-9A-F]*);([^;]*);([^;]*);/ or die;
0275         my $codepoint = hex ($1);
0276         my $name = $2;
0277         my $category = $mappings{$3};
0278
0279         if ($finished_range
0280             or ($category ne $last_category)
0281             or (not $started_range and $codepoint != $last_codepoint + 1))
0282         {
0283             if ($last_codepoint >= 0) {
0284                 printf $out ("    { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category);
0285             }
0286
0287             $start_codepoint = $codepoint;
0288         }
0289
0290         if ($name =~ /^<.*First>$/) {
0291             $started_range = 1;
0292             $finished_range = undef;
0293         }
0294         elsif ($name =~ /^<.*Last>$/) {
0295             $started_range = undef;
0296             $finished_range = 1;
0297         }
0298         elsif ($finished_range) {
0299             $finished_range = undef;
0300         }
0301
0302         $last_codepoint = $codepoint;
0303         $last_category = $category;
0304     }
0305     printf $out ("    { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category);
0306     printf $out "    { 0x0, 0x0, UNICODE_INVALID }\n";
0307     print $out "};\n\n";
0308
0309
0310     close ($out);
0311     print " done.\n";
0312 }
0313
0314 #------------------------#
0315
0316 # XXX should do kFrequency too
0317 sub process_unihan_zip ($)
0318 {
0319     my ($unihan_zip) = @_;
0320
0321     open (my $unihan, "$UNZIP -c $unihan_zip |") or die;
0322     open (my $out, "> unicode-unihan.h") or die;
0323
0324     print "processing $unihan_zip";
0325
0326     print $out "/* unicode-unihan.h */\n";
0327     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
0328     print $out "/* Generated by $0 */\n";
0329     print $out "/* Generated from UCD version $v */\n\n";
0330
0331     print $out "#pragma once\n\n";
0332
0333     print $out "#include <glib/gunicode.h>\n\n";
0334
0335     print $out "typedef struct _Unihan Unihan;\n\n";
0336
0337     print $out "static const struct _Unihan\n";
0338     print $out "{\n";
0339     print $out "  gunichar index;\n";
0340     print $out "  gint32 kDefinition;\n";
0341     print $out "  gint32 kCantonese;\n";
0342     print $out "  gint32 kMandarin;\n";
0343     print $out "  gint32 kTang;\n";
0344     print $out "  gint32 kKorean;\n";
0345     print $out "  gint32 kJapaneseKun;\n";
0346     print $out "  gint32 kJapaneseOn;\n";
0347     print $out "} \n";
0348     print $out "unihan[] =\n";
0349     print $out "{\n";
0350
0351     my @strings;
0352     my $offset = 0;
0353
0354     my $wc = 0;
0355     my ($kDefinition, $kCantonese, $kMandarin, $kTang, $kKorean, $kJapaneseKun, $kJapaneseOn);
0356
0357     my $i = 0;
0358     while (my $line = <$unihan>)
0359     {
0360         chomp $line;
0361         $line =~ /^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$/ or next;
0362
0363         my $new_wc = hex ($1);
0364         my $field = $2;
0365
0366         my $value = $3;
0367         $value =~ s/\\/\\\\/g;
0368         $value =~ s/\"/\\"/g;
0369
0370         if ($new_wc != $wc)
0371         {
0372             if (defined $kDefinition or defined $kCantonese or defined $kMandarin
0373                 or defined $kTang or defined $kKorean or defined $kJapaneseKun
0374                 or defined $kJapaneseOn)
0375             {
0376                 printf $out ("  { 0x%04X, \%d, \%d, \%d, \%d, \%d, \%d, \%d },\n",
0377                              $wc,
0378                              (defined($kDefinition) ? $kDefinition : -1),
0379                              (defined($kCantonese) ? $kCantonese: -1),
0380                              (defined($kMandarin) ? $kMandarin : -1),
0381                              (defined($kTang) ? $kTang : -1),
0382                              (defined($kKorean) ? $kKorean : -1),
0383                              (defined($kJapaneseKun) ? $kJapaneseKun : -1),
0384                              (defined($kJapaneseOn) ? $kJapaneseOn : -1));
0385             }
0386
0387             $wc = $new_wc;
0388
0389             undef $kDefinition;
0390             undef $kCantonese;
0391             undef $kMandarin;
0392             undef $kTang;
0393             undef $kKorean;
0394             undef $kJapaneseKun;
0395             undef $kJapaneseOn;
0396         }
0397
0398         for my $f qw(kDefinition kCantonese kMandarin
0399                      kTang kKorean kJapaneseKun kJapaneseOn) {
0400
0401             if ($field eq $f) {
0402             push @strings, $value;
0403         my $last_offset = $offset;
0404         $offset += length($value) + 1;
0405         $value = $last_offset;
0406         last;
0407         }
0408     }
0409
0410         if ($field eq "kDefinition") {
0411             $kDefinition = $value;
0412         }
0413         elsif ($field eq "kCantonese") {
0414             $kCantonese = $value;
0415         }
0416         elsif ($field eq "kMandarin") {
0417             $kMandarin = $value;
0418         }
0419         elsif ($field eq "kTang") {
0420             $kTang = $value;
0421         }
0422         elsif ($field eq "kKorean") {
0423             $kKorean = $value;
0424         }
0425         elsif ($field eq "kJapaneseKun") {
0426             $kJapaneseKun = $value;
0427         }
0428         elsif ($field eq "kJapaneseOn") {
0429             $kJapaneseOn = $value;
0430         }
0431
0432         if ($i++ % 32768 == 0) {
0433             print ".";
0434         }
0435     }
0436
0437     print $out "};\n\n";
0438
0439     print $out "static const char unihan_strings[] = \\\n";
0440
0441     for my $s (@strings) {
0442     print $out "  \"$s\\0\"\n";
0443     }
0444     print $out ";\n\n";
0445
0446     print $out "static const Unihan *_get_unihan (gunichar uc)\n;";
0447
0448     for my $name qw(kDefinition kCantonese kMandarin
0449             kTang kKorean kJapaneseKun kJapaneseOn) {
0450     print $out <<EOT;
0451
0452 static inline const char * unihan_get_$name (const Unihan *uh)
0453 {
0454     gint32 offset = uh->$name;
0455     if (offset == -1)
0456       return NULL;
0457     return unihan_strings + offset;
0458 }
0459
0460 G_CONST_RETURN gchar *
0461 gucharmap_get_unicode_$name (gunichar uc)
0462 {
0463   const Unihan *uh = _get_unihan (uc);
0464   if (uh == NULL)
0465     return NULL;
0466   else
0467     return unihan_get_$name (uh);
0468 }
0469
0470 EOT
0471     }
0472
0473
0474     close ($unihan);
0475     close ($out);
0476
0477     print " done.\n";
0478 }
0479
0480 #------------------------#
0481
0482 # $nameslist_hash =
0483 # {
0484 #     0x0027 => { '=' => {
0485 #                          'index'  => 30,
0486 #                          'values' => [ 'APOSTROPHE-QUOTE', 'APL quote' ]
0487 #                        }
0488 #                 '*' => {
0489 #                          'index'  => 50,
0490 #                          'values' => [ 'neutral (vertical) glyph with mixed usage',
0491 #                                        '2019 is preferred for apostrophe',
0492 #                                        'preferred characters in English for paired quotation marks are 2018 & 2019'
0493 #                                      ]
0494 #                         }
0495 #                  # etc
0496 #                }
0497 #     # etc
0498 # };
0499 #
0500 sub process_nameslist_txt ($)
0501 {
0502     my ($nameslist_txt) = @_;
0503
0504     open (my $nameslist, "$ICONV -f 'ISO8859-1' -t 'UTF-8' $nameslist_txt |") or die;
0505
0506     print "processing $nameslist_txt...";
0507
0508     my ($equal_i, $ex_i, $star_i, $pound_i, $colon_i) = (0, 0, 0, 0, 0);
0509     my $wc = 0;
0510
0511     my $nameslist_hash;
0512
0513     while (my $line = <$nameslist>)
0514     {
0515         chomp ($line);
0516
0517         if ($line =~ /^@/)
0518         {
0519             next;
0520         }
0521         elsif ($line =~ /^([0-9A-F]+)/)
0522         {
0523             $wc = hex ($1);
0524         }
0525         elsif ($line =~ /^\s+=\s+(.+)$/)
0526         {
0527             my $value = $1;
0528             $value =~ s/\\/\\\\/g;
0529             $value =~ s/\"/\\"/g;
0530
0531             if (not defined $nameslist_hash->{$wc}->{'='}->{'index'}) {
0532                 $nameslist_hash->{$wc}->{'='}->{'index'} = $equal_i;
0533             }
0534             push (@{$nameslist_hash->{$wc}->{'='}->{'values'}}, $value);
0535
0536             $equal_i++;
0537         }
0538         elsif ($line =~ /^\s+\*\s+(.+)$/)
0539         {
0540             my $value = $1;
0541             $value =~ s/\\/\\\\/g;
0542             $value =~ s/\"/\\"/g;
0543
0544             if (not defined $nameslist_hash->{$wc}->{'*'}->{'index'}) {
0545                 $nameslist_hash->{$wc}->{'*'}->{'index'} = $star_i;
0546             }
0547             push (@{$nameslist_hash->{$wc}->{'*'}->{'values'}}, $value);
0548
0549             $star_i++;
0550         }
0551         elsif ($line =~ /^\s+#\s+(.+)$/)
0552         {
0553             my $value = $1;
0554             $value =~ s/\\/\\\\/g;
0555             $value =~ s/\"/\\"/g;
0556
0557             if (not defined $nameslist_hash->{$wc}->{'#'}->{'index'}) {
0558                 $nameslist_hash->{$wc}->{'#'}->{'index'} = $pound_i;
0559             }
0560             push (@{$nameslist_hash->{$wc}->{'#'}->{'values'}}, $value);
0561
0562             $pound_i++;
0563         }
0564         elsif ($line =~ /^\s+:\s+(.+)$/)
0565         {
0566             my $value = $1;
0567             $value =~ s/\\/\\\\/g;
0568             $value =~ s/\"/\\"/g;
0569
0570             if (not defined $nameslist_hash->{$wc}->{':'}->{'index'}) {
0571                 $nameslist_hash->{$wc}->{':'}->{'index'} = $colon_i;
0572             }
0573             push (@{$nameslist_hash->{$wc}->{':'}->{'values'}}, $value);
0574
0575             $colon_i++;
0576         }
0577         elsif ($line =~ /^\s+x\s+.*([0-9A-F]{4,6})\)$/)  # this one is different
0578         {
0579             my $value = hex ($1);
0580
0581             if (not defined $nameslist_hash->{$wc}->{'x'}->{'index'}) {
0582                 $nameslist_hash->{$wc}->{'x'}->{'index'} = $ex_i;
0583             }
0584             push (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}, $value);
0585
0586             $ex_i++;
0587         }
0588     }
0589
0590     close ($nameslist);
0591
0592     open (my $out, "> unicode-nameslist.h") or die;
0593
0594     print $out "/* unicode-nameslist.h */\n";
0595     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
0596     print $out "/* Generated by $0 */\n";
0597     print $out "/* Generated from UCD version $v */\n\n";
0598
0599     print $out "#pragma once\n\n";
0600
0601     print $out "#include <glib/gunicode.h>\n\n";
0602
0603     print $out "typedef struct _UnicharString UnicharString;\n";
0604     print $out "typedef struct _UnicharUnichar UnicharUnichar;\n";
0605     print $out "typedef struct _NamesList NamesList;\n\n";
0606
0607     print $out "struct _UnicharString\n";
0608     print $out "{\n";
0609     print $out "  gunichar index;\n";
0610     print $out "  const gchar *value;\n";
0611     print $out "}; \n\n";
0612
0613     print $out "struct _UnicharUnichar\n";
0614     print $out "{\n";
0615     print $out "  gunichar index;\n";
0616     print $out "  gunichar value;\n";
0617     print $out "}; \n\n";
0618
0619     print $out "struct _NamesList\n";
0620     print $out "{\n";
0621     print $out "  gunichar index;\n";
0622     print $out "  gint equals_index;  /* -1 means */\n";
0623     print $out "  gint stars_index;   /* this character */\n";
0624     print $out "  gint exes_index;    /* doesn't */\n";
0625     print $out "  gint pounds_index;  /* have any */\n";
0626     print $out "  gint colons_index;\n";
0627     print $out "};\n\n";
0628
0629     print $out "static const UnicharString names_list_equals[] = \n";
0630     print $out "{\n";
0631     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
0632     {
0633         next if not exists $nameslist_hash->{$wc}->{'='};
0634         for my $value (@{$nameslist_hash->{$wc}->{'='}->{'values'}}) {
0635             printf $out (qq/  { 0x%04X, "\%s" },\n/, $wc, $value);
0636         }
0637     }
0638     print $out "  { (gunichar)(-1), 0 }\n";
0639     print $out "};\n\n";
0640
0641     print $out "static const UnicharString names_list_stars[] = \n";
0642     print $out "{\n";
0643     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
0644     {
0645         next if not exists $nameslist_hash->{$wc}->{'*'};
0646         for my $value (@{$nameslist_hash->{$wc}->{'*'}->{'values'}}) {
0647             printf $out (qq/  { 0x%04X, "\%s" },\n/, $wc, $value);
0648         }
0649     }
0650     print $out "  { (gunichar)(-1), 0 }\n";
0651     print $out "};\n\n";
0652
0653     print $out "static const UnicharString names_list_pounds[] = \n";
0654     print $out "{\n";
0655     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
0656     {
0657         next if not exists $nameslist_hash->{$wc}->{'#'};
0658         for my $value (@{$nameslist_hash->{$wc}->{'#'}->{'values'}}) {
0659             printf $out (qq/  { 0x%04X, "\%s" },\n/, $wc, $value);
0660         }
0661     }
0662     print $out "  { (gunichar)(-1), 0 }\n";
0663     print $out "};\n\n";
0664
0665     print $out "static const UnicharUnichar names_list_exes[] = \n";
0666     print $out "{\n";
0667     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
0668     {
0669         next if not exists $nameslist_hash->{$wc}->{'x'};
0670         for my $value (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}) {
0671             printf $out (qq/  { 0x%04X, 0x%04X },\n/, $wc, $value);
0672         }
0673     }
0674     print $out "  { (gunichar)(-1), 0 }\n";
0675     print $out "};\n\n";
0676
0677     print $out "static const UnicharString names_list_colons[] = \n";
0678     print $out "{\n";
0679     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
0680     {
0681         next if not exists $nameslist_hash->{$wc}->{':'};
0682         for my $value (@{$nameslist_hash->{$wc}->{':'}->{'values'}}) {
0683             printf $out (qq/  { 0x%04X, "\%s" },\n/, $wc, $value);
0684         }
0685     }
0686     print $out "  { (gunichar)(-1), 0 }\n";
0687     print $out "};\n\n";
0688
0689     print $out "static const NamesList names_list[] =\n";
0690     print $out "{\n";
0691     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
0692     {
0693         my $eq    = exists $nameslist_hash->{$wc}->{'='}->{'index'} ? $nameslist_hash->{$wc}->{'='}->{'index'} : -1;
0694         my $star  = exists $nameslist_hash->{$wc}->{'*'}->{'index'} ? $nameslist_hash->{$wc}->{'*'}->{'index'} : -1;
0695         my $ex    = exists $nameslist_hash->{$wc}->{'x'}->{'index'} ? $nameslist_hash->{$wc}->{'x'}->{'index'} : -1;
0696         my $pound = exists $nameslist_hash->{$wc}->{'#'}->{'index'} ? $nameslist_hash->{$wc}->{'#'}->{'index'} : -1;
0697         my $colon = exists $nameslist_hash->{$wc}->{':'}->{'index'} ? $nameslist_hash->{$wc}->{':'}->{'index'} : -1;
0698
0699         printf $out ("  { 0x%04X, \%d, \%d, \%d, \%d, \%d },\n", $wc, $eq, $star, $ex, $pound, $colon);
0700     }
0701     print $out "};\n\n";
0702
0703
0704     close ($out);
0705
0706     print " done.\n";
0707 }
0708
0709 #------------------------#
0710
0711 sub process_blocks_txt ($)
0712 {
0713     my ($blocks_txt) = @_;
0714
0715     open (my $blocks, $blocks_txt) or die;
0716     open (my $out, "> UnicodeBlocks.h") or die;
0717
0718     print "processing $blocks_txt...";
0719
0720     print $out "/* UnicodeBlocks.h */\n";
0721     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
0722     print $out "/* Generated by $0 */\n";
0723     print $out "/* Generated from UCD version $v */\n\n";
0724
0725     print $out "#pragma once\n\n";
0726
0727     print $out "#include <qglobal.h>\n";
0728     print $out "#include <KLocalizedString>\n\n";
0729
0730     print $out "struct TUnicodeBlock\n";
0731     print $out "{\n";
0732     print $out "    quint32    start,\n";
0733     print $out "               end;\n";
0734     print $out "    const char *blockName;\n";
0735     print $out "};\n\n";
0736     print $out "static const struct TUnicodeBlock constUnicodeBlocks[] =\n";
0737     print $out "{\n";
0738     while (my $line = <$blocks>)
0739     {
0740         $line =~ /^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$/ or next;
0741         print $out qq/    { 0x$1, 0x$2, kli18n("$3") },\n/;
0742     }
0743     print $out "    { 0x0, 0x0, NULL }\n";
0744     print $out "};\n\n";
0745
0746
0747     close ($blocks);
0748     close ($out);
0749
0750     print " done.\n";
0751 }
0752
0753 #------------------------#
0754
0755 sub process_scripts_txt ($)
0756 {
0757     my ($scripts_txt) = @_;
0758
0759     my %script_hash;
0760     my %scripts;
0761
0762     open (my $scripts, $scripts_txt) or die;
0763     open (my $out, "> UnicodeScripts.h") or die;
0764
0765     print "processing $scripts_txt...";
0766
0767     while (my $line = <$scripts>)
0768     {
0769         my ($start, $end, $raw_script);
0770
0771         if ($line =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\S+)/)
0772         {
0773             $start = hex ($1);
0774             $end = hex ($2);
0775             $raw_script = $3;
0776         }
0777         elsif ($line =~ /^([0-9A-F]+)\s+;\s+(\S+)/)
0778         {
0779             $start = hex ($1);
0780             $end = $start;
0781             $raw_script = $2;
0782         }
0783         else
0784         {
0785             next;
0786         }
0787
0788         my $script = $raw_script;
0789         $script =~ tr/_/ /;
0790         $script =~ s/(\w+)/\u\L$1/g;
0791
0792         $script_hash{$start} = { 'end' => $end, 'script' => $script };
0793         $scripts{$script} = 1;
0794     }
0795
0796     close ($scripts);
0797
0798     # Adds Common to make sure works with UCD <= 4.0.0
0799     $scripts{"Common"} = 1;
0800
0801     print $out "/* UnicodeScripts.h */\n";
0802     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
0803     print $out "/* Generated by $0 */\n";
0804     print $out "/* Generated from UCD version $v */\n\n";
0805
0806     print $out "#pragma once\n\n";
0807
0808     print $out "#include <QtGlobal>\n";
0809     print $out "#include <KLocalizedString>\n\n";
0810
0811     print $out "static const char * const constUnicodeScriptList[] =\n";
0812     print $out "{\n";
0813     my $i = 0;
0814     for my $script (sort keys %scripts)
0815     {
0816         $scripts{$script} = $i;
0817         print $out qq/    kli18n("$script"),\n/;
0818         $i++;
0819     }
0820     print $out "    NULL\n";
0821     print $out "};\n\n";
0822
0823     print $out "struct TUnicodeScript\n";
0824     print $out "{\n";
0825     print $out "    quint32 start,\n";
0826     print $out "            end;\n";
0827     print $out "    int     scriptIndex;   /* index into constUnicodeScriptList */\n";
0828     print $out "};\n\n";
0829     print $out "static const TUnicodeScript constUnicodeScripts[] =\n";
0830     print $out "{\n";
0831     for my $start (sort { $a <=> $b } keys %script_hash)
0832     {
0833         printf $out (qq/    { 0x%04X, 0x%04X, \%2d },\n/,
0834                      $start, $script_hash{$start}->{'end'}, $scripts{$script_hash{$start}->{'script'}});
0835     }
0836     printf $out "    { 0x0, 0x0, -1 }\n";
0837     print $out "};\n\n";
0838
0839
0840     close ($out);
0841     print " done.\n";
0842 }