File indexing completed on 2024-06-02 05:43:57
0001 #!/usr/bin/perl -w 0002 # 0003 # Note: This file is taken, and modified, from gucharmap/gen-guch-unicode-tables.pl - svn revision 1040 0004 # 0005 # $Id$ 0006 # 0007 # generates in the current directory: 0008 # - UnicodeBlocks.h 0009 # - unicode-names.h 0010 # - unicode-nameslist.h 0011 # - unicode-unihan.h 0012 # - UnicodeCategories.h 0013 # - UnicodeScripts.h 0014 # 0015 # usage: ./gen-guch-unicode-tables.pl UNICODE-VERSION DIRECTORY 0016 # where DIRECTORY contains UnicodeData.txt Unihan.zip NamesList.txt Blocks.txt Scripts.txt 0017 # 0018 0019 use strict; 0020 use vars ('$UNZIP', '$ICONV'); 0021 0022 # if these things aren't in your path you can put full paths to them here 0023 $UNZIP = 'unzip'; 0024 $ICONV = 'iconv'; 0025 0026 sub process_unicode_data_txt ($); 0027 sub process_unihan_zip ($); 0028 sub process_nameslist_txt ($); 0029 sub process_blocks_txt ($); 0030 sub process_scripts_txt ($); 0031 0032 $| = 1; # flush stdout buffer 0033 0034 if (@ARGV != 2) 0035 { 0036 $0 =~ s@.*/@@; 0037 die <<EOF 0038 0039 Usage: $0 UNICODE-VERSION DIRECTORY 0040 0041 DIRECTORY should contain the following Unicode data files: 0042 UnicodeData.txt Unihan.zip NamesList.txt Blocks.txt Scripts.txt 0043 0044 which can be found at https://www.unicode.org/Public/UNIDATA/ 0045 0046 EOF 0047 } 0048 0049 my ($unicodedata_txt, $unihan_zip, $nameslist_txt, $blocks_txt, $scripts_txt); 0050 0051 my $v = $ARGV[0]; 0052 my $d = $ARGV[1]; 0053 opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n"; 0054 for my $f (readdir ($dir)) 0055 { 0056 $unicodedata_txt = "$d/$f" if ($f =~ /UnicodeData.*\.txt/); 0057 # $unihan_zip = "$d/$f" if ($f =~ /Unihan.*\.zip/); 0058 # $nameslist_txt = "$d/$f" if ($f =~ /NamesList.*\.txt/); 0059 $blocks_txt = "$d/$f" if ($f =~ /Blocks.*\.txt/); 0060 $scripts_txt = "$d/$f" if ($f =~ /Scripts.*\.txt/); 0061 } 0062 0063 defined $unicodedata_txt or die "Did not find $d/UnicodeData.txt"; 0064 # defined $unihan_zip or die "Did not find $d/Unihan.zip"; 0065 # defined $nameslist_txt or die "Did not find $d/NamesList.txt"; 0066 defined $blocks_txt or die "Did not find $d/Blocks.txt"; 0067 defined $scripts_txt or die "Did not find $d/Scripts.txt"; 0068 0069 process_unicode_data_txt ($unicodedata_txt); 0070 # process_nameslist_txt ($nameslist_txt); 0071 process_blocks_txt ($blocks_txt); 0072 process_scripts_txt ($scripts_txt); 0073 # process_unihan_zip ($unihan_zip); 0074 0075 exit; 0076 0077 0078 #------------------------# 0079 0080 sub process_unicode_data_txt ($) 0081 { 0082 my ($unicodedata_txt) = @_; 0083 0084 # part 1: names 0085 0086 open (my $unicodedata, $unicodedata_txt) or die; 0087 # open (my $out, "> unicode-names.h") or die; 0088 0089 print "processing $unicodedata_txt..."; 0090 # 0091 # print $out "/* unicode-names.h */\n"; 0092 # print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n"; 0093 # print $out "/* Generated by $0 */\n"; 0094 # print $out "/* Generated from UCD version $v */\n\n"; 0095 # 0096 # print $out "#pragma once\n\n"; 0097 # 0098 # print $out "#include <glib/gunicode.h>\n\n"; 0099 # print $out "#include \"gucharmap-intl.h\"\n\n"; 0100 # 0101 # my @unicode_pairs; 0102 # my %names; 0103 # 0104 # while (my $line = <$unicodedata>) 0105 # { 0106 # chomp $line; 0107 # $line =~ /^([^;]+);([^;]+)/ or die; 0108 # 0109 # my $hex = $1; 0110 # my $name = $2; 0111 # 0112 # $names{$name} = 1; 0113 # push @unicode_pairs, [$hex, $name]; 0114 # } 0115 # 0116 # print $out "static const char unicode_names_strings[] = \\\n"; 0117 # 0118 # my $offset = 0; 0119 # 0120 # foreach my $name (sort keys %names) { 0121 # print $out " \"$name\\0\"\n"; 0122 # $names{$name} = $offset; 0123 # $offset += length($name) + 1; 0124 # } 0125 # 0126 # undef $offset; 0127 # 0128 # print $out ";\n"; 0129 # 0130 # print $out "typedef struct _UnicodeName UnicodeName;\n\n"; 0131 # 0132 # print $out "static const struct _UnicodeName\n"; 0133 # print $out "{\n"; 0134 # print $out " gunichar index;\n"; 0135 # print $out " guint32 name_offset;\n"; 0136 # print $out "} \n"; 0137 # print $out "unicode_names[] =\n"; 0138 # print $out "{\n"; 0139 # 0140 # my $first_line = 1; 0141 # 0142 # foreach my $pair (@unicode_pairs) { 0143 # if (!$first_line) { 0144 # print $out ",\n"; 0145 # } else { 0146 # $first_line = 0; 0147 # } 0148 # 0149 # my ($hex, $name) = @{$pair}; 0150 # my $offset = $names{$name}; 0151 # print $out " {0x$hex, $offset}"; 0152 # } 0153 # 0154 # print $out "\n};\n\n"; 0155 # 0156 # print $out <<EOT; 0157 # static inline const char * unicode_name_get_name(const UnicodeName *entry) 0158 # { 0159 # guint32 offset = entry->name_offset; 0160 # return unicode_names_strings + offset; 0161 # } 0162 # 0163 # EOT 0164 # 0165 # 0166 # undef %names; 0167 # undef @unicode_pairs; 0168 # 0169 # close ($unicodedata); 0170 # close ($out); 0171 0172 # part 2: categories 0173 0174 open ($unicodedata, $unicodedata_txt) or die; 0175 open (my $out, "> UnicodeCategories.h") or die; 0176 0177 # Map general category code onto symbolic name. 0178 my %mappings = 0179 ( 0180 # Normative. 0181 'Lu' => "UNICODE_UPPERCASE_LETTER", 0182 'Ll' => "UNICODE_LOWERCASE_LETTER", 0183 'Lt' => "UNICODE_TITLECASE_LETTER", 0184 'Mn' => "UNICODE_NON_SPACING_MARK", 0185 'Mc' => "UNICODE_COMBINING_MARK", 0186 'Me' => "UNICODE_ENCLOSING_MARK", 0187 'Nd' => "UNICODE_DECIMAL_NUMBER", 0188 'Nl' => "UNICODE_LETTER_NUMBER", 0189 'No' => "UNICODE_OTHER_NUMBER", 0190 'Zs' => "UNICODE_SPACE_SEPARATOR", 0191 'Zl' => "UNICODE_LINE_SEPARATOR", 0192 'Zp' => "UNICODE_PARAGRAPH_SEPARATOR", 0193 'Cc' => "UNICODE_CONTROL", 0194 'Cf' => "UNICODE_FORMAT", 0195 'Cs' => "UNICODE_SURROGATE", 0196 'Co' => "UNICODE_PRIVATE_USE", 0197 'Cn' => "UNICODE_UNASSIGNED", 0198 0199 # Informative. 0200 'Lm' => "UNICODE_MODIFIER_LETTER", 0201 'Lo' => "UNICODE_OTHER_LETTER", 0202 'Pc' => "UNICODE_CONNECT_PUNCTUATION", 0203 'Pd' => "UNICODE_DASH_PUNCTUATION", 0204 'Ps' => "UNICODE_OPEN_PUNCTUATION", 0205 'Pe' => "UNICODE_CLOSE_PUNCTUATION", 0206 'Pi' => "UNICODE_INITIAL_PUNCTUATION", 0207 'Pf' => "UNICODE_FINAL_PUNCTUATION", 0208 'Po' => "UNICODE_OTHER_PUNCTUATION", 0209 'Sm' => "UNICODE_MATH_SYMBOL", 0210 'Sc' => "UNICODE_CURRENCY_SYMBOL", 0211 'Sk' => "UNICODE_MODIFIER_SYMBOL", 0212 'So' => "UNICODE_OTHER_SYMBOL" 0213 ); 0214 0215 # these shouldn't be -1 0216 my ($codepoint, $last_codepoint, $start_codepoint) = (-999, -999, -999); 0217 0218 my ($category, $last_category) = ("FAKE1", "FAKE2"); 0219 my ($started_range, $finished_range) = (undef, undef); 0220 0221 print $out "/* UnicodeCategories.h */\n"; 0222 print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n"; 0223 print $out "/* Generated by $0 */\n"; 0224 print $out "/* Generated from UCD version $v */\n\n"; 0225 0226 print $out "#pragma once\n\n"; 0227 print $out "#include <QtGlobal>\n\n"; 0228 print $out "enum EUnicodeCategory\n"; 0229 print $out "{\n"; 0230 print $out " UNICODE_UPPERCASE_LETTER,\n"; 0231 print $out " UNICODE_LOWERCASE_LETTER,\n"; 0232 print $out " UNICODE_TITLECASE_LETTER,\n"; 0233 print $out " UNICODE_NON_SPACING_MARK,\n"; 0234 print $out " UNICODE_COMBINING_MARK,\n"; 0235 print $out " UNICODE_ENCLOSING_MARK,\n"; 0236 print $out " UNICODE_DECIMAL_NUMBER,\n"; 0237 print $out " UNICODE_LETTER_NUMBER,\n"; 0238 print $out " UNICODE_OTHER_NUMBER,\n"; 0239 print $out " UNICODE_SPACE_SEPARATOR,\n"; 0240 print $out " UNICODE_LINE_SEPARATOR,\n"; 0241 print $out " UNICODE_PARAGRAPH_SEPARATOR,\n"; 0242 print $out " UNICODE_CONTROL,\n"; 0243 print $out " UNICODE_FORMAT,\n"; 0244 print $out " UNICODE_SURROGATE,\n"; 0245 print $out " UNICODE_PRIVATE_USE,\n"; 0246 print $out " UNICODE_UNASSIGNED,\n"; 0247 print $out " UNICODE_MODIFIER_LETTER,\n"; 0248 print $out " UNICODE_OTHER_LETTER,\n"; 0249 print $out " UNICODE_CONNECT_PUNCTUATION,\n"; 0250 print $out " UNICODE_DASH_PUNCTUATION,\n"; 0251 print $out " UNICODE_OPEN_PUNCTUATION,\n"; 0252 print $out " UNICODE_CLOSE_PUNCTUATION,\n"; 0253 print $out " UNICODE_INITIAL_PUNCTUATION,\n"; 0254 print $out " UNICODE_FINAL_PUNCTUATION,\n"; 0255 print $out " UNICODE_OTHER_PUNCTUATION,\n"; 0256 print $out " UNICODE_MATH_SYMBOL,\n"; 0257 print $out " UNICODE_CURRENCY_SYMBOL,\n"; 0258 print $out " UNICODE_MODIFIER_SYMBOL,\n"; 0259 print $out " UNICODE_OTHER_SYMBOL,\n"; 0260 print $out "\n"; 0261 print $out " UNICODE_INVALID\n"; 0262 print $out "};\n\n"; 0263 print $out "struct TUnicodeCategory\n"; 0264 print $out "{\n"; 0265 print $out " quint32 start;\n"; 0266 print $out " quint32 end;\n"; 0267 print $out " EUnicodeCategory category;\n"; 0268 print $out "};\n\n"; 0269 print $out "static const TUnicodeCategory constUnicodeCategoryList[] =\n"; 0270 print $out "{\n"; 0271 0272 while (my $line = <$unicodedata>) 0273 { 0274 $line =~ /^([0-9A-F]*);([^;]*);([^;]*);/ or die; 0275 my $codepoint = hex ($1); 0276 my $name = $2; 0277 my $category = $mappings{$3}; 0278 0279 if ($finished_range 0280 or ($category ne $last_category) 0281 or (not $started_range and $codepoint != $last_codepoint + 1)) 0282 { 0283 if ($last_codepoint >= 0) { 0284 printf $out (" { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category); 0285 } 0286 0287 $start_codepoint = $codepoint; 0288 } 0289 0290 if ($name =~ /^<.*First>$/) { 0291 $started_range = 1; 0292 $finished_range = undef; 0293 } 0294 elsif ($name =~ /^<.*Last>$/) { 0295 $started_range = undef; 0296 $finished_range = 1; 0297 } 0298 elsif ($finished_range) { 0299 $finished_range = undef; 0300 } 0301 0302 $last_codepoint = $codepoint; 0303 $last_category = $category; 0304 } 0305 printf $out (" { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category); 0306 printf $out " { 0x0, 0x0, UNICODE_INVALID }\n"; 0307 print $out "};\n\n"; 0308 0309 0310 close ($out); 0311 print " done.\n"; 0312 } 0313 0314 #------------------------# 0315 0316 # XXX should do kFrequency too 0317 sub process_unihan_zip ($) 0318 { 0319 my ($unihan_zip) = @_; 0320 0321 open (my $unihan, "$UNZIP -c $unihan_zip |") or die; 0322 open (my $out, "> unicode-unihan.h") or die; 0323 0324 print "processing $unihan_zip"; 0325 0326 print $out "/* unicode-unihan.h */\n"; 0327 print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n"; 0328 print $out "/* Generated by $0 */\n"; 0329 print $out "/* Generated from UCD version $v */\n\n"; 0330 0331 print $out "#pragma once\n\n"; 0332 0333 print $out "#include <glib/gunicode.h>\n\n"; 0334 0335 print $out "typedef struct _Unihan Unihan;\n\n"; 0336 0337 print $out "static const struct _Unihan\n"; 0338 print $out "{\n"; 0339 print $out " gunichar index;\n"; 0340 print $out " gint32 kDefinition;\n"; 0341 print $out " gint32 kCantonese;\n"; 0342 print $out " gint32 kMandarin;\n"; 0343 print $out " gint32 kTang;\n"; 0344 print $out " gint32 kKorean;\n"; 0345 print $out " gint32 kJapaneseKun;\n"; 0346 print $out " gint32 kJapaneseOn;\n"; 0347 print $out "} \n"; 0348 print $out "unihan[] =\n"; 0349 print $out "{\n"; 0350 0351 my @strings; 0352 my $offset = 0; 0353 0354 my $wc = 0; 0355 my ($kDefinition, $kCantonese, $kMandarin, $kTang, $kKorean, $kJapaneseKun, $kJapaneseOn); 0356 0357 my $i = 0; 0358 while (my $line = <$unihan>) 0359 { 0360 chomp $line; 0361 $line =~ /^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$/ or next; 0362 0363 my $new_wc = hex ($1); 0364 my $field = $2; 0365 0366 my $value = $3; 0367 $value =~ s/\\/\\\\/g; 0368 $value =~ s/\"/\\"/g; 0369 0370 if ($new_wc != $wc) 0371 { 0372 if (defined $kDefinition or defined $kCantonese or defined $kMandarin 0373 or defined $kTang or defined $kKorean or defined $kJapaneseKun 0374 or defined $kJapaneseOn) 0375 { 0376 printf $out (" { 0x%04X, \%d, \%d, \%d, \%d, \%d, \%d, \%d },\n", 0377 $wc, 0378 (defined($kDefinition) ? $kDefinition : -1), 0379 (defined($kCantonese) ? $kCantonese: -1), 0380 (defined($kMandarin) ? $kMandarin : -1), 0381 (defined($kTang) ? $kTang : -1), 0382 (defined($kKorean) ? $kKorean : -1), 0383 (defined($kJapaneseKun) ? $kJapaneseKun : -1), 0384 (defined($kJapaneseOn) ? $kJapaneseOn : -1)); 0385 } 0386 0387 $wc = $new_wc; 0388 0389 undef $kDefinition; 0390 undef $kCantonese; 0391 undef $kMandarin; 0392 undef $kTang; 0393 undef $kKorean; 0394 undef $kJapaneseKun; 0395 undef $kJapaneseOn; 0396 } 0397 0398 for my $f qw(kDefinition kCantonese kMandarin 0399 kTang kKorean kJapaneseKun kJapaneseOn) { 0400 0401 if ($field eq $f) { 0402 push @strings, $value; 0403 my $last_offset = $offset; 0404 $offset += length($value) + 1; 0405 $value = $last_offset; 0406 last; 0407 } 0408 } 0409 0410 if ($field eq "kDefinition") { 0411 $kDefinition = $value; 0412 } 0413 elsif ($field eq "kCantonese") { 0414 $kCantonese = $value; 0415 } 0416 elsif ($field eq "kMandarin") { 0417 $kMandarin = $value; 0418 } 0419 elsif ($field eq "kTang") { 0420 $kTang = $value; 0421 } 0422 elsif ($field eq "kKorean") { 0423 $kKorean = $value; 0424 } 0425 elsif ($field eq "kJapaneseKun") { 0426 $kJapaneseKun = $value; 0427 } 0428 elsif ($field eq "kJapaneseOn") { 0429 $kJapaneseOn = $value; 0430 } 0431 0432 if ($i++ % 32768 == 0) { 0433 print "."; 0434 } 0435 } 0436 0437 print $out "};\n\n"; 0438 0439 print $out "static const char unihan_strings[] = \\\n"; 0440 0441 for my $s (@strings) { 0442 print $out " \"$s\\0\"\n"; 0443 } 0444 print $out ";\n\n"; 0445 0446 print $out "static const Unihan *_get_unihan (gunichar uc)\n;"; 0447 0448 for my $name qw(kDefinition kCantonese kMandarin 0449 kTang kKorean kJapaneseKun kJapaneseOn) { 0450 print $out <<EOT; 0451 0452 static inline const char * unihan_get_$name (const Unihan *uh) 0453 { 0454 gint32 offset = uh->$name; 0455 if (offset == -1) 0456 return NULL; 0457 return unihan_strings + offset; 0458 } 0459 0460 G_CONST_RETURN gchar * 0461 gucharmap_get_unicode_$name (gunichar uc) 0462 { 0463 const Unihan *uh = _get_unihan (uc); 0464 if (uh == NULL) 0465 return NULL; 0466 else 0467 return unihan_get_$name (uh); 0468 } 0469 0470 EOT 0471 } 0472 0473 0474 close ($unihan); 0475 close ($out); 0476 0477 print " done.\n"; 0478 } 0479 0480 #------------------------# 0481 0482 # $nameslist_hash = 0483 # { 0484 # 0x0027 => { '=' => { 0485 # 'index' => 30, 0486 # 'values' => [ 'APOSTROPHE-QUOTE', 'APL quote' ] 0487 # } 0488 # '*' => { 0489 # 'index' => 50, 0490 # 'values' => [ 'neutral (vertical) glyph with mixed usage', 0491 # '2019 is preferred for apostrophe', 0492 # 'preferred characters in English for paired quotation marks are 2018 & 2019' 0493 # ] 0494 # } 0495 # # etc 0496 # } 0497 # # etc 0498 # }; 0499 # 0500 sub process_nameslist_txt ($) 0501 { 0502 my ($nameslist_txt) = @_; 0503 0504 open (my $nameslist, "$ICONV -f 'ISO8859-1' -t 'UTF-8' $nameslist_txt |") or die; 0505 0506 print "processing $nameslist_txt..."; 0507 0508 my ($equal_i, $ex_i, $star_i, $pound_i, $colon_i) = (0, 0, 0, 0, 0); 0509 my $wc = 0; 0510 0511 my $nameslist_hash; 0512 0513 while (my $line = <$nameslist>) 0514 { 0515 chomp ($line); 0516 0517 if ($line =~ /^@/) 0518 { 0519 next; 0520 } 0521 elsif ($line =~ /^([0-9A-F]+)/) 0522 { 0523 $wc = hex ($1); 0524 } 0525 elsif ($line =~ /^\s+=\s+(.+)$/) 0526 { 0527 my $value = $1; 0528 $value =~ s/\\/\\\\/g; 0529 $value =~ s/\"/\\"/g; 0530 0531 if (not defined $nameslist_hash->{$wc}->{'='}->{'index'}) { 0532 $nameslist_hash->{$wc}->{'='}->{'index'} = $equal_i; 0533 } 0534 push (@{$nameslist_hash->{$wc}->{'='}->{'values'}}, $value); 0535 0536 $equal_i++; 0537 } 0538 elsif ($line =~ /^\s+\*\s+(.+)$/) 0539 { 0540 my $value = $1; 0541 $value =~ s/\\/\\\\/g; 0542 $value =~ s/\"/\\"/g; 0543 0544 if (not defined $nameslist_hash->{$wc}->{'*'}->{'index'}) { 0545 $nameslist_hash->{$wc}->{'*'}->{'index'} = $star_i; 0546 } 0547 push (@{$nameslist_hash->{$wc}->{'*'}->{'values'}}, $value); 0548 0549 $star_i++; 0550 } 0551 elsif ($line =~ /^\s+#\s+(.+)$/) 0552 { 0553 my $value = $1; 0554 $value =~ s/\\/\\\\/g; 0555 $value =~ s/\"/\\"/g; 0556 0557 if (not defined $nameslist_hash->{$wc}->{'#'}->{'index'}) { 0558 $nameslist_hash->{$wc}->{'#'}->{'index'} = $pound_i; 0559 } 0560 push (@{$nameslist_hash->{$wc}->{'#'}->{'values'}}, $value); 0561 0562 $pound_i++; 0563 } 0564 elsif ($line =~ /^\s+:\s+(.+)$/) 0565 { 0566 my $value = $1; 0567 $value =~ s/\\/\\\\/g; 0568 $value =~ s/\"/\\"/g; 0569 0570 if (not defined $nameslist_hash->{$wc}->{':'}->{'index'}) { 0571 $nameslist_hash->{$wc}->{':'}->{'index'} = $colon_i; 0572 } 0573 push (@{$nameslist_hash->{$wc}->{':'}->{'values'}}, $value); 0574 0575 $colon_i++; 0576 } 0577 elsif ($line =~ /^\s+x\s+.*([0-9A-F]{4,6})\)$/) # this one is different 0578 { 0579 my $value = hex ($1); 0580 0581 if (not defined $nameslist_hash->{$wc}->{'x'}->{'index'}) { 0582 $nameslist_hash->{$wc}->{'x'}->{'index'} = $ex_i; 0583 } 0584 push (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}, $value); 0585 0586 $ex_i++; 0587 } 0588 } 0589 0590 close ($nameslist); 0591 0592 open (my $out, "> unicode-nameslist.h") or die; 0593 0594 print $out "/* unicode-nameslist.h */\n"; 0595 print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n"; 0596 print $out "/* Generated by $0 */\n"; 0597 print $out "/* Generated from UCD version $v */\n\n"; 0598 0599 print $out "#pragma once\n\n"; 0600 0601 print $out "#include <glib/gunicode.h>\n\n"; 0602 0603 print $out "typedef struct _UnicharString UnicharString;\n"; 0604 print $out "typedef struct _UnicharUnichar UnicharUnichar;\n"; 0605 print $out "typedef struct _NamesList NamesList;\n\n"; 0606 0607 print $out "struct _UnicharString\n"; 0608 print $out "{\n"; 0609 print $out " gunichar index;\n"; 0610 print $out " const gchar *value;\n"; 0611 print $out "}; \n\n"; 0612 0613 print $out "struct _UnicharUnichar\n"; 0614 print $out "{\n"; 0615 print $out " gunichar index;\n"; 0616 print $out " gunichar value;\n"; 0617 print $out "}; \n\n"; 0618 0619 print $out "struct _NamesList\n"; 0620 print $out "{\n"; 0621 print $out " gunichar index;\n"; 0622 print $out " gint equals_index; /* -1 means */\n"; 0623 print $out " gint stars_index; /* this character */\n"; 0624 print $out " gint exes_index; /* doesn't */\n"; 0625 print $out " gint pounds_index; /* have any */\n"; 0626 print $out " gint colons_index;\n"; 0627 print $out "};\n\n"; 0628 0629 print $out "static const UnicharString names_list_equals[] = \n"; 0630 print $out "{\n"; 0631 for $wc (sort {$a <=> $b} keys %{$nameslist_hash}) 0632 { 0633 next if not exists $nameslist_hash->{$wc}->{'='}; 0634 for my $value (@{$nameslist_hash->{$wc}->{'='}->{'values'}}) { 0635 printf $out (qq/ { 0x%04X, "\%s" },\n/, $wc, $value); 0636 } 0637 } 0638 print $out " { (gunichar)(-1), 0 }\n"; 0639 print $out "};\n\n"; 0640 0641 print $out "static const UnicharString names_list_stars[] = \n"; 0642 print $out "{\n"; 0643 for $wc (sort {$a <=> $b} keys %{$nameslist_hash}) 0644 { 0645 next if not exists $nameslist_hash->{$wc}->{'*'}; 0646 for my $value (@{$nameslist_hash->{$wc}->{'*'}->{'values'}}) { 0647 printf $out (qq/ { 0x%04X, "\%s" },\n/, $wc, $value); 0648 } 0649 } 0650 print $out " { (gunichar)(-1), 0 }\n"; 0651 print $out "};\n\n"; 0652 0653 print $out "static const UnicharString names_list_pounds[] = \n"; 0654 print $out "{\n"; 0655 for $wc (sort {$a <=> $b} keys %{$nameslist_hash}) 0656 { 0657 next if not exists $nameslist_hash->{$wc}->{'#'}; 0658 for my $value (@{$nameslist_hash->{$wc}->{'#'}->{'values'}}) { 0659 printf $out (qq/ { 0x%04X, "\%s" },\n/, $wc, $value); 0660 } 0661 } 0662 print $out " { (gunichar)(-1), 0 }\n"; 0663 print $out "};\n\n"; 0664 0665 print $out "static const UnicharUnichar names_list_exes[] = \n"; 0666 print $out "{\n"; 0667 for $wc (sort {$a <=> $b} keys %{$nameslist_hash}) 0668 { 0669 next if not exists $nameslist_hash->{$wc}->{'x'}; 0670 for my $value (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}) { 0671 printf $out (qq/ { 0x%04X, 0x%04X },\n/, $wc, $value); 0672 } 0673 } 0674 print $out " { (gunichar)(-1), 0 }\n"; 0675 print $out "};\n\n"; 0676 0677 print $out "static const UnicharString names_list_colons[] = \n"; 0678 print $out "{\n"; 0679 for $wc (sort {$a <=> $b} keys %{$nameslist_hash}) 0680 { 0681 next if not exists $nameslist_hash->{$wc}->{':'}; 0682 for my $value (@{$nameslist_hash->{$wc}->{':'}->{'values'}}) { 0683 printf $out (qq/ { 0x%04X, "\%s" },\n/, $wc, $value); 0684 } 0685 } 0686 print $out " { (gunichar)(-1), 0 }\n"; 0687 print $out "};\n\n"; 0688 0689 print $out "static const NamesList names_list[] =\n"; 0690 print $out "{\n"; 0691 for $wc (sort {$a <=> $b} keys %{$nameslist_hash}) 0692 { 0693 my $eq = exists $nameslist_hash->{$wc}->{'='}->{'index'} ? $nameslist_hash->{$wc}->{'='}->{'index'} : -1; 0694 my $star = exists $nameslist_hash->{$wc}->{'*'}->{'index'} ? $nameslist_hash->{$wc}->{'*'}->{'index'} : -1; 0695 my $ex = exists $nameslist_hash->{$wc}->{'x'}->{'index'} ? $nameslist_hash->{$wc}->{'x'}->{'index'} : -1; 0696 my $pound = exists $nameslist_hash->{$wc}->{'#'}->{'index'} ? $nameslist_hash->{$wc}->{'#'}->{'index'} : -1; 0697 my $colon = exists $nameslist_hash->{$wc}->{':'}->{'index'} ? $nameslist_hash->{$wc}->{':'}->{'index'} : -1; 0698 0699 printf $out (" { 0x%04X, \%d, \%d, \%d, \%d, \%d },\n", $wc, $eq, $star, $ex, $pound, $colon); 0700 } 0701 print $out "};\n\n"; 0702 0703 0704 close ($out); 0705 0706 print " done.\n"; 0707 } 0708 0709 #------------------------# 0710 0711 sub process_blocks_txt ($) 0712 { 0713 my ($blocks_txt) = @_; 0714 0715 open (my $blocks, $blocks_txt) or die; 0716 open (my $out, "> UnicodeBlocks.h") or die; 0717 0718 print "processing $blocks_txt..."; 0719 0720 print $out "/* UnicodeBlocks.h */\n"; 0721 print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n"; 0722 print $out "/* Generated by $0 */\n"; 0723 print $out "/* Generated from UCD version $v */\n\n"; 0724 0725 print $out "#pragma once\n\n"; 0726 0727 print $out "#include <qglobal.h>\n"; 0728 print $out "#include <KLocalizedString>\n\n"; 0729 0730 print $out "struct TUnicodeBlock\n"; 0731 print $out "{\n"; 0732 print $out " quint32 start,\n"; 0733 print $out " end;\n"; 0734 print $out " const char *blockName;\n"; 0735 print $out "};\n\n"; 0736 print $out "static const struct TUnicodeBlock constUnicodeBlocks[] =\n"; 0737 print $out "{\n"; 0738 while (my $line = <$blocks>) 0739 { 0740 $line =~ /^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$/ or next; 0741 print $out qq/ { 0x$1, 0x$2, kli18n("$3") },\n/; 0742 } 0743 print $out " { 0x0, 0x0, NULL }\n"; 0744 print $out "};\n\n"; 0745 0746 0747 close ($blocks); 0748 close ($out); 0749 0750 print " done.\n"; 0751 } 0752 0753 #------------------------# 0754 0755 sub process_scripts_txt ($) 0756 { 0757 my ($scripts_txt) = @_; 0758 0759 my %script_hash; 0760 my %scripts; 0761 0762 open (my $scripts, $scripts_txt) or die; 0763 open (my $out, "> UnicodeScripts.h") or die; 0764 0765 print "processing $scripts_txt..."; 0766 0767 while (my $line = <$scripts>) 0768 { 0769 my ($start, $end, $raw_script); 0770 0771 if ($line =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\S+)/) 0772 { 0773 $start = hex ($1); 0774 $end = hex ($2); 0775 $raw_script = $3; 0776 } 0777 elsif ($line =~ /^([0-9A-F]+)\s+;\s+(\S+)/) 0778 { 0779 $start = hex ($1); 0780 $end = $start; 0781 $raw_script = $2; 0782 } 0783 else 0784 { 0785 next; 0786 } 0787 0788 my $script = $raw_script; 0789 $script =~ tr/_/ /; 0790 $script =~ s/(\w+)/\u\L$1/g; 0791 0792 $script_hash{$start} = { 'end' => $end, 'script' => $script }; 0793 $scripts{$script} = 1; 0794 } 0795 0796 close ($scripts); 0797 0798 # Adds Common to make sure works with UCD <= 4.0.0 0799 $scripts{"Common"} = 1; 0800 0801 print $out "/* UnicodeScripts.h */\n"; 0802 print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n"; 0803 print $out "/* Generated by $0 */\n"; 0804 print $out "/* Generated from UCD version $v */\n\n"; 0805 0806 print $out "#pragma once\n\n"; 0807 0808 print $out "#include <QtGlobal>\n"; 0809 print $out "#include <KLocalizedString>\n\n"; 0810 0811 print $out "static const char * const constUnicodeScriptList[] =\n"; 0812 print $out "{\n"; 0813 my $i = 0; 0814 for my $script (sort keys %scripts) 0815 { 0816 $scripts{$script} = $i; 0817 print $out qq/ kli18n("$script"),\n/; 0818 $i++; 0819 } 0820 print $out " NULL\n"; 0821 print $out "};\n\n"; 0822 0823 print $out "struct TUnicodeScript\n"; 0824 print $out "{\n"; 0825 print $out " quint32 start,\n"; 0826 print $out " end;\n"; 0827 print $out " int scriptIndex; /* index into constUnicodeScriptList */\n"; 0828 print $out "};\n\n"; 0829 print $out "static const TUnicodeScript constUnicodeScripts[] =\n"; 0830 print $out "{\n"; 0831 for my $start (sort { $a <=> $b } keys %script_hash) 0832 { 0833 printf $out (qq/ { 0x%04X, 0x%04X, \%2d },\n/, 0834 $start, $script_hash{$start}->{'end'}, $scripts{$script_hash{$start}->{'script'}}); 0835 } 0836 printf $out " { 0x0, 0x0, -1 }\n"; 0837 print $out "};\n\n"; 0838 0839 0840 close ($out); 0841 print " done.\n"; 0842 }