addons/latexunicodecompletion/fetch_unicode_table.py

0001 #!/usr/bin/env python3
0002
0003 # SPDX-FileCopyrightText: 2021 Ilia Kats <ilia-kats@gmx.net>
0004 # SPDX-License-Identifier: LGPL-2.0-or-later
0005
0006
# Page that is downloaded and parsed to build the completion table.
JULIA_UNICODE_DOCUMENTATION_URL = "https://docs.julialang.org/en/v1/manual/unicode-input/"
# Value of the HTML "id" attribute the parser waits for before it starts
# looking for the <table> element.
CONTAINER_ID = "documenter-page"
# Name of the generated C++ header; a relative path, so it is created in the
# current working directory.
OUTFNAME = "completiontable.h"
0010
0011 from urllib import request
0012 from html.parser import HTMLParser
0013 from string import ascii_letters, digits
0014
class JuliaUnicodeCompletionsParser(HTMLParser):
    """Collect the rows of the completion table from the fetched HTML page.

    Parsing starts once a start tag whose ``id`` attribute equals
    ``CONTAINER_ID`` has been seen; the next ``<table>`` after that is the one
    that gets read. Its first ``<tr>`` is treated as the header row and is
    skipped. Every following row contributes one tuple of cell texts to
    ``self.table`` per comma-separated entry in its third cell, with that
    third element replaced by the single, stripped entry. Everything after
    the closing ``</table>`` is ignored.
    """

    def __init__(self):
        super().__init__()
        # Result: one tuple of cell texts per completion sequence.
        self.table = []
        self._in_container = False
        self._in_table = False
        self._in_header = False
        self._in_body = False
        self._in_cell = False
        self._finished = False

        # Cell texts of the row currently being read.
        self._current_row = None
        # Text fragments of the <td> currently being read.
        self._current_cell = None

    def _close_cell(self):
        """Finish the open ``<td>``, if any, and store its text in the row."""
        if self._in_cell:
            # A cell may arrive as several text fragments (nested markup) or
            # as none at all (empty cell); both must yield exactly one column.
            self._current_row.append("".join(self._current_cell))
            self._in_cell = False
            self._current_cell = None

    def handle_starttag(self, tag, attrs):
        """Track entry into the container, the table, its rows and cells."""
        if self._finished:
            return
        if not self._in_container:
            if ("id", CONTAINER_ID) in attrs:
                self._in_container = True
        elif not self._in_table:
            if tag == "table":
                self._in_table = True
        elif tag == "tr":
            if not self._in_header and not self._in_body:
                # The very first row of the table is the header.
                self._in_header = True
            else:
                self._in_body = True
                self._current_row = []
        elif tag == "td" and self._in_body:
            # HTML allows omitting </td>; a new cell implicitly ends the
            # previous one.
            self._close_cell()
            self._in_cell = True
            self._current_cell = []

    def handle_data(self, data):
        """Record text, but only while inside a ``<td>`` of a body row."""
        if self._finished or not self._in_cell:
            # Text outside of cells (e.g. whitespace between tags) is not a
            # column and must not shift the column indices.
            return
        self._current_cell.append(data)

    def handle_endtag(self, tag):
        """Close cells and rows, and stop parsing at the end of the table."""
        if self._finished or not self._in_body:
            return
        if tag == "td":
            self._close_cell()
        elif tag == "tr":
            self._close_cell()
            row = self._current_row
            self._current_row = []
            if not row:
                # Row without any <td> (e.g. only <th> cells): nothing to add.
                return
            # The third cell may list several completions separated by
            # commas; emit one table entry per completion.
            for completion in row[2].split(","):
                self.table.append((*row[:2], completion.strip(), *row[3:]))
        elif tag == "table":
            self._finished = True
0065
def _c_escape(text):
    """Escape *text* for use inside a double-quoted C++ string literal."""
    return text.replace("\\", "\\\\").replace('"', '\\"')


# Download the documentation page and extract the completion table from it.
parser = JuliaUnicodeCompletionsParser()
with request.urlopen(JULIA_UNICODE_DOCUMENTATION_URL) as page:
    # Without a charset in the Content-Type header get_content_charset()
    # would return None and decode() would raise; fall back to UTF-8.
    charset = page.headers.get_content_charset(failobj="utf-8")
    parser.feed(page.read().decode(charset))
parser.close()

# Order the entries by their completion sequence.
parser.table.sort(key=lambda x: x[2])

# Non-word characters that occur in completion sequences; they have to be
# added to the character class of the regular expression emitted below.
completionchars = set()
wordchars = set(ascii_letters + digits + "_")
with open(OUTFNAME, "w", encoding="utf-8") as out:
    out.write(f"""\
#include <QString>
#include <QRegularExpression>
struct Completion {{
    const char16_t *completion;
    const char16_t *codepoint;
    const char16_t *chars;
    const char16_t *name;
    const uint16_t completion_strlen;
}};

static constexpr uint16_t n_completions = {len(parser.table)};

static constexpr Completion completiontable[] = {{
""")

    for i, completion in enumerate(parser.table):
        # Skip the first character of the sequence (the leading backslash).
        for letter in completion[2][1:]:
            if letter not in wordchars:
                completionchars.add(letter)
        # We need the number of UTF-16 code units, not bytes, thus // 2.
        # "utf-16-le" does not prepend a BOM, so nothing has to be subtracted.
        latexsymlength = len(completion[2].encode("utf-16-le")) // 2
        # Every field ends up inside a C++ string literal, so backslashes and
        # double quotes have to be escaped in all of them.
        latexsym = _c_escape(completion[2])
        codepoint = _c_escape(completion[0])
        chars = _c_escape(completion[1])
        name = _c_escape(completion[3])
        if i > 0:
            out.write(",")
        out.write(f"{{\n    u\"{latexsym}\",\n"
                  f"    u\"{codepoint}\",\n"
                  f"    u\"{chars}\",\n"
                  f"    u\"{name}\",\n"
                  f"    {latexsymlength}\n}}\n")
    out.write("""\
};
""")

    # "-" has to be the last character of the class, otherwise it would be
    # interpreted as a range operator.
    have_dash = "-" in completionchars
    completionchars.discard("-")
    # Characters that would terminate or alter the class get a regex-level
    # backslash. Sorting makes the generated header identical between runs
    # (set iteration order of strings is not stable across interpreter runs).
    charclass = "".join(
        "\\" + char if char in "\\[]" else char
        for char in sorted(completionchars)
    )
    if have_dash:
        charclass += "-"
    # The pattern itself is written into a C++ string literal, so the
    # regex-level backslashes (and any double quote) need a second, C++-level
    # round of escaping.
    charclass = _c_escape(charclass)

    out.write(f'static const QRegularExpression latexexpr(QStringLiteral("\\\\\\\\:?[\\\\w{charclass}]+:?$"), QRegularExpression::DontCaptureOption);\n')
0122     out.write(f'static const QRegularExpression latexexpr(QStringLiteral("\\\\\\\\:?[\\\\w{charclass}]+:?$"), QRegularExpression::DontCaptureOption);\n')