File indexing completed on 2024-05-12 04:02:21

0001 #!/usr/bin/env python3
0002 
0003 """
0004 Copyright (c) 2022 Rafał Lalik <rafallalik@gmail.com>
0005 
0006 Permission is hereby granted, free of charge, to any person obtaining a copy
0007 of this software and associated documentation files (the "Software"), to deal
0008 in the Software without restriction, including without limitation the rights
0009 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
0010 copies of the Software, and to permit persons to whom the Software is
0011 furnished to do so, subject to the following conditions:
0012 
0013 The above copyright notice and this permission notice shall be included in all
0014 copies or substantial portions of the Software.
0015 
0016 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0017 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0018 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
0019 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
0020 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
0021 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0022 SOFTWARE.
0023 """
0024 
0025 import argparse
0026 import xml.etree.ElementTree as ET
0027 import pygments.lexers as pyglex
0028 import re
0029 import yaml
0030 from itertools import combinations
0031 from colorama import Fore, Back, Style
0032 #from fuzzywuzzy import fuzz
0033 
def main():
    """Match KDE syntax-definition languages against Pygments lexers.

    Reads mapping and blacklist settings from ``config.yml``, parses every
    KDE syntax XML file passed on the command line, pairs each KDE language
    with a Pygments lexer (the explicit config mapping wins, then a
    case-insensitive name match) and finally writes the generated KDE
    highlighting XML via generate_output().
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='output file', type=str, default="lexers_found.xml")
    parser.add_argument('-v', '--verbose', help='verbose mode', action='store_true', default=False)
    # parse_known_args(): the unknown positionals are the KDE syntax XML files
    opts, args = parser.parse_known_args()
    if opts.verbose:
        print("Options: ", opts, args)

    with open('config.yml', 'r') as file:
        config = yaml.safe_load(file)
    print(config)

    # Each entry is a tuple (name, aliases, filenames, mimetypes).
    pygment_lexers = list(pyglex.get_all_lexers())

    kde_langs = []

    for xmlfile in args:
        # latex.xml is the file this output is generated for - skip self-match
        if xmlfile == 'latex.xml':
            continue
        rc = parseXML(xmlfile)
        if rc is not None:
            print(f"Found '{rc}' language syntax file")
            kde_langs.append(rc)

    if opts.verbose:
        print("***\nKDE languages: ", kde_langs)
        print("***\nPygments lexers: ", pygment_lexers, f"total = {len(pygment_lexers)}")
        print("\n***\nSearch for KDE lexer")

    matched_lexers = []
    minted_used_lexers = set()  # lexer aliases already claimed by a language
    for kde_lang in kde_langs:
        # 1) explicit KDE -> minted mapping from config.yml has priority
        if kde_lang in config['minted_mapping']:
            val_of_minted_mapping = config['minted_mapping'][kde_lang]
            if opts.verbose:
                print(Fore.YELLOW + f"+ Mapped KDE lang '{kde_lang}' to minted lang '{val_of_minted_mapping}'" + Style.RESET_ALL)

            lexers = list(filter(lambda x: x[0] == val_of_minted_mapping, pygment_lexers))
            if len(lexers):
                selected_lexers = filter_minted_lexers(lexers[0], config['minted_blacklisted'], minted_used_lexers)

                matched_lexers.append([kde_lang, val_of_minted_mapping, selected_lexers])
            else:
                if opts.verbose:
                    print(Fore.RED + f"No lexers from {pygment_lexers} and {kde_lang}" + Style.RESET_ALL)
            continue

        # 2) otherwise try a case-insensitive match on the Pygments lexer name
        key_of_minted_matching = list(filter(lambda x: x[0].casefold() == kde_lang.casefold(), pygment_lexers))
        if len(key_of_minted_matching):
            if opts.verbose:
                print(Fore.GREEN + f"+ Matched KDE lang '{kde_lang}' to minted lang '{key_of_minted_matching[0][0]}'" + Style.RESET_ALL)

            selected_lexers = filter_minted_lexers(key_of_minted_matching[0], config['minted_blacklisted'], minted_used_lexers)

            matched_lexers.append([kde_lang, key_of_minted_matching[0][0], selected_lexers])
            continue

        if opts.verbose:
            print(Fore.RED + f"- Lexer for KDE lang '{kde_lang}' not found" + Style.RESET_ALL)

    generate_output(matched_lexers, opts.output)
0106 
0107 
def generate_output(lexers, filename):
    """Write the KDE latex.xml highlighting fragments for the matched lexers.

    Parameters:
        lexers: list of ``[code, lang, names]`` triples as built by main(),
            where ``code`` is the KDE language name used in context names,
            ``lang`` the matched Pygments lexer name and ``names`` the
            sanitized lexer aliases.
        filename: output path for the generated XML fragment.
    """
    print("\n***\nGenerate outputs")

    with open(filename, "w") as f:
        # <list> of all recognized <name>code / <name>code* environment names
        f.write('    <list name="MintedCodeLang">\n')
        last_lang = None
        for code, lang, names in lexers:
            # BUG FIX: last_lang was never updated and was compared with
            # 'is not', so the comment was emitted for every entry; now it is
            # written once per run of entries sharing the same lexer.
            if last_lang != lang:
                f.write(f"      <!-- {code} lexers -->\n")
                last_lang = lang
            for name in names:
                f.write(f"      <item>{name}code*</item>\n")
                f.write(f"      <item>{name}code</item>\n")
        f.write('    </list>\n')

        mintenv_list = []
        f.write('\n\n\n\n\n')
        f.write('      <!-- environment type 5: minted environment created with newminted -->\n')
        f.write('      <context name="MintedCodeEnv" attribute="Environment" lineEndContext="#stay" fallthroughContext="#pop#pop#pop#pop">\n')
        for code, lang, names in lexers:
            for name in names:
                f.write(f'        <WordDetect String="{name}code*" attribute="Environment" context="Highlighting{code}CodeEnvS"/>\n')
                f.write(f'        <WordDetect String="{name}code" attribute="Environment" context="Highlighting{code}CodeEnv"/>\n')
                # BUG FIX: this append sat outside the inner loop, so only the
                # last alias of each language reached HighlightingSelector (and
                # it raised NameError when the first alias list was empty).
                mintenv_list.append(f'        <WordDetect String="{name}" insensitive="true" context="Highlighting{code}CodeEnv"/>\n')

        # raw strings: the regex escapes (\*, \s, \{) are not valid Python
        # string escapes and triggered deprecation warnings before
        f.write(r'''        <RegExpr String=".+code\*" attribute="Environment" context="UnknownHighlightingCodeEnvS"/>
        <RegExpr String=".+code" attribute="Environment" context="UnknownHighlightingCodeEnv"/>
      </context>''' + '\n\n')

        f.write('      <context name="HighlightingSelector" attribute="Normal Text" lineEndContext="#stay">\n')
        f.write(''.join(mintenv_list))
        f.write('''        <AnyChar String="}]" context="#pop!UnknownHighlightingBegin"/>
        <RegExpr String="[^]}]*" context="#stay"/>
      </context>
''')
        f.write('\n')
        f.write(r'''      <context name="HighlightingCommon" attribute="Normal Text" lineEndContext="#stay">
        <RegExpr String="\\end\s*\{(?:lstlisting|minted|[a-zA-Z]+code)\*?\}" attribute="Structure" lookAhead="true" context="#pop#pop#pop#pop#pop#pop"/>
      </context>''' + '\n')

        # One context chain per language; {1}/{2} expand to literal braces so
        # they do not clash with the str.format placeholders.
        for code, lang, names in lexers:
            f.write(r'''
      <context name="Highlighting{0}CodeEnvS" attribute="Error" lineEndContext="#stay">
        <DetectSpaces/>
        <DetectChar char="{2}" attribute="Normal Text" context="#pop!Highlighting{0}CodeEnvSParam"/>
      </context>
      <context name="Highlighting{0}CodeEnvSParam" attribute="Error" lineEndContext="#stay">
        <DetectSpaces attribute="Normal Text"/>
        <DetectChar char="{1}" attribute="Normal Text" context="Highlighting{0}CodeEnvSParamInside"/>
        <IncludeRules context="FindComments"/>
      </context>
      <context name="Highlighting{0}CodeEnvSParamInside" attribute="Normal Text" lineEndContext="#stay">
        <DetectSpaces/>
        <DetectIdentifier/>
        <DetectChar char="{2}" attribute="Normal Text" context="#pop!HighlightingBegin{0}"/>
        <IncludeRules context="FindComments"/>
        <RegExpr String="\&envname;" attribute="Macro" context="#stay"/>
      </context>
      <context name="Highlighting{0}CodeEnv" attribute="Normal Text" lineEndContext="#stay">
        <DetectChar char="{2}" context="HighlightingBegin{0}"/>
        <RegExpr String="[^{2}]*" attribute="Normal Text" context="#stay"/>
      </context>
      <context name="HighlightingBegin{0}" attribute="Normal Text" lineEndContext="#pop!Highlighting{0}">
        <DetectSpaces/>
        <RegExpr String=".+" attribute="Error" context="#stay"/>
      </context>
      <context name="Highlighting{0}" attribute="Normal Text" lineEndContext="#stay">
        <IncludeRules context="HighlightingCommon"/>
        <IncludeRules context="##{0}" includeAttrib="true"/>
      </context>
'''.format(code, '{', '}'))

        f.write('      <!-- end of mintedcode environment -->\n')
0187 
0188 
def filter_minted_lexers(lexers_list, blacklist, used_lexers):
    """Loop over lexers, filter out blacklisted and simplify names, pick up uniques.

    Parameters:
        lexers_list: one Pygments lexer tuple ``(name, aliases, ...)``; only
            the alias list at index 1 is consumed.
        blacklist: iterable of raw alias names to skip entirely.
        used_lexers: set of sanitized names already claimed by other
            languages; updated in place with the names selected here.

    Returns the sorted list of new, sanitized, unique lexer names.
    """
    selected = set()

    for alias in lexers_list[1]:
        if alias in blacklist:
            # BUG FIX: the message used to print alias[1] (the second
            # character of the alias) instead of the alias itself.
            print(f" Ignore blacklisted '{alias}' lexer")
            continue

        # Sanitize the alias into a pure [a-z]+ name usable in environment
        # names: "c++" -> "cpp", "c#" -> "csharp", then strip every other
        # non-lowercase character (digits, dots, dashes, ...).
        new_name = re.sub(r"\+\+", "pp", alias)
        new_name = re.sub(r"\#", "sharp", new_name)
        new_name = re.sub(r"[^a-z]", "", new_name)

        if new_name != alias:
            print(f" {alias} => {new_name}" + Fore.YELLOW + " - Replaced" + Style.RESET_ALL)

        # BUG FIX: an alias with no lowercase letters sanitized to "" and was
        # silently accepted (the old ^[^a-z]+|[^a-z]+$ check could never match
        # a string that only contains [a-z]); drop empty names instead.
        if not new_name:
            print(f" {alias} => {new_name}" + Fore.RED + " - Removed" + Style.RESET_ALL)
            continue

        if new_name in used_lexers:
            print(f" {alias} => {new_name}" + Fore.RED  + " - Ignored" + Style.RESET_ALL)
            continue

        print(f" {alias} => {new_name}" + Fore.GREEN + " - Added" + Style.RESET_ALL)
        selected.add(new_name)

    used_lexers.update(selected)    # update set of all lexers
    return sorted(selected)
0226 
0227 
def search_lexer(langname, pygment_lexers):
    """Find the Pygments lexer whose alias list contains *langname*.

    The comparison is case-insensitive. Returns the lexer's display name
    (first tuple element) or None when no lexer matches.
    """
    wanted = langname.lower()
    for lexer in pygment_lexers:
        if any(alias.lower() == wanted for alias in lexer[1]):
            return lexer[0]
    return None
0235 
0236 
def split_lexers(lexers):
    """Get pygments language with lexers and create all lexers list.

    Flattens ``[(name, aliases, ...), ...]`` into a flat list of
    ``[alias, name, False]`` triples, one entry per alias.
    """
    return [[alias, entry[0], False]
            for entry in lexers
            for alias in entry[1]]
0245 
0246 
def parseXML(xmlfile, verbose=False):
    """Return the language name declared in a KDE syntax-definition file.

    Parameters:
        xmlfile: path of the XML file to inspect.
        verbose: when True, report languages lacking a <highlighting> section.
            (New keyword with a safe default; the original referenced the
            undefined global ``opts`` here and raised NameError.)

    Returns the <language name="..."> attribute value, or None when the root
    element is not <language> or it has no <highlighting> child.
    """
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    if root.tag != 'language':
        return None

    langname = root.attrib['name']
    if root.find('highlighting') is None:
        if verbose:
            # typo fixed: "highlightng" -> "highlighting"
            print(f"Language {langname} has no highlighting")
        return None

    return langname
0260 
# Run the converter only when executed as a script, not on import.
if __name__ == '__main__':
    main()