File indexing completed on 2024-04-28 11:48:59

0001 /*  This file is part of the KDE libraries
0002     SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
0003     SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>
0004     SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org>
0005 
0006     SPDX-License-Identifier: LGPL-2.0-or-later
0007 */
0008 
0009 #include <QCoreApplication>
0010 #include <QDataStream>
0011 #include <QFile>
0012 #include <QLocale>
0013 #include <QStandardPaths>
0014 
0015 #include "core_debug.h"
0016 #include "guesslanguage.h"
0017 #include "loader_p.h"
0018 #include "speller.h"
0019 #include "spellerplugin_p.h"
0020 #include "tokenizer_p.h"
0021 
0022 /*
0023 All language tags should be valid according to IETF BCP 47, as codified in RFC 4646.
0024 ISO 639-1 codes should be used for the language part except for cases where there
0025 exists no code, then 639-3 codes should be used. Country codes should only be used
0026 in special cases. Scripts can be differentiated by IANA subtags, available here:
0027 http://www.iana.org/assignments/language-subtag-registry
0028 The script tags correspond to ISO 15924
0029 
0030 An overview of the best practices concerning language tagging is available here:
0031 http://www.w3.org/International/articles/language-tags/Overview.en.php
0032 
0033 lang tags should use underscores (_) rather than hyphens (-) to separate subsections.
0034 
0035 EXCEPTIONS:
For cases of known differences from the above tagging scheme and major
spellcheckers such as aspell/hunspell/myspell, the scheme used by the spell checkers
shall be used. All exceptions shall be noted here:
0039 
0040 BCP        SPELLCHECK
0041 az-Latn    az
0042 
0043 */
0044 
0045 namespace Sonnet
0046 {
0047 class GuessLanguagePrivate
0048 {
0049 public:
0050     GuessLanguagePrivate();
0051     //            language       trigram  score
0052     static QHash<QString, QHash<QString, int>> s_knownModels;
0053 
0054     void loadModels();
0055     QList<QChar::Script> findRuns(const QString &text);
0056     QVector<QString> createOrderedModel(const QString &content);
0057     int distance(const QVector<QString> &model, const QHash<QString, int> &knownModel);
0058     QStringList guessFromTrigrams(const QString &sample, const QStringList &langs);
0059     QStringList identify(const QString &sample, const QList<QChar::Script> &scripts);
0060     QString guessFromDictionaries(const QString &sentence, const QStringList &candidates);
0061 
0062     static QSet<QString> s_knownDictionaries;
0063     static QMultiHash<QChar::Script, QString> s_scriptLanguages;
0064     static QMap<QString, QString> s_dictionaryNameMap;
0065 
0066     const int MIN_LENGTH;
0067     int m_maxItems;
0068     double m_minConfidence;
0069 };
0070 
// Out-of-class definitions of the static tables declared in GuessLanguagePrivate.
// They start out empty: the GuessLanguagePrivate constructor fills the dictionary
// and script tables, loadModels() fills s_knownModels.
QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels;
QSet<QString> GuessLanguagePrivate::s_knownDictionaries;
QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages;
QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap;
0075 
0076 QStringList getNames(QLocale::Script script)
0077 {
0078     QStringList locales;
0079     const auto matchingLocales = QLocale::matchingLocales(QLocale::AnyLanguage, script, QLocale::AnyCountry);
0080     locales.reserve(matchingLocales.size());
0081     for (const QLocale &locale : matchingLocales) {
0082         locales << locale.name();
0083     }
0084     return locales;
0085 }
0086 
0087 GuessLanguagePrivate::GuessLanguagePrivate()
0088     : MIN_LENGTH(5)
0089     , m_maxItems(1)
0090     , m_minConfidence(0)
0091 {
0092     if (!s_scriptLanguages.isEmpty()) {
0093         return;
0094     }
0095 
0096     const QStringList languages = Loader::openLoader()->languages();
0097     s_knownDictionaries = QSet<QString>(languages.begin(), languages.end());
0098     QSet<QString> dictionaryLanguages;
0099     for (const QString &dictName : std::as_const(s_knownDictionaries)) {
0100         QString languageName = QLocale(dictName).name();
0101         if (languageName.isEmpty()) {
0102             qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName;
0103             continue;
0104         }
0105         dictionaryLanguages.insert(languageName);
0106     }
0107 
0108     QSet<QString> allLanguages;
0109     for (int i = 0; i < int(QChar::ScriptCount); i++) {
0110         QChar::Script script = static_cast<QChar::Script>(i);
0111         QStringList names;
0112         switch (script) {
0113         case QChar::Script_Latin:
0114             names = getNames(QLocale::LatinScript);
0115             break;
0116         case QChar::Script_Greek:
0117             names = getNames(QLocale::GreekScript);
0118             break;
0119         case QChar::Script_Cyrillic:
0120             names = getNames(QLocale::CyrillicScript);
0121             break;
0122         case QChar::Script_Armenian:
0123             names = getNames(QLocale::ArmenianScript);
0124             break;
0125         case QChar::Script_Hebrew:
0126             names = getNames(QLocale::HebrewScript);
0127             break;
0128         case QChar::Script_Arabic:
0129             names = getNames(QLocale::ArabicScript);
0130             break;
0131         case QChar::Script_Syriac:
0132             names = getNames(QLocale::SyriacScript);
0133             break;
0134         case QChar::Script_Thaana:
0135             names = getNames(QLocale::ThaanaScript);
0136             break;
0137         case QChar::Script_Devanagari:
0138             names = getNames(QLocale::DevanagariScript);
0139             break;
0140         case QChar::Script_Bengali:
0141             names = getNames(QLocale::BengaliScript);
0142             break;
0143         case QChar::Script_Gurmukhi:
0144             names = getNames(QLocale::GurmukhiScript);
0145             break;
0146         case QChar::Script_Gujarati:
0147             names = getNames(QLocale::GujaratiScript);
0148             break;
0149         case QChar::Script_Oriya:
0150             names = getNames(QLocale::OriyaScript);
0151             break;
0152         case QChar::Script_Tamil:
0153             names = getNames(QLocale::TamilScript);
0154             break;
0155         case QChar::Script_Telugu:
0156             names = getNames(QLocale::TeluguScript);
0157             break;
0158         case QChar::Script_Kannada:
0159             names = getNames(QLocale::KannadaScript);
0160             break;
0161         case QChar::Script_Malayalam:
0162             names = getNames(QLocale::MalayalamScript);
0163             break;
0164         case QChar::Script_Sinhala:
0165             names = getNames(QLocale::SinhalaScript);
0166             break;
0167         case QChar::Script_Thai:
0168             names = getNames(QLocale::ThaiScript);
0169             break;
0170         case QChar::Script_Lao:
0171             names = getNames(QLocale::LaoScript);
0172             break;
0173         case QChar::Script_Tibetan:
0174             names = getNames(QLocale::TibetanScript);
0175             break;
0176         case QChar::Script_Myanmar:
0177             names = getNames(QLocale::MyanmarScript);
0178             break;
0179         case QChar::Script_Georgian:
0180             names = getNames(QLocale::GeorgianScript);
0181             break;
0182         case QChar::Script_Hangul:
0183             names = getNames(QLocale::HangulScript);
0184             break;
0185         case QChar::Script_Ethiopic:
0186             names = getNames(QLocale::EthiopicScript);
0187             break;
0188         case QChar::Script_Cherokee:
0189             names = getNames(QLocale::CherokeeScript);
0190             break;
0191         case QChar::Script_CanadianAboriginal:
0192             names = getNames(QLocale::CanadianAboriginalScript);
0193             break;
0194         case QChar::Script_Ogham:
0195             names = getNames(QLocale::OghamScript);
0196             break;
0197         case QChar::Script_Runic:
0198             names = getNames(QLocale::RunicScript);
0199             break;
0200         case QChar::Script_Khmer:
0201             names = getNames(QLocale::KhmerScript);
0202             break;
0203         case QChar::Script_Mongolian:
0204             names = getNames(QLocale::MongolianScript);
0205             break;
0206         case QChar::Script_Hiragana:
0207             names = getNames(QLocale::HiraganaScript);
0208             break;
0209         case QChar::Script_Katakana:
0210             names = getNames(QLocale::KatakanaScript);
0211             break;
0212         case QChar::Script_Bopomofo:
0213             names = getNames(QLocale::BopomofoScript);
0214             break;
0215         case QChar::Script_Han:
0216             names = getNames(QLocale::HanScript);
0217             break;
0218         case QChar::Script_Yi:
0219             names = getNames(QLocale::YiScript);
0220             break;
0221         case QChar::Script_OldItalic:
0222             names = getNames(QLocale::OldItalicScript);
0223             break;
0224         case QChar::Script_Gothic:
0225             names = getNames(QLocale::GothicScript);
0226             break;
0227         case QChar::Script_Deseret:
0228             names = getNames(QLocale::DeseretScript);
0229             break;
0230         case QChar::Script_Tagalog:
0231             names = getNames(QLocale::TagalogScript);
0232             break;
0233         case QChar::Script_Hanunoo:
0234             names = getNames(QLocale::HanunooScript);
0235             break;
0236         case QChar::Script_Buhid:
0237             names = getNames(QLocale::BuhidScript);
0238             break;
0239         case QChar::Script_Tagbanwa:
0240             names = getNames(QLocale::TagbanwaScript);
0241             break;
0242         case QChar::Script_Coptic:
0243             names = getNames(QLocale::CopticScript);
0244             break;
0245         case QChar::Script_Limbu:
0246             names = getNames(QLocale::LimbuScript);
0247             break;
0248         case QChar::Script_TaiLe:
0249             names = getNames(QLocale::TaiLeScript);
0250             break;
0251         case QChar::Script_LinearB:
0252             names = getNames(QLocale::LinearBScript);
0253             break;
0254         case QChar::Script_Ugaritic:
0255             names = getNames(QLocale::UgariticScript);
0256             break;
0257         case QChar::Script_Shavian:
0258             names = getNames(QLocale::ShavianScript);
0259             break;
0260         case QChar::Script_Osmanya:
0261             names = getNames(QLocale::OsmanyaScript);
0262             break;
0263         case QChar::Script_Cypriot:
0264             names = getNames(QLocale::CypriotScript);
0265             break;
0266         case QChar::Script_Braille:
0267             names = getNames(QLocale::BrailleScript);
0268             break;
0269         case QChar::Script_Buginese:
0270             names = getNames(QLocale::BugineseScript);
0271             break;
0272         case QChar::Script_NewTaiLue:
0273             names = getNames(QLocale::NewTaiLueScript);
0274             break;
0275         case QChar::Script_Glagolitic:
0276             names = getNames(QLocale::GlagoliticScript);
0277             break;
0278         case QChar::Script_Tifinagh:
0279             names = getNames(QLocale::TifinaghScript);
0280             break;
0281         case QChar::Script_SylotiNagri:
0282             names = getNames(QLocale::SylotiNagriScript);
0283             break;
0284         case QChar::Script_OldPersian:
0285             names = getNames(QLocale::OldPersianScript);
0286             break;
0287         case QChar::Script_Kharoshthi:
0288             names = getNames(QLocale::KharoshthiScript);
0289             break;
0290         case QChar::Script_Balinese:
0291             names = getNames(QLocale::BalineseScript);
0292             break;
0293         case QChar::Script_Cuneiform:
0294             names = getNames(QLocale::CuneiformScript);
0295             break;
0296         case QChar::Script_Phoenician:
0297             names = getNames(QLocale::PhoenicianScript);
0298             break;
0299         case QChar::Script_PhagsPa:
0300             names = getNames(QLocale::PhagsPaScript);
0301             break;
0302         case QChar::Script_Nko:
0303             names = getNames(QLocale::NkoScript);
0304             break;
0305         case QChar::Script_Sundanese:
0306             names = getNames(QLocale::SundaneseScript);
0307             break;
0308         case QChar::Script_Lepcha:
0309             names = getNames(QLocale::LepchaScript);
0310             break;
0311         case QChar::Script_OlChiki:
0312             names = getNames(QLocale::OlChikiScript);
0313             break;
0314         case QChar::Script_Vai:
0315             names = getNames(QLocale::VaiScript);
0316             break;
0317         case QChar::Script_Saurashtra:
0318             names = getNames(QLocale::SaurashtraScript);
0319             break;
0320         case QChar::Script_KayahLi:
0321             names = getNames(QLocale::KayahLiScript);
0322             break;
0323         case QChar::Script_Rejang:
0324             names = getNames(QLocale::RejangScript);
0325             break;
0326         case QChar::Script_Lycian:
0327             names = getNames(QLocale::LycianScript);
0328             break;
0329         case QChar::Script_Carian:
0330             names = getNames(QLocale::CarianScript);
0331             break;
0332         case QChar::Script_Lydian:
0333             names = getNames(QLocale::LydianScript);
0334             break;
0335         case QChar::Script_Cham:
0336             names = getNames(QLocale::ChamScript);
0337             break;
0338         case QChar::Script_TaiTham:
0339             names = getNames(QLocale::LannaScript);
0340             break;
0341         case QChar::Script_TaiViet:
0342             names = getNames(QLocale::TaiVietScript);
0343             break;
0344         case QChar::Script_Avestan:
0345             names = getNames(QLocale::AvestanScript);
0346             break;
0347         case QChar::Script_EgyptianHieroglyphs:
0348             names = getNames(QLocale::EgyptianHieroglyphsScript);
0349             break;
0350         case QChar::Script_Samaritan:
0351             names = getNames(QLocale::SamaritanScript);
0352             break;
0353         case QChar::Script_Lisu:
0354             names = getNames(QLocale::FraserScript);
0355             break;
0356         case QChar::Script_Bamum:
0357             names = getNames(QLocale::BamumScript);
0358             break;
0359         case QChar::Script_Javanese:
0360             names = getNames(QLocale::JavaneseScript);
0361             break;
0362         case QChar::Script_MeeteiMayek:
0363             names = getNames(QLocale::MeiteiMayekScript);
0364             break;
0365         case QChar::Script_ImperialAramaic:
0366             names = getNames(QLocale::ImperialAramaicScript);
0367             break;
0368         case QChar::Script_OldSouthArabian:
0369             names = getNames(QLocale::OldSouthArabianScript);
0370             break;
0371         case QChar::Script_InscriptionalParthian:
0372             names = getNames(QLocale::InscriptionalParthianScript);
0373             break;
0374         case QChar::Script_InscriptionalPahlavi:
0375             names = getNames(QLocale::InscriptionalPahlaviScript);
0376             break;
0377         case QChar::Script_Kaithi:
0378             names = getNames(QLocale::KaithiScript);
0379             break;
0380         case QChar::Script_Batak:
0381             names = getNames(QLocale::BatakScript);
0382             break;
0383         case QChar::Script_Brahmi:
0384             names = getNames(QLocale::BrahmiScript);
0385             break;
0386         case QChar::Script_Mandaic:
0387             names = getNames(QLocale::MandaeanScript);
0388             break;
0389         case QChar::Script_Chakma:
0390             names = getNames(QLocale::ChakmaScript);
0391             break;
0392         case QChar::Script_MeroiticCursive:
0393         case QChar::Script_MeroiticHieroglyphs:
0394             names = getNames(QLocale::MeroiticCursiveScript);
0395             names.append(getNames(QLocale::MeroiticScript));
0396             break;
0397         case QChar::Script_Miao:
0398             names = getNames(QLocale::PollardPhoneticScript);
0399             break;
0400         case QChar::Script_Sharada:
0401             names = getNames(QLocale::SharadaScript);
0402             break;
0403         case QChar::Script_SoraSompeng:
0404             names = getNames(QLocale::SoraSompengScript);
0405             break;
0406         case QChar::Script_Takri:
0407             names = getNames(QLocale::TakriScript);
0408             break;
0409         case QChar::Script_CaucasianAlbanian:
0410             names = getNames(QLocale::CaucasianAlbanianScript);
0411             break;
0412         case QChar::Script_BassaVah:
0413             names = getNames(QLocale::BassaVahScript);
0414             break;
0415         case QChar::Script_Duployan:
0416             names = getNames(QLocale::DuployanScript);
0417             break;
0418         case QChar::Script_Elbasan:
0419             names = getNames(QLocale::ElbasanScript);
0420             break;
0421         case QChar::Script_Grantha:
0422             names = getNames(QLocale::GranthaScript);
0423             break;
0424         case QChar::Script_PahawhHmong:
0425             names = getNames(QLocale::PahawhHmongScript);
0426             break;
0427         case QChar::Script_Khojki:
0428             names = getNames(QLocale::KhojkiScript);
0429             break;
0430         case QChar::Script_LinearA:
0431             names = getNames(QLocale::LinearAScript);
0432             break;
0433         case QChar::Script_Mahajani:
0434             names = getNames(QLocale::MahajaniScript);
0435             break;
0436         case QChar::Script_Manichaean:
0437             names = getNames(QLocale::ManichaeanScript);
0438             break;
0439         case QChar::Script_MendeKikakui:
0440             names = getNames(QLocale::MendeKikakuiScript);
0441             break;
0442         case QChar::Script_Modi:
0443             names = getNames(QLocale::ModiScript);
0444             break;
0445         case QChar::Script_Mro:
0446             names = getNames(QLocale::MroScript);
0447             break;
0448         case QChar::Script_OldNorthArabian:
0449             names = getNames(QLocale::OldNorthArabianScript);
0450             break;
0451         case QChar::Script_Nabataean:
0452             names = getNames(QLocale::NabataeanScript);
0453             break;
0454         case QChar::Script_Palmyrene:
0455             names = getNames(QLocale::PalmyreneScript);
0456             break;
0457         case QChar::Script_PauCinHau:
0458             names = getNames(QLocale::PauCinHauScript);
0459             break;
0460         case QChar::Script_OldPermic:
0461             names = getNames(QLocale::OldPermicScript);
0462             break;
0463         case QChar::Script_PsalterPahlavi:
0464             names = getNames(QLocale::PsalterPahlaviScript);
0465             break;
0466         case QChar::Script_Siddham:
0467             names = getNames(QLocale::SiddhamScript);
0468             break;
0469         case QChar::Script_Khudawadi:
0470             names = getNames(QLocale::KhudawadiScript);
0471             break;
0472         case QChar::Script_Tirhuta:
0473             names = getNames(QLocale::TirhutaScript);
0474             break;
0475         case QChar::Script_WarangCiti:
0476             names = getNames(QLocale::VarangKshitiScript);
0477             break;
0478         case QChar::Script_Ahom:
0479             names = getNames(QLocale::AhomScript);
0480             break;
0481         case QChar::Script_AnatolianHieroglyphs:
0482             names = getNames(QLocale::AnatolianHieroglyphsScript);
0483             break;
0484         case QChar::Script_Hatran:
0485             names = getNames(QLocale::HatranScript);
0486             break;
0487         case QChar::Script_Multani:
0488             names = getNames(QLocale::MultaniScript);
0489             break;
0490         case QChar::Script_OldHungarian:
0491             names = getNames(QLocale::OldHungarianScript);
0492             break;
0493         case QChar::Script_Unknown:
0494         case QChar::Script_Inherited:
0495         case QChar::Script_Common:
0496         case QChar::Script_OldTurkic:
0497         case QChar::Script_SignWriting:
0498             break;
0499         default:
0500             qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script;
0501             break;
0502         }
0503         allLanguages.unite(QSet<QString>(names.constBegin(), names.constEnd()));
0504 
0505         { // Remove unknown languages
0506             QStringList pruned;
0507             for (const QString &name : std::as_const(names)) {
0508                 if (!dictionaryLanguages.contains(name)) {
0509                     continue;
0510                 }
0511                 pruned.append(name);
0512             }
0513             names = pruned;
0514         }
0515 
0516         if (names.isEmpty()) {
0517             continue;
0518         }
0519 
0520         for (const QString &name : std::as_const(names)) {
0521             s_scriptLanguages.insert(script, name);
0522         }
0523     }
0524 
0525     // Try to handle some badly named dictionaries
0526     if (!allLanguages.contains(s_knownDictionaries)) {
0527         QSet<QString> dicts(s_knownDictionaries);
0528         dicts.subtract(allLanguages);
0529         for (const QString &dictName : std::as_const(dicts)) {
0530             QString languageName = QLocale(dictName).name();
0531             if (languageName.isEmpty()) {
0532                 qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName;
0533                 continue;
0534             }
0535             s_dictionaryNameMap[languageName] = dictName;
0536             if (std::find(s_scriptLanguages.cbegin(), s_scriptLanguages.cend(), languageName) == s_scriptLanguages.cend()) {
0537                 qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName;
0538             }
0539         }
0540     }
0541 }
0542 
// Creates the private implementation object; the destructor deletes it.
GuessLanguage::GuessLanguage()
    : d(new GuessLanguagePrivate)
{
}
0547 
// Deletes the private implementation object allocated by the constructor.
GuessLanguage::~GuessLanguage()
{
    delete d;
}
0552 
0553 QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const
0554 {
0555     if (text.isEmpty()) {
0556         return QString();
0557     }
0558 
0559     // Filter for available dictionaries
0560     QStringList suggestionsList;
0561     for (const QString &suggestion : suggestionsListIn) {
0562         if (d->s_knownDictionaries.contains(suggestion) && !suggestionsList.contains(suggestion)) {
0563             suggestionsList.append(suggestion);
0564         }
0565     }
0566 
0567     // Load the model on demand
0568     if (d->s_knownModels.isEmpty()) {
0569         d->loadModels();
0570     }
0571 
0572     const QList<QChar::Script> scriptsList = d->findRuns(text);
0573 
0574     QStringList candidateLanguages = d->identify(text, scriptsList);
0575 
0576     // if guessing from trigrams fail
0577     for (const QChar::Script script : scriptsList) {
0578         const auto languagesList = d->s_scriptLanguages.values(script);
0579         for (const QString &lang : languagesList) {
0580             if (!d->s_knownModels.contains(lang)) {
0581                 candidateLanguages.append(lang);
0582             }
0583         }
0584     }
0585 
0586     // Hack for some bad dictionary names
0587     for (int i = 0; i < candidateLanguages.count(); i++) {
0588         if (d->s_dictionaryNameMap.contains(candidateLanguages[i])) {
0589             candidateLanguages[i] = d->s_dictionaryNameMap.value(candidateLanguages[i]);
0590         }
0591     }
0592 
0593     if (candidateLanguages.count() == 1) {
0594         return candidateLanguages.first();
0595     }
0596 
0597     // Wasn't able to get a good guess with the trigrams, try checking all
0598     // dictionaries for the suggested languages.
0599     candidateLanguages.append(suggestionsList);
0600     candidateLanguages.removeDuplicates();
0601     QString identified = d->guessFromDictionaries(text, candidateLanguages);
0602     if (!identified.isEmpty()) {
0603         return identified;
0604     }
0605 
0606     qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text;
0607 
0608     // None of our methods worked, just return the best suggestion
0609     if (!suggestionsList.isEmpty()) {
0610         return suggestionsList.first();
0611     }
0612 
0613     qCDebug(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text;
0614 
0615     // Not even any suggestions, give up
0616     return QString();
0617 }
0618 
// Stores the limits consulted by the trigram matcher (guessFromTrigrams()):
// it keeps collecting guesses only while fewer than maxItems were gathered
// and the accumulated confidence is still below minConfidence.
void GuessLanguage::setLimits(int maxItems, double minConfidence)
{
    d->m_maxItems = maxItems;
    d->m_minConfidence = minConfidence;
}
0624 
0625 void GuessLanguagePrivate::loadModels()
0626 {
0627     // use trigrams from resource file, easy to deploy on all platforms
0628     const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map");
0629     qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile;
0630 
0631     QFile sin(triMapFile);
0632     if (!sin.open(QIODevice::ReadOnly)) {
0633         qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile;
0634         return;
0635     }
0636 
0637     QDataStream in(&sin);
0638     in >> s_knownModels;
0639 
0640     // Sanity check
0641     QSet<QString> availableLanguages;
0642     QHashIterator<QString, QHash<QString, int>> iterator(s_knownModels);
0643     while (iterator.hasNext()) {
0644         iterator.next();
0645         if (iterator.value().count() < MAXGRAMS) {
0646             qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS;
0647         }
0648         availableLanguages.insert(iterator.key());
0649     }
0650     QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd());
0651     knownLanguages.subtract(availableLanguages);
0652     if (!knownLanguages.isEmpty()) {
0653         qCDebug(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages;
0654     }
0655 }
0656 
0657 QList<QChar::Script> GuessLanguagePrivate::findRuns(const QString &text)
0658 {
0659     QHash<QChar::Script, int> scriptCounts;
0660 
0661     int totalCount = 0;
0662 
0663     for (const QChar c : text) {
0664         const QChar::Script script = c.script();
0665 
0666         if (script == QChar::Script_Common || script == QChar::Script_Inherited) {
0667             continue;
0668         }
0669 
0670         if (!c.isLetter()) {
0671             continue;
0672         }
0673 
0674         scriptCounts[script]++;
0675         totalCount++;
0676     }
0677 
0678     QList<QChar::Script> relevantScripts;
0679 
0680     if (totalCount == 0) {
0681         return relevantScripts;
0682     }
0683 
0684     if (scriptCounts.size() == 1) {
0685         return {scriptCounts.cbegin().key()};
0686     }
0687 
0688     for (auto it = scriptCounts.cbegin(); it != scriptCounts.cend(); ++it) {
0689         // return run types that used for 40% or more of the string
0690         const int scriptCount = it.value();
0691         const auto currentScript = it.key();
0692         if (scriptCount * 100 / totalCount >= 40) {
0693             relevantScripts << currentScript;
0694             // always return basic latin if found more than 15%.
0695         } else if (currentScript == QChar::Script_Latin && scriptCount * 100 / totalCount >= 15) {
0696             relevantScripts << currentScript;
0697         }
0698     }
0699 
0700     return relevantScripts;
0701 }
0702 
0703 QStringList GuessLanguagePrivate::identify(const QString &sample, const QList<QChar::Script> &scripts)
0704 {
0705     if (sample.size() < MIN_LENGTH) {
0706         return QStringList();
0707     }
0708 
0709     QStringList guesses;
0710     for (const QChar::Script script : scripts) {
0711         guesses.append(guessFromTrigrams(sample, s_scriptLanguages.values(script)));
0712     }
0713 
0714     return guesses;
0715 }
0716 
0717 QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages)
0718 {
0719     QStringList ret;
0720 
0721     const QVector<QString> sampleTrigrams = createOrderedModel(sample);
0722 
0723     // Sort by score
0724     QMultiMap<int, QString> scores;
0725     for (const QString &language : languages) {
0726         if (s_knownModels.contains(language)) {
0727             scores.insert(distance(sampleTrigrams, s_knownModels[language]), language);
0728         }
0729     }
0730 
0731     // Skip if either no results or best result is completely unknown (distance >= maxdistance)
0732     if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) {
0733         qCDebug(SONNET_LOG_CORE) << "No scores for" << sample;
0734         return ret;
0735     }
0736 
0737     int counter = 0;
0738     double confidence = 0;
0739 
0740 #if QT_VERSION >= QT_VERSION_CHECK(6, 0, 0)
0741     QMultiMapIterator<int, QString> it(scores);
0742 #else
0743     QMapIterator<int, QString> it(scores);
0744 #endif
0745     it.next();
0746 
0747     QString prevItem = it.value();
0748     int prevScore = it.key();
0749 
0750     while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) {
0751         it.next();
0752         counter++;
0753         confidence += (it.key() - prevScore) / (double)it.key();
0754         ret += prevItem;
0755         prevItem = it.value();
0756         prevScore = it.key();
0757     }
0758     if (counter < m_maxItems && confidence < m_minConfidence) {
0759         ret += prevItem;
0760     }
0761 
0762     return ret;
0763 }
0764 
0765 QVector<QString> GuessLanguagePrivate::createOrderedModel(const QString &content)
0766 {
0767     QHash<QString, int> trigramCounts;
0768 
0769     // collect trigrams
0770     trigramCounts.reserve(content.size() - 2);
0771     for (int i = 0; i < (content.size() - 2); ++i) {
0772         QString tri = content.mid(i, 3).toLower();
0773         trigramCounts[tri]++;
0774     }
0775 
0776     // invert the map <freq, trigram>
0777     QVector<QPair<int, QString>> trigramFrequencyList;
0778     trigramFrequencyList.reserve(trigramCounts.size());
0779 
0780     auto it = trigramCounts.constBegin();
0781     for (; it != trigramCounts.constEnd(); ++it) {
0782         const QChar *data = it.key().constData();
0783         bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace()));
0784 
0785         if (!hasTwoSpaces) {
0786             const int freq = it.value();
0787             const QString &trigram = it.key();
0788             trigramFrequencyList.append({freq, trigram});
0789         }
0790     }
0791 
0792     // sort descending by frequency
0793     std::sort(trigramFrequencyList.begin(), trigramFrequencyList.end(), [](const QPair<int, QString> &a, const QPair<int, QString> &b) {
0794         return a.first > b.first;
0795     });
0796 
0797     QVector<QString> orderedTrigrams;
0798     orderedTrigrams.reserve(trigramFrequencyList.size());
0799     for (const auto &tri : std::as_const(trigramFrequencyList)) {
0800         orderedTrigrams.append(tri.second);
0801     }
0802 
0803     return orderedTrigrams;
0804 }
0805 
0806 int GuessLanguagePrivate::distance(const QVector<QString> &model, const QHash<QString, int> &knownModel)
0807 {
0808     int counter = -1;
0809     int dist = 0;
0810 
0811     for (const QString &trigram : model) {
0812         const int val = knownModel.value(trigram, -1);
0813         if (val != -1) {
0814             dist += qAbs(++counter - val);
0815         } else {
0816             dist += MAXGRAMS;
0817         }
0818 
0819         if (counter == (MAXGRAMS - 1)) {
0820             break;
0821         }
0822     }
0823 
0824     return dist;
0825 }
0826 
0827 QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates)
0828 {
0829     // Try to see how many languages we can get spell checking for
0830     QList<QSharedPointer<SpellerPlugin>> spellers;
0831     for (const QString &lang : candidates) {
0832         if (!Loader::openLoader()->languages().contains(lang)) {
0833             qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang;
0834             continue;
0835         }
0836         QSharedPointer<SpellerPlugin> plugin = Loader::openLoader()->cachedSpeller(lang);
0837         if (!plugin.isNull()) {
0838             spellers.append(plugin);
0839         }
0840     }
0841 
0842     // If there's no spell checkers, give up
0843     if (spellers.isEmpty()) {
0844         return QString();
0845     }
0846 
0847     QMap<QString, int> correctHits;
0848 
0849     WordTokenizer tokenizer(sentence);
0850     while (tokenizer.hasNext()) {
0851         Token word = tokenizer.next();
0852         if (!tokenizer.isSpellcheckable()) {
0853             continue;
0854         }
0855 
0856         for (int i = 0; i < spellers.count(); ++i) {
0857             if (spellers[i]->isCorrect(word.toString())) {
0858                 correctHits[spellers[i]->language()]++;
0859             }
0860         }
0861     }
0862 
0863     if (correctHits.isEmpty()) {
0864         return QString();
0865     }
0866 
0867     QMap<QString, int>::const_iterator max = correctHits.constBegin();
0868     for (QMap<QString, int>::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) {
0869         if (itr.value() > max.value()) {
0870             max = itr;
0871         }
0872     }
0873     return max.key();
0874 }
0875 }