File indexing completed on 2024-04-28 15:34:18
0001 /* This file is part of the KDE libraries 0002 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> 0003 SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> 0004 SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org> 0005 0006 SPDX-License-Identifier: LGPL-2.0-or-later 0007 */ 0008 0009 #include <QCoreApplication> 0010 #include <QDataStream> 0011 #include <QFile> 0012 #include <QLocale> 0013 #include <QStandardPaths> 0014 0015 #include "core_debug.h" 0016 #include "guesslanguage.h" 0017 #include "loader_p.h" 0018 #include "speller.h" 0019 #include "spellerplugin_p.h" 0020 #include "tokenizer_p.h" 0021 0022 /* 0023 All language tags should be valid according to IETF BCP 47, as codified in RFC 4646. 0024 ISO 639-1 codes should be used for the language part except for cases where there 0025 exists no code, then 639-3 codes should be used. Country codes should only be used 0026 in special cases. Scripts can be differentiated by IANA subtags, available here: 0027 http://www.iana.org/assignments/language-subtag-registry 0028 The script tags correspond to ISO 15924 0029 0030 An overview of the best practices concerning language tagging is available here: 0031 http://www.w3.org/International/articles/language-tags/Overview.en.php 0032 0033 lang tags should use underscores (_) rather than hyphens (-) to separate subsections. 0034 0035 EXCEPTIONS: 0036 For cases of known differences from the above tagging scheme and major 0037 spellcheckers such aspell/hunspell/myspell, the scheme used by the spell checkers 0038 shall be used. All exception shall be noted here: 0039 0040 BCP SPELLCHECK 0041 az-Latn az 0042 0043 */ 0044 0045 namespace Sonnet 0046 { 0047 class GuessLanguagePrivate 0048 { 0049 public: 0050 GuessLanguagePrivate(); 0051 // language trigram score 0052 static QHash<QString, QHash<QString, int>> s_knownModels; 0053 0054 void loadModels(); 0055 QList<QChar::Script> findRuns(const QString &text); 0056 QVector<QString> createOrderedModel(const QString &content); 0057 int distance(const QVector<QString> &model, const QHash<QString, int> &knownModel); 0058 QStringList guessFromTrigrams(const QString &sample, const QStringList &langs); 0059 QStringList identify(const QString &sample, const QList<QChar::Script> &scripts); 0060 QString guessFromDictionaries(const QString &sentence, const QStringList &candidates); 0061 0062 static QSet<QString> s_knownDictionaries; 0063 static QMultiHash<QChar::Script, QString> s_scriptLanguages; 0064 static QMap<QString, QString> s_dictionaryNameMap; 0065 0066 const int MIN_LENGTH; 0067 int m_maxItems; 0068 double m_minConfidence; 0069 }; 0070 0071 QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels; 0072 QSet<QString> GuessLanguagePrivate::s_knownDictionaries; 0073 QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages; 0074 QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap; 0075 0076 QStringList getNames(QLocale::Script script) 0077 { 0078 QStringList locales; 0079 const auto matchingLocales = QLocale::matchingLocales(QLocale::AnyLanguage, script, QLocale::AnyCountry); 0080 locales.reserve(matchingLocales.size()); 0081 for (const QLocale &locale : matchingLocales) { 0082 locales << locale.name(); 0083 } 0084 return locales; 0085 } 0086 0087 GuessLanguagePrivate::GuessLanguagePrivate() 0088 : MIN_LENGTH(5) 0089 , m_maxItems(1) 0090 , m_minConfidence(0) 0091 { 0092 if (!s_scriptLanguages.isEmpty()) { 0093 return; 0094 } 0095 0096 const QStringList languages = Loader::openLoader()->languages(); 0097 s_knownDictionaries = QSet<QString>(languages.begin(), languages.end()); 0098 QSet<QString> dictionaryLanguages; 0099 for (const QString &dictName : std::as_const(s_knownDictionaries)) { 0100 QString languageName = QLocale(dictName).name(); 0101 if (languageName.isEmpty()) { 0102 qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName; 0103 continue; 0104 } 0105 dictionaryLanguages.insert(languageName); 0106 } 0107 0108 QSet<QString> allLanguages; 0109 for (int i = 0; i < int(QChar::ScriptCount); i++) { 0110 QChar::Script script = static_cast<QChar::Script>(i); 0111 QStringList names; 0112 switch (script) { 0113 case QChar::Script_Latin: 0114 names = getNames(QLocale::LatinScript); 0115 break; 0116 case QChar::Script_Greek: 0117 names = getNames(QLocale::GreekScript); 0118 break; 0119 case QChar::Script_Cyrillic: 0120 names = getNames(QLocale::CyrillicScript); 0121 break; 0122 case QChar::Script_Armenian: 0123 names = getNames(QLocale::ArmenianScript); 0124 break; 0125 case QChar::Script_Hebrew: 0126 names = getNames(QLocale::HebrewScript); 0127 break; 0128 case QChar::Script_Arabic: 0129 names = getNames(QLocale::ArabicScript); 0130 break; 0131 case QChar::Script_Syriac: 0132 names = getNames(QLocale::SyriacScript); 0133 break; 0134 case QChar::Script_Thaana: 0135 names = getNames(QLocale::ThaanaScript); 0136 break; 0137 case QChar::Script_Devanagari: 0138 names = getNames(QLocale::DevanagariScript); 0139 break; 0140 case QChar::Script_Bengali: 0141 names = getNames(QLocale::BengaliScript); 0142 break; 0143 case QChar::Script_Gurmukhi: 0144 names = getNames(QLocale::GurmukhiScript); 0145 break; 0146 case QChar::Script_Gujarati: 0147 names = getNames(QLocale::GujaratiScript); 0148 break; 0149 case QChar::Script_Oriya: 0150 names = getNames(QLocale::OriyaScript); 0151 break; 0152 case QChar::Script_Tamil: 0153 names = getNames(QLocale::TamilScript); 0154 break; 0155 case QChar::Script_Telugu: 0156 names = getNames(QLocale::TeluguScript); 0157 break; 0158 case QChar::Script_Kannada: 0159 names = getNames(QLocale::KannadaScript); 0160 break; 0161 case QChar::Script_Malayalam: 0162 names = getNames(QLocale::MalayalamScript); 0163 break; 0164 case QChar::Script_Sinhala: 0165 names = getNames(QLocale::SinhalaScript); 0166 break; 0167 case QChar::Script_Thai: 0168 names = getNames(QLocale::ThaiScript); 0169 break; 0170 case QChar::Script_Lao: 0171 names = getNames(QLocale::LaoScript); 0172 break; 0173 case QChar::Script_Tibetan: 0174 names = getNames(QLocale::TibetanScript); 0175 break; 0176 case QChar::Script_Myanmar: 0177 names = getNames(QLocale::MyanmarScript); 0178 break; 0179 case QChar::Script_Georgian: 0180 names = getNames(QLocale::GeorgianScript); 0181 break; 0182 case QChar::Script_Hangul: 0183 names = getNames(QLocale::HangulScript); 0184 break; 0185 case QChar::Script_Ethiopic: 0186 names = getNames(QLocale::EthiopicScript); 0187 break; 0188 case QChar::Script_Cherokee: 0189 names = getNames(QLocale::CherokeeScript); 0190 break; 0191 case QChar::Script_CanadianAboriginal: 0192 names = getNames(QLocale::CanadianAboriginalScript); 0193 break; 0194 case QChar::Script_Ogham: 0195 names = getNames(QLocale::OghamScript); 0196 break; 0197 case QChar::Script_Runic: 0198 names = getNames(QLocale::RunicScript); 0199 break; 0200 case QChar::Script_Khmer: 0201 names = getNames(QLocale::KhmerScript); 0202 break; 0203 case QChar::Script_Mongolian: 0204 names = getNames(QLocale::MongolianScript); 0205 break; 0206 case QChar::Script_Hiragana: 0207 names = getNames(QLocale::HiraganaScript); 0208 break; 0209 case QChar::Script_Katakana: 0210 names = getNames(QLocale::KatakanaScript); 0211 break; 0212 case QChar::Script_Bopomofo: 0213 names = getNames(QLocale::BopomofoScript); 0214 break; 0215 case QChar::Script_Han: 0216 names = getNames(QLocale::HanScript); 0217 break; 0218 case QChar::Script_Yi: 0219 names = getNames(QLocale::YiScript); 0220 break; 0221 case QChar::Script_OldItalic: 0222 names = getNames(QLocale::OldItalicScript); 0223 break; 0224 case QChar::Script_Gothic: 0225 names = getNames(QLocale::GothicScript); 0226 break; 0227 case QChar::Script_Deseret: 0228 names = getNames(QLocale::DeseretScript); 0229 break; 0230 case QChar::Script_Tagalog: 0231 names = getNames(QLocale::TagalogScript); 0232 break; 0233 case QChar::Script_Hanunoo: 0234 names = getNames(QLocale::HanunooScript); 0235 break; 0236 case QChar::Script_Buhid: 0237 names = getNames(QLocale::BuhidScript); 0238 break; 0239 case QChar::Script_Tagbanwa: 0240 names = getNames(QLocale::TagbanwaScript); 0241 break; 0242 case QChar::Script_Coptic: 0243 names = getNames(QLocale::CopticScript); 0244 break; 0245 case QChar::Script_Limbu: 0246 names = getNames(QLocale::LimbuScript); 0247 break; 0248 case QChar::Script_TaiLe: 0249 names = getNames(QLocale::TaiLeScript); 0250 break; 0251 case QChar::Script_LinearB: 0252 names = getNames(QLocale::LinearBScript); 0253 break; 0254 case QChar::Script_Ugaritic: 0255 names = getNames(QLocale::UgariticScript); 0256 break; 0257 case QChar::Script_Shavian: 0258 names = getNames(QLocale::ShavianScript); 0259 break; 0260 case QChar::Script_Osmanya: 0261 names = getNames(QLocale::OsmanyaScript); 0262 break; 0263 case QChar::Script_Cypriot: 0264 names = getNames(QLocale::CypriotScript); 0265 break; 0266 case QChar::Script_Braille: 0267 names = getNames(QLocale::BrailleScript); 0268 break; 0269 case QChar::Script_Buginese: 0270 names = getNames(QLocale::BugineseScript); 0271 break; 0272 case QChar::Script_NewTaiLue: 0273 names = getNames(QLocale::NewTaiLueScript); 0274 break; 0275 case QChar::Script_Glagolitic: 0276 names = getNames(QLocale::GlagoliticScript); 0277 break; 0278 case QChar::Script_Tifinagh: 0279 names = getNames(QLocale::TifinaghScript); 0280 break; 0281 case QChar::Script_SylotiNagri: 0282 names = getNames(QLocale::SylotiNagriScript); 0283 break; 0284 case QChar::Script_OldPersian: 0285 names = getNames(QLocale::OldPersianScript); 0286 break; 0287 case QChar::Script_Kharoshthi: 0288 names = getNames(QLocale::KharoshthiScript); 0289 break; 0290 case QChar::Script_Balinese: 0291 names = getNames(QLocale::BalineseScript); 0292 break; 0293 case QChar::Script_Cuneiform: 0294 names = getNames(QLocale::CuneiformScript); 0295 break; 0296 case QChar::Script_Phoenician: 0297 names = getNames(QLocale::PhoenicianScript); 0298 break; 0299 case QChar::Script_PhagsPa: 0300 names = getNames(QLocale::PhagsPaScript); 0301 break; 0302 case QChar::Script_Nko: 0303 names = getNames(QLocale::NkoScript); 0304 break; 0305 case QChar::Script_Sundanese: 0306 names = getNames(QLocale::SundaneseScript); 0307 break; 0308 case QChar::Script_Lepcha: 0309 names = getNames(QLocale::LepchaScript); 0310 break; 0311 case QChar::Script_OlChiki: 0312 names = getNames(QLocale::OlChikiScript); 0313 break; 0314 case QChar::Script_Vai: 0315 names = getNames(QLocale::VaiScript); 0316 break; 0317 case QChar::Script_Saurashtra: 0318 names = getNames(QLocale::SaurashtraScript); 0319 break; 0320 case QChar::Script_KayahLi: 0321 names = getNames(QLocale::KayahLiScript); 0322 break; 0323 case QChar::Script_Rejang: 0324 names = getNames(QLocale::RejangScript); 0325 break; 0326 case QChar::Script_Lycian: 0327 names = getNames(QLocale::LycianScript); 0328 break; 0329 case QChar::Script_Carian: 0330 names = getNames(QLocale::CarianScript); 0331 break; 0332 case QChar::Script_Lydian: 0333 names = getNames(QLocale::LydianScript); 0334 break; 0335 case QChar::Script_Cham: 0336 names = getNames(QLocale::ChamScript); 0337 break; 0338 case QChar::Script_TaiTham: 0339 names = getNames(QLocale::LannaScript); 0340 break; 0341 case QChar::Script_TaiViet: 0342 names = getNames(QLocale::TaiVietScript); 0343 break; 0344 case QChar::Script_Avestan: 0345 names = getNames(QLocale::AvestanScript); 0346 break; 0347 case QChar::Script_EgyptianHieroglyphs: 0348 names = getNames(QLocale::EgyptianHieroglyphsScript); 0349 break; 0350 case QChar::Script_Samaritan: 0351 names = getNames(QLocale::SamaritanScript); 0352 break; 0353 case QChar::Script_Lisu: 0354 names = getNames(QLocale::FraserScript); 0355 break; 0356 case QChar::Script_Bamum: 0357 names = getNames(QLocale::BamumScript); 0358 break; 0359 case QChar::Script_Javanese: 0360 names = getNames(QLocale::JavaneseScript); 0361 break; 0362 case QChar::Script_MeeteiMayek: 0363 names = getNames(QLocale::MeiteiMayekScript); 0364 break; 0365 case QChar::Script_ImperialAramaic: 0366 names = getNames(QLocale::ImperialAramaicScript); 0367 break; 0368 case QChar::Script_OldSouthArabian: 0369 names = getNames(QLocale::OldSouthArabianScript); 0370 break; 0371 case QChar::Script_InscriptionalParthian: 0372 names = getNames(QLocale::InscriptionalParthianScript); 0373 break; 0374 case QChar::Script_InscriptionalPahlavi: 0375 names = getNames(QLocale::InscriptionalPahlaviScript); 0376 break; 0377 case QChar::Script_Kaithi: 0378 names = getNames(QLocale::KaithiScript); 0379 break; 0380 case QChar::Script_Batak: 0381 names = getNames(QLocale::BatakScript); 0382 break; 0383 case QChar::Script_Brahmi: 0384 names = getNames(QLocale::BrahmiScript); 0385 break; 0386 case QChar::Script_Mandaic: 0387 names = getNames(QLocale::MandaeanScript); 0388 break; 0389 case QChar::Script_Chakma: 0390 names = getNames(QLocale::ChakmaScript); 0391 break; 0392 case QChar::Script_MeroiticCursive: 0393 case QChar::Script_MeroiticHieroglyphs: 0394 names = getNames(QLocale::MeroiticCursiveScript); 0395 names.append(getNames(QLocale::MeroiticScript)); 0396 break; 0397 case QChar::Script_Miao: 0398 names = getNames(QLocale::PollardPhoneticScript); 0399 break; 0400 case QChar::Script_Sharada: 0401 names = getNames(QLocale::SharadaScript); 0402 break; 0403 case QChar::Script_SoraSompeng: 0404 names = getNames(QLocale::SoraSompengScript); 0405 break; 0406 case QChar::Script_Takri: 0407 names = getNames(QLocale::TakriScript); 0408 break; 0409 case QChar::Script_CaucasianAlbanian: 0410 names = getNames(QLocale::CaucasianAlbanianScript); 0411 break; 0412 case QChar::Script_BassaVah: 0413 names = getNames(QLocale::BassaVahScript); 0414 break; 0415 case QChar::Script_Duployan: 0416 names = getNames(QLocale::DuployanScript); 0417 break; 0418 case QChar::Script_Elbasan: 0419 names = getNames(QLocale::ElbasanScript); 0420 break; 0421 case QChar::Script_Grantha: 0422 names = getNames(QLocale::GranthaScript); 0423 break; 0424 case QChar::Script_PahawhHmong: 0425 names = getNames(QLocale::PahawhHmongScript); 0426 break; 0427 case QChar::Script_Khojki: 0428 names = getNames(QLocale::KhojkiScript); 0429 break; 0430 case QChar::Script_LinearA: 0431 names = getNames(QLocale::LinearAScript); 0432 break; 0433 case QChar::Script_Mahajani: 0434 names = getNames(QLocale::MahajaniScript); 0435 break; 0436 case QChar::Script_Manichaean: 0437 names = getNames(QLocale::ManichaeanScript); 0438 break; 0439 case QChar::Script_MendeKikakui: 0440 names = getNames(QLocale::MendeKikakuiScript); 0441 break; 0442 case QChar::Script_Modi: 0443 names = getNames(QLocale::ModiScript); 0444 break; 0445 case QChar::Script_Mro: 0446 names = getNames(QLocale::MroScript); 0447 break; 0448 case QChar::Script_OldNorthArabian: 0449 names = getNames(QLocale::OldNorthArabianScript); 0450 break; 0451 case QChar::Script_Nabataean: 0452 names = getNames(QLocale::NabataeanScript); 0453 break; 0454 case QChar::Script_Palmyrene: 0455 names = getNames(QLocale::PalmyreneScript); 0456 break; 0457 case QChar::Script_PauCinHau: 0458 names = getNames(QLocale::PauCinHauScript); 0459 break; 0460 case QChar::Script_OldPermic: 0461 names = getNames(QLocale::OldPermicScript); 0462 break; 0463 case QChar::Script_PsalterPahlavi: 0464 names = getNames(QLocale::PsalterPahlaviScript); 0465 break; 0466 case QChar::Script_Siddham: 0467 names = getNames(QLocale::SiddhamScript); 0468 break; 0469 case QChar::Script_Khudawadi: 0470 names = getNames(QLocale::KhudawadiScript); 0471 break; 0472 case QChar::Script_Tirhuta: 0473 names = getNames(QLocale::TirhutaScript); 0474 break; 0475 case QChar::Script_WarangCiti: 0476 names = getNames(QLocale::VarangKshitiScript); 0477 break; 0478 case QChar::Script_Ahom: 0479 names = getNames(QLocale::AhomScript); 0480 break; 0481 case QChar::Script_AnatolianHieroglyphs: 0482 names = getNames(QLocale::AnatolianHieroglyphsScript); 0483 break; 0484 case QChar::Script_Hatran: 0485 names = getNames(QLocale::HatranScript); 0486 break; 0487 case QChar::Script_Multani: 0488 names = getNames(QLocale::MultaniScript); 0489 break; 0490 case QChar::Script_OldHungarian: 0491 names = getNames(QLocale::OldHungarianScript); 0492 break; 0493 case QChar::Script_Unknown: 0494 case QChar::Script_Inherited: 0495 case QChar::Script_Common: 0496 case QChar::Script_OldTurkic: 0497 case QChar::Script_SignWriting: 0498 break; 0499 default: 0500 qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script; 0501 break; 0502 } 0503 allLanguages.unite(QSet<QString>(names.constBegin(), names.constEnd())); 0504 0505 { // Remove unknown languages 0506 QStringList pruned; 0507 for (const QString &name : std::as_const(names)) { 0508 if (!dictionaryLanguages.contains(name)) { 0509 continue; 0510 } 0511 pruned.append(name); 0512 } 0513 names = pruned; 0514 } 0515 0516 if (names.isEmpty()) { 0517 continue; 0518 } 0519 0520 for (const QString &name : std::as_const(names)) { 0521 s_scriptLanguages.insert(script, name); 0522 } 0523 } 0524 0525 // Try to handle some badly named dictionaries 0526 if (!allLanguages.contains(s_knownDictionaries)) { 0527 QSet<QString> dicts(s_knownDictionaries); 0528 dicts.subtract(allLanguages); 0529 for (const QString &dictName : std::as_const(dicts)) { 0530 QString languageName = QLocale(dictName).name(); 0531 if (languageName.isEmpty()) { 0532 qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName; 0533 continue; 0534 } 0535 s_dictionaryNameMap[languageName] = dictName; 0536 if (std::find(s_scriptLanguages.cbegin(), s_scriptLanguages.cend(), languageName) == s_scriptLanguages.cend()) { 0537 qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName; 0538 } 0539 } 0540 } 0541 } 0542 0543 GuessLanguage::GuessLanguage() 0544 : d(new GuessLanguagePrivate) 0545 { 0546 } 0547 0548 GuessLanguage::~GuessLanguage() 0549 { 0550 delete d; 0551 } 0552 0553 QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const 0554 { 0555 if (text.isEmpty()) { 0556 return QString(); 0557 } 0558 0559 // Filter for available dictionaries 0560 QStringList suggestionsList; 0561 for (const QString &suggestion : suggestionsListIn) { 0562 if (d->s_knownDictionaries.contains(suggestion) && !suggestionsList.contains(suggestion)) { 0563 suggestionsList.append(suggestion); 0564 } 0565 } 0566 0567 // Load the model on demand 0568 if (d->s_knownModels.isEmpty()) { 0569 d->loadModels(); 0570 } 0571 0572 const QList<QChar::Script> scriptsList = d->findRuns(text); 0573 0574 QStringList candidateLanguages = d->identify(text, scriptsList); 0575 0576 // if guessing from trigrams fail 0577 for (const QChar::Script script : scriptsList) { 0578 const auto languagesList = d->s_scriptLanguages.values(script); 0579 for (const QString &lang : languagesList) { 0580 if (!d->s_knownModels.contains(lang)) { 0581 candidateLanguages.append(lang); 0582 } 0583 } 0584 } 0585 0586 // Hack for some bad dictionary names 0587 for (int i = 0; i < candidateLanguages.count(); i++) { 0588 if (d->s_dictionaryNameMap.contains(candidateLanguages[i])) { 0589 candidateLanguages[i] = d->s_dictionaryNameMap.value(candidateLanguages[i]); 0590 } 0591 } 0592 0593 if (candidateLanguages.count() == 1) { 0594 return candidateLanguages.first(); 0595 } 0596 0597 // Wasn't able to get a good guess with the trigrams, try checking all 0598 // dictionaries for the suggested languages. 0599 candidateLanguages.append(suggestionsList); 0600 candidateLanguages.removeDuplicates(); 0601 QString identified = d->guessFromDictionaries(text, candidateLanguages); 0602 if (!identified.isEmpty()) { 0603 return identified; 0604 } 0605 0606 qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text; 0607 0608 // None of our methods worked, just return the best suggestion 0609 if (!suggestionsList.isEmpty()) { 0610 return suggestionsList.first(); 0611 } 0612 0613 qCDebug(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text; 0614 0615 // Not even any suggestions, give up 0616 return QString(); 0617 } 0618 0619 void GuessLanguage::setLimits(int maxItems, double minConfidence) 0620 { 0621 d->m_maxItems = maxItems; 0622 d->m_minConfidence = minConfidence; 0623 } 0624 0625 void GuessLanguagePrivate::loadModels() 0626 { 0627 // use trigrams from resource file, easy to deploy on all platforms 0628 const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map"); 0629 qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile; 0630 0631 QFile sin(triMapFile); 0632 if (!sin.open(QIODevice::ReadOnly)) { 0633 qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile; 0634 return; 0635 } 0636 0637 QDataStream in(&sin); 0638 in >> s_knownModels; 0639 0640 // Sanity check 0641 QSet<QString> availableLanguages; 0642 QHashIterator<QString, QHash<QString, int>> iterator(s_knownModels); 0643 while (iterator.hasNext()) { 0644 iterator.next(); 0645 if (iterator.value().count() < MAXGRAMS) { 0646 qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS; 0647 } 0648 availableLanguages.insert(iterator.key()); 0649 } 0650 QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd()); 0651 knownLanguages.subtract(availableLanguages); 0652 if (!knownLanguages.isEmpty()) { 0653 qCDebug(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages; 0654 } 0655 } 0656 0657 QList<QChar::Script> GuessLanguagePrivate::findRuns(const QString &text) 0658 { 0659 QHash<QChar::Script, int> scriptCounts; 0660 0661 int totalCount = 0; 0662 0663 for (const QChar c : text) { 0664 const QChar::Script script = c.script(); 0665 0666 if (script == QChar::Script_Common || script == QChar::Script_Inherited) { 0667 continue; 0668 } 0669 0670 if (!c.isLetter()) { 0671 continue; 0672 } 0673 0674 scriptCounts[script]++; 0675 totalCount++; 0676 } 0677 0678 QList<QChar::Script> relevantScripts; 0679 0680 if (totalCount == 0) { 0681 return relevantScripts; 0682 } 0683 0684 if (scriptCounts.size() == 1) { 0685 return {scriptCounts.cbegin().key()}; 0686 } 0687 0688 for (auto it = scriptCounts.cbegin(); it != scriptCounts.cend(); ++it) { 0689 // return run types that used for 40% or more of the string 0690 const int scriptCount = it.value(); 0691 const auto currentScript = it.key(); 0692 if (scriptCount * 100 / totalCount >= 40) { 0693 relevantScripts << currentScript; 0694 // always return basic latin if found more than 15%. 0695 } else if (currentScript == QChar::Script_Latin && scriptCount * 100 / totalCount >= 15) { 0696 relevantScripts << currentScript; 0697 } 0698 } 0699 0700 return relevantScripts; 0701 } 0702 0703 QStringList GuessLanguagePrivate::identify(const QString &sample, const QList<QChar::Script> &scripts) 0704 { 0705 if (sample.size() < MIN_LENGTH) { 0706 return QStringList(); 0707 } 0708 0709 QStringList guesses; 0710 for (const QChar::Script script : scripts) { 0711 guesses.append(guessFromTrigrams(sample, s_scriptLanguages.values(script))); 0712 } 0713 0714 return guesses; 0715 } 0716 0717 QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages) 0718 { 0719 QStringList ret; 0720 0721 const QVector<QString> sampleTrigrams = createOrderedModel(sample); 0722 0723 // Sort by score 0724 QMultiMap<int, QString> scores; 0725 for (const QString &language : languages) { 0726 if (s_knownModels.contains(language)) { 0727 scores.insert(distance(sampleTrigrams, s_knownModels[language]), language); 0728 } 0729 } 0730 0731 // Skip if either no results or best result is completely unknown (distance >= maxdistance) 0732 if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) { 0733 qCDebug(SONNET_LOG_CORE) << "No scores for" << sample; 0734 return ret; 0735 } 0736 0737 int counter = 0; 0738 double confidence = 0; 0739 0740 #if QT_VERSION >= QT_VERSION_CHECK(6, 0, 0) 0741 QMultiMapIterator<int, QString> it(scores); 0742 #else 0743 QMapIterator<int, QString> it(scores); 0744 #endif 0745 it.next(); 0746 0747 QString prevItem = it.value(); 0748 int prevScore = it.key(); 0749 0750 while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) { 0751 it.next(); 0752 counter++; 0753 confidence += (it.key() - prevScore) / (double)it.key(); 0754 ret += prevItem; 0755 prevItem = it.value(); 0756 prevScore = it.key(); 0757 } 0758 if (counter < m_maxItems && confidence < m_minConfidence) { 0759 ret += prevItem; 0760 } 0761 0762 return ret; 0763 } 0764 0765 QVector<QString> GuessLanguagePrivate::createOrderedModel(const QString &content) 0766 { 0767 QHash<QString, int> trigramCounts; 0768 0769 // collect trigrams 0770 trigramCounts.reserve(content.size() - 2); 0771 for (int i = 0; i < (content.size() - 2); ++i) { 0772 QString tri = content.mid(i, 3).toLower(); 0773 trigramCounts[tri]++; 0774 } 0775 0776 // invert the map <freq, trigram> 0777 QVector<QPair<int, QString>> trigramFrequencyList; 0778 trigramFrequencyList.reserve(trigramCounts.size()); 0779 0780 auto it = trigramCounts.constBegin(); 0781 for (; it != trigramCounts.constEnd(); ++it) { 0782 const QChar *data = it.key().constData(); 0783 bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace())); 0784 0785 if (!hasTwoSpaces) { 0786 const int freq = it.value(); 0787 const QString &trigram = it.key(); 0788 trigramFrequencyList.append({freq, trigram}); 0789 } 0790 } 0791 0792 // sort descending by frequency 0793 std::sort(trigramFrequencyList.begin(), trigramFrequencyList.end(), [](const QPair<int, QString> &a, const QPair<int, QString> &b) { 0794 return a.first > b.first; 0795 }); 0796 0797 QVector<QString> orderedTrigrams; 0798 orderedTrigrams.reserve(trigramFrequencyList.size()); 0799 for (const auto &tri : std::as_const(trigramFrequencyList)) { 0800 orderedTrigrams.append(tri.second); 0801 } 0802 0803 return orderedTrigrams; 0804 } 0805 0806 int GuessLanguagePrivate::distance(const QVector<QString> &model, const QHash<QString, int> &knownModel) 0807 { 0808 int counter = -1; 0809 int dist = 0; 0810 0811 for (const QString &trigram : model) { 0812 const int val = knownModel.value(trigram, -1); 0813 if (val != -1) { 0814 dist += qAbs(++counter - val); 0815 } else { 0816 dist += MAXGRAMS; 0817 } 0818 0819 if (counter == (MAXGRAMS - 1)) { 0820 break; 0821 } 0822 } 0823 0824 return dist; 0825 } 0826 0827 QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates) 0828 { 0829 // Try to see how many languages we can get spell checking for 0830 QList<QSharedPointer<SpellerPlugin>> spellers; 0831 for (const QString &lang : candidates) { 0832 if (!Loader::openLoader()->languages().contains(lang)) { 0833 qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang; 0834 continue; 0835 } 0836 QSharedPointer<SpellerPlugin> plugin = Loader::openLoader()->cachedSpeller(lang); 0837 if (!plugin.isNull()) { 0838 spellers.append(plugin); 0839 } 0840 } 0841 0842 // If there's no spell checkers, give up 0843 if (spellers.isEmpty()) { 0844 return QString(); 0845 } 0846 0847 QMap<QString, int> correctHits; 0848 0849 WordTokenizer tokenizer(sentence); 0850 while (tokenizer.hasNext()) { 0851 Token word = tokenizer.next(); 0852 if (!tokenizer.isSpellcheckable()) { 0853 continue; 0854 } 0855 0856 for (int i = 0; i < spellers.count(); ++i) { 0857 if (spellers[i]->isCorrect(word.toString())) { 0858 correctHits[spellers[i]->language()]++; 0859 } 0860 } 0861 } 0862 0863 if (correctHits.isEmpty()) { 0864 return QString(); 0865 } 0866 0867 QMap<QString, int>::const_iterator max = correctHits.constBegin(); 0868 for (QMap<QString, int>::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) { 0869 if (itr.value() > max.value()) { 0870 max = itr; 0871 } 0872 } 0873 return max.key(); 0874 } 0875 }