File indexing completed on 2024-03-24 04:03:40
0001 /* This file is part of the KDE libraries 0002 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> 0003 SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> 0004 SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org> 0005 0006 SPDX-License-Identifier: LGPL-2.0-or-later 0007 */ 0008 0009 #include <QCoreApplication> 0010 #include <QDataStream> 0011 #include <QFile> 0012 #include <QLocale> 0013 #include <QStandardPaths> 0014 0015 #include "core_debug.h" 0016 #include "guesslanguage.h" 0017 #include "loader_p.h" 0018 #include "speller.h" 0019 #include "spellerplugin_p.h" 0020 #include "tokenizer_p.h" 0021 0022 /* 0023 All language tags should be valid according to IETF BCP 47, as codified in RFC 4646. 0024 ISO 639-1 codes should be used for the language part except for cases where there 0025 exists no code, then 639-3 codes should be used. Country codes should only be used 0026 in special cases. Scripts can be differentiated by IANA subtags, available here: 0027 http://www.iana.org/assignments/language-subtag-registry 0028 The script tags correspond to ISO 15924 0029 0030 An overview of the best practices concerning language tagging is available here: 0031 http://www.w3.org/International/articles/language-tags/Overview.en.php 0032 0033 lang tags should use underscores (_) rather than hyphens (-) to separate subsections. 0034 0035 EXCEPTIONS: 0036 For cases of known differences from the above tagging scheme and major 0037 spellcheckers such aspell/hunspell/myspell, the scheme used by the spell checkers 0038 shall be used. All exception shall be noted here: 0039 0040 BCP SPELLCHECK 0041 az-Latn az 0042 0043 */ 0044 0045 namespace Sonnet 0046 { 0047 class GuessLanguagePrivate 0048 { 0049 public: 0050 GuessLanguagePrivate(); 0051 // language trigram score 0052 static QHash<QString, QHash<QString, int>> s_knownModels; 0053 0054 void loadModels(); 0055 QList<QChar::Script> findRuns(const QString &text); 0056 QList<QString> createOrderedModel(const QString &content); 0057 int distance(const QList<QString> &model, const QHash<QString, int> &knownModel); 0058 QStringList guessFromTrigrams(const QString &sample, const QStringList &langs); 0059 QStringList identify(const QString &sample, const QList<QChar::Script> &scripts); 0060 QString guessFromDictionaries(const QString &sentence, const QStringList &candidates); 0061 0062 static QSet<QString> s_knownDictionaries; 0063 static QMultiHash<QChar::Script, QString> s_scriptLanguages; 0064 static QMap<QString, QString> s_dictionaryNameMap; 0065 0066 const int MIN_LENGTH; 0067 int m_maxItems; 0068 double m_minConfidence; 0069 }; 0070 0071 QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels; 0072 QSet<QString> GuessLanguagePrivate::s_knownDictionaries; 0073 QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages; 0074 QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap; 0075 0076 QStringList getNames(QLocale::Script script) 0077 { 0078 QStringList locales; 0079 const auto matchingLocales = QLocale::matchingLocales(QLocale::AnyLanguage, script, QLocale::AnyCountry); 0080 locales.reserve(matchingLocales.size()); 0081 for (const QLocale &locale : matchingLocales) { 0082 locales << locale.name(); 0083 } 0084 return locales; 0085 } 0086 0087 GuessLanguagePrivate::GuessLanguagePrivate() 0088 : MIN_LENGTH(5) 0089 , m_maxItems(1) 0090 , m_minConfidence(0) 0091 { 0092 if (!s_scriptLanguages.isEmpty()) { 0093 return; 0094 } 0095 0096 const QStringList languages = Loader::openLoader()->languages(); 0097 s_knownDictionaries = QSet<QString>(languages.begin(), languages.end()); 0098 QSet<QString> dictionaryLanguages; 0099 for (const QString &dictName : std::as_const(s_knownDictionaries)) { 0100 QString languageName = QLocale(dictName).name(); 0101 if (languageName.isEmpty()) { 0102 qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName; 0103 continue; 0104 } 0105 dictionaryLanguages.insert(languageName); 0106 } 0107 0108 QSet<QString> allLanguages; 0109 for (int i = 0; i < int(QChar::ScriptCount); i++) { 0110 QChar::Script script = static_cast<QChar::Script>(i); 0111 QStringList names; 0112 switch (script) { 0113 case QChar::Script_Latin: 0114 names = getNames(QLocale::LatinScript); 0115 break; 0116 case QChar::Script_Greek: 0117 names = getNames(QLocale::GreekScript); 0118 break; 0119 case QChar::Script_Cyrillic: 0120 names = getNames(QLocale::CyrillicScript); 0121 break; 0122 case QChar::Script_Armenian: 0123 names = getNames(QLocale::ArmenianScript); 0124 break; 0125 case QChar::Script_Hebrew: 0126 names = getNames(QLocale::HebrewScript); 0127 break; 0128 case QChar::Script_Arabic: 0129 names = getNames(QLocale::ArabicScript); 0130 break; 0131 case QChar::Script_Syriac: 0132 names = getNames(QLocale::SyriacScript); 0133 break; 0134 case QChar::Script_Thaana: 0135 names = getNames(QLocale::ThaanaScript); 0136 break; 0137 case QChar::Script_Devanagari: 0138 names = getNames(QLocale::DevanagariScript); 0139 break; 0140 case QChar::Script_Bengali: 0141 names = getNames(QLocale::BengaliScript); 0142 break; 0143 case QChar::Script_Gurmukhi: 0144 names = getNames(QLocale::GurmukhiScript); 0145 break; 0146 case QChar::Script_Gujarati: 0147 names = getNames(QLocale::GujaratiScript); 0148 break; 0149 case QChar::Script_Oriya: 0150 names = getNames(QLocale::OriyaScript); 0151 break; 0152 case QChar::Script_Tamil: 0153 names = getNames(QLocale::TamilScript); 0154 break; 0155 case QChar::Script_Telugu: 0156 names = getNames(QLocale::TeluguScript); 0157 break; 0158 case QChar::Script_Kannada: 0159 names = getNames(QLocale::KannadaScript); 0160 break; 0161 case QChar::Script_Malayalam: 0162 names = getNames(QLocale::MalayalamScript); 0163 break; 0164 case QChar::Script_Sinhala: 0165 names = getNames(QLocale::SinhalaScript); 0166 break; 0167 case QChar::Script_Thai: 0168 names = getNames(QLocale::ThaiScript); 0169 break; 0170 case QChar::Script_Lao: 0171 names = getNames(QLocale::LaoScript); 0172 break; 0173 case QChar::Script_Tibetan: 0174 names = getNames(QLocale::TibetanScript); 0175 break; 0176 case QChar::Script_Myanmar: 0177 names = getNames(QLocale::MyanmarScript); 0178 break; 0179 case QChar::Script_Georgian: 0180 names = getNames(QLocale::GeorgianScript); 0181 break; 0182 case QChar::Script_Hangul: 0183 names = getNames(QLocale::HangulScript); 0184 break; 0185 case QChar::Script_Ethiopic: 0186 names = getNames(QLocale::EthiopicScript); 0187 break; 0188 case QChar::Script_Cherokee: 0189 names = getNames(QLocale::CherokeeScript); 0190 break; 0191 case QChar::Script_CanadianAboriginal: 0192 names = getNames(QLocale::CanadianAboriginalScript); 0193 break; 0194 case QChar::Script_Ogham: 0195 names = getNames(QLocale::OghamScript); 0196 break; 0197 case QChar::Script_Runic: 0198 names = getNames(QLocale::RunicScript); 0199 break; 0200 case QChar::Script_Khmer: 0201 names = getNames(QLocale::KhmerScript); 0202 break; 0203 case QChar::Script_Mongolian: 0204 names = getNames(QLocale::MongolianScript); 0205 break; 0206 case QChar::Script_Hiragana: 0207 names = getNames(QLocale::HiraganaScript); 0208 break; 0209 case QChar::Script_Katakana: 0210 names = getNames(QLocale::KatakanaScript); 0211 break; 0212 case QChar::Script_Bopomofo: 0213 names = getNames(QLocale::BopomofoScript); 0214 break; 0215 case QChar::Script_Han: 0216 names = getNames(QLocale::HanScript); 0217 break; 0218 case QChar::Script_Yi: 0219 names = getNames(QLocale::YiScript); 0220 break; 0221 case QChar::Script_OldItalic: 0222 names = getNames(QLocale::OldItalicScript); 0223 break; 0224 case QChar::Script_Gothic: 0225 names = getNames(QLocale::GothicScript); 0226 break; 0227 case QChar::Script_Deseret: 0228 names = getNames(QLocale::DeseretScript); 0229 break; 0230 case QChar::Script_Tagalog: 0231 names = getNames(QLocale::TagalogScript); 0232 break; 0233 case QChar::Script_Hanunoo: 0234 names = getNames(QLocale::HanunooScript); 0235 break; 0236 case QChar::Script_Buhid: 0237 names = getNames(QLocale::BuhidScript); 0238 break; 0239 case QChar::Script_Tagbanwa: 0240 names = getNames(QLocale::TagbanwaScript); 0241 break; 0242 case QChar::Script_Coptic: 0243 names = getNames(QLocale::CopticScript); 0244 break; 0245 case QChar::Script_Limbu: 0246 names = getNames(QLocale::LimbuScript); 0247 break; 0248 case QChar::Script_TaiLe: 0249 names = getNames(QLocale::TaiLeScript); 0250 break; 0251 case QChar::Script_LinearB: 0252 names = getNames(QLocale::LinearBScript); 0253 break; 0254 case QChar::Script_Ugaritic: 0255 names = getNames(QLocale::UgariticScript); 0256 break; 0257 case QChar::Script_Shavian: 0258 names = getNames(QLocale::ShavianScript); 0259 break; 0260 case QChar::Script_Osmanya: 0261 names = getNames(QLocale::OsmanyaScript); 0262 break; 0263 case QChar::Script_Cypriot: 0264 names = getNames(QLocale::CypriotScript); 0265 break; 0266 case QChar::Script_Braille: 0267 names = getNames(QLocale::BrailleScript); 0268 break; 0269 case QChar::Script_Buginese: 0270 names = getNames(QLocale::BugineseScript); 0271 break; 0272 case QChar::Script_NewTaiLue: 0273 names = getNames(QLocale::NewTaiLueScript); 0274 break; 0275 case QChar::Script_Glagolitic: 0276 names = getNames(QLocale::GlagoliticScript); 0277 break; 0278 case QChar::Script_Tifinagh: 0279 names = getNames(QLocale::TifinaghScript); 0280 break; 0281 case QChar::Script_SylotiNagri: 0282 names = getNames(QLocale::SylotiNagriScript); 0283 break; 0284 case QChar::Script_OldPersian: 0285 names = getNames(QLocale::OldPersianScript); 0286 break; 0287 case QChar::Script_Kharoshthi: 0288 names = getNames(QLocale::KharoshthiScript); 0289 break; 0290 case QChar::Script_Balinese: 0291 names = getNames(QLocale::BalineseScript); 0292 break; 0293 case QChar::Script_Cuneiform: 0294 names = getNames(QLocale::CuneiformScript); 0295 break; 0296 case QChar::Script_Phoenician: 0297 names = getNames(QLocale::PhoenicianScript); 0298 break; 0299 case QChar::Script_PhagsPa: 0300 names = getNames(QLocale::PhagsPaScript); 0301 break; 0302 case QChar::Script_Nko: 0303 names = getNames(QLocale::NkoScript); 0304 break; 0305 case QChar::Script_Sundanese: 0306 names = getNames(QLocale::SundaneseScript); 0307 break; 0308 case QChar::Script_Lepcha: 0309 names = getNames(QLocale::LepchaScript); 0310 break; 0311 case QChar::Script_OlChiki: 0312 names = getNames(QLocale::OlChikiScript); 0313 break; 0314 case QChar::Script_Vai: 0315 names = getNames(QLocale::VaiScript); 0316 break; 0317 case QChar::Script_Saurashtra: 0318 names = getNames(QLocale::SaurashtraScript); 0319 break; 0320 case QChar::Script_KayahLi: 0321 names = getNames(QLocale::KayahLiScript); 0322 break; 0323 case QChar::Script_Rejang: 0324 names = getNames(QLocale::RejangScript); 0325 break; 0326 case QChar::Script_Lycian: 0327 names = getNames(QLocale::LycianScript); 0328 break; 0329 case QChar::Script_Carian: 0330 names = getNames(QLocale::CarianScript); 0331 break; 0332 case QChar::Script_Lydian: 0333 names = getNames(QLocale::LydianScript); 0334 break; 0335 case QChar::Script_Cham: 0336 names = getNames(QLocale::ChamScript); 0337 break; 0338 case QChar::Script_TaiTham: 0339 names = getNames(QLocale::LannaScript); 0340 break; 0341 case QChar::Script_TaiViet: 0342 names = getNames(QLocale::TaiVietScript); 0343 break; 0344 case QChar::Script_Avestan: 0345 names = getNames(QLocale::AvestanScript); 0346 break; 0347 case QChar::Script_EgyptianHieroglyphs: 0348 names = getNames(QLocale::EgyptianHieroglyphsScript); 0349 break; 0350 case QChar::Script_Samaritan: 0351 names = getNames(QLocale::SamaritanScript); 0352 break; 0353 case QChar::Script_Lisu: 0354 names = getNames(QLocale::FraserScript); 0355 break; 0356 case QChar::Script_Bamum: 0357 names = getNames(QLocale::BamumScript); 0358 break; 0359 case QChar::Script_Javanese: 0360 names = getNames(QLocale::JavaneseScript); 0361 break; 0362 case QChar::Script_MeeteiMayek: 0363 names = getNames(QLocale::MeiteiMayekScript); 0364 break; 0365 case QChar::Script_ImperialAramaic: 0366 names = getNames(QLocale::ImperialAramaicScript); 0367 break; 0368 case QChar::Script_OldSouthArabian: 0369 names = getNames(QLocale::OldSouthArabianScript); 0370 break; 0371 case QChar::Script_InscriptionalParthian: 0372 names = getNames(QLocale::InscriptionalParthianScript); 0373 break; 0374 case QChar::Script_InscriptionalPahlavi: 0375 names = getNames(QLocale::InscriptionalPahlaviScript); 0376 break; 0377 case QChar::Script_Kaithi: 0378 names = getNames(QLocale::KaithiScript); 0379 break; 0380 case QChar::Script_Batak: 0381 names = getNames(QLocale::BatakScript); 0382 break; 0383 case QChar::Script_Brahmi: 0384 names = getNames(QLocale::BrahmiScript); 0385 break; 0386 case QChar::Script_Mandaic: 0387 names = getNames(QLocale::MandaeanScript); 0388 break; 0389 case QChar::Script_Chakma: 0390 names = getNames(QLocale::ChakmaScript); 0391 break; 0392 case QChar::Script_MeroiticCursive: 0393 case QChar::Script_MeroiticHieroglyphs: 0394 names = getNames(QLocale::MeroiticCursiveScript); 0395 names.append(getNames(QLocale::MeroiticScript)); 0396 break; 0397 case QChar::Script_Miao: 0398 names = getNames(QLocale::PollardPhoneticScript); 0399 break; 0400 case QChar::Script_Sharada: 0401 names = getNames(QLocale::SharadaScript); 0402 break; 0403 case QChar::Script_SoraSompeng: 0404 names = getNames(QLocale::SoraSompengScript); 0405 break; 0406 case QChar::Script_Takri: 0407 names = getNames(QLocale::TakriScript); 0408 break; 0409 case QChar::Script_CaucasianAlbanian: 0410 names = getNames(QLocale::CaucasianAlbanianScript); 0411 break; 0412 case QChar::Script_BassaVah: 0413 names = getNames(QLocale::BassaVahScript); 0414 break; 0415 case QChar::Script_Duployan: 0416 names = getNames(QLocale::DuployanScript); 0417 break; 0418 case QChar::Script_Elbasan: 0419 names = getNames(QLocale::ElbasanScript); 0420 break; 0421 case QChar::Script_Grantha: 0422 names = getNames(QLocale::GranthaScript); 0423 break; 0424 case QChar::Script_PahawhHmong: 0425 names = getNames(QLocale::PahawhHmongScript); 0426 break; 0427 case QChar::Script_Khojki: 0428 names = getNames(QLocale::KhojkiScript); 0429 break; 0430 case QChar::Script_LinearA: 0431 names = getNames(QLocale::LinearAScript); 0432 break; 0433 case QChar::Script_Mahajani: 0434 names = getNames(QLocale::MahajaniScript); 0435 break; 0436 case QChar::Script_Manichaean: 0437 names = getNames(QLocale::ManichaeanScript); 0438 break; 0439 case QChar::Script_MendeKikakui: 0440 names = getNames(QLocale::MendeKikakuiScript); 0441 break; 0442 case QChar::Script_Modi: 0443 names = getNames(QLocale::ModiScript); 0444 break; 0445 case QChar::Script_Mro: 0446 names = getNames(QLocale::MroScript); 0447 break; 0448 case QChar::Script_OldNorthArabian: 0449 names = getNames(QLocale::OldNorthArabianScript); 0450 break; 0451 case QChar::Script_Nabataean: 0452 names = getNames(QLocale::NabataeanScript); 0453 break; 0454 case QChar::Script_Palmyrene: 0455 names = getNames(QLocale::PalmyreneScript); 0456 break; 0457 case QChar::Script_PauCinHau: 0458 names = getNames(QLocale::PauCinHauScript); 0459 break; 0460 case QChar::Script_OldPermic: 0461 names = getNames(QLocale::OldPermicScript); 0462 break; 0463 case QChar::Script_PsalterPahlavi: 0464 names = getNames(QLocale::PsalterPahlaviScript); 0465 break; 0466 case QChar::Script_Siddham: 0467 names = getNames(QLocale::SiddhamScript); 0468 break; 0469 case QChar::Script_Khudawadi: 0470 names = getNames(QLocale::KhudawadiScript); 0471 break; 0472 case QChar::Script_Tirhuta: 0473 names = getNames(QLocale::TirhutaScript); 0474 break; 0475 case QChar::Script_WarangCiti: 0476 names = getNames(QLocale::VarangKshitiScript); 0477 break; 0478 case QChar::Script_Ahom: 0479 names = getNames(QLocale::AhomScript); 0480 break; 0481 case QChar::Script_AnatolianHieroglyphs: 0482 names = getNames(QLocale::AnatolianHieroglyphsScript); 0483 break; 0484 case QChar::Script_Hatran: 0485 names = getNames(QLocale::HatranScript); 0486 break; 0487 case QChar::Script_Multani: 0488 names = getNames(QLocale::MultaniScript); 0489 break; 0490 case QChar::Script_OldHungarian: 0491 names = getNames(QLocale::OldHungarianScript); 0492 break; 0493 case QChar::Script_Unknown: 0494 case QChar::Script_Inherited: 0495 case QChar::Script_Common: 0496 case QChar::Script_OldTurkic: 0497 case QChar::Script_SignWriting: 0498 break; 0499 default: 0500 qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script; 0501 break; 0502 } 0503 allLanguages.unite(QSet<QString>(names.constBegin(), names.constEnd())); 0504 0505 { // Remove unknown languages 0506 QStringList pruned; 0507 for (const QString &name : std::as_const(names)) { 0508 if (!dictionaryLanguages.contains(name)) { 0509 continue; 0510 } 0511 pruned.append(name); 0512 } 0513 names = pruned; 0514 } 0515 0516 if (names.isEmpty()) { 0517 continue; 0518 } 0519 0520 for (const QString &name : std::as_const(names)) { 0521 s_scriptLanguages.insert(script, name); 0522 } 0523 } 0524 0525 // Try to handle some badly named dictionaries 0526 if (!allLanguages.contains(s_knownDictionaries)) { 0527 QSet<QString> dicts(s_knownDictionaries); 0528 dicts.subtract(allLanguages); 0529 for (const QString &dictName : std::as_const(dicts)) { 0530 QString languageName = QLocale(dictName).name(); 0531 if (languageName.isEmpty()) { 0532 qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName; 0533 continue; 0534 } 0535 s_dictionaryNameMap[languageName] = dictName; 0536 if (std::find(s_scriptLanguages.cbegin(), s_scriptLanguages.cend(), languageName) == s_scriptLanguages.cend()) { 0537 qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName; 0538 } 0539 } 0540 } 0541 } 0542 0543 GuessLanguage::GuessLanguage() 0544 : d(new GuessLanguagePrivate) 0545 { 0546 } 0547 0548 GuessLanguage::~GuessLanguage() = default; 0549 0550 QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const 0551 { 0552 if (text.isEmpty()) { 0553 return QString(); 0554 } 0555 0556 // Filter for available dictionaries 0557 QStringList suggestionsList; 0558 for (const QString &suggestion : suggestionsListIn) { 0559 if (d->s_knownDictionaries.contains(suggestion) && !suggestionsList.contains(suggestion)) { 0560 suggestionsList.append(suggestion); 0561 } 0562 } 0563 0564 // Load the model on demand 0565 if (d->s_knownModels.isEmpty()) { 0566 d->loadModels(); 0567 } 0568 0569 const QList<QChar::Script> scriptsList = d->findRuns(text); 0570 0571 QStringList candidateLanguages = d->identify(text, scriptsList); 0572 0573 // if guessing from trigrams fail 0574 if (candidateLanguages.isEmpty()) { 0575 for (const QChar::Script script : scriptsList) { 0576 const auto languagesList = d->s_scriptLanguages.values(script); 0577 for (const QString &lang : languagesList) { 0578 if (!d->s_knownModels.contains(lang)) { 0579 candidateLanguages.append(lang); 0580 } 0581 } 0582 } 0583 } 0584 0585 // Hack for some bad dictionary names 0586 for (int i = 0; i < candidateLanguages.count(); i++) { 0587 if (d->s_dictionaryNameMap.contains(candidateLanguages[i])) { 0588 candidateLanguages[i] = d->s_dictionaryNameMap.value(candidateLanguages[i]); 0589 } 0590 } 0591 0592 if (candidateLanguages.count() == 1) { 0593 return candidateLanguages.first(); 0594 } 0595 0596 // Wasn't able to get a good guess with the trigrams, try checking all 0597 // dictionaries for the suggested languages. 0598 candidateLanguages.append(suggestionsList); 0599 candidateLanguages.removeDuplicates(); 0600 QString identified = d->guessFromDictionaries(text, candidateLanguages); 0601 if (!identified.isEmpty()) { 0602 return identified; 0603 } 0604 0605 qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text; 0606 0607 // None of our methods worked, just return the best suggestion 0608 if (!suggestionsList.isEmpty()) { 0609 return suggestionsList.first(); 0610 } 0611 0612 qCDebug(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text; 0613 0614 // Not even any suggestions, give up 0615 return QString(); 0616 } 0617 0618 void GuessLanguage::setLimits(int maxItems, double minConfidence) 0619 { 0620 d->m_maxItems = maxItems; 0621 d->m_minConfidence = minConfidence; 0622 } 0623 0624 void GuessLanguagePrivate::loadModels() 0625 { 0626 // use trigrams from resource file, easy to deploy on all platforms 0627 const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map"); 0628 qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile; 0629 0630 QFile sin(triMapFile); 0631 if (!sin.open(QIODevice::ReadOnly)) { 0632 qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile; 0633 return; 0634 } 0635 0636 QDataStream in(&sin); 0637 in >> s_knownModels; 0638 0639 // Sanity check 0640 QSet<QString> availableLanguages; 0641 QHashIterator<QString, QHash<QString, int>> iterator(s_knownModels); 0642 while (iterator.hasNext()) { 0643 iterator.next(); 0644 if (iterator.value().count() < MAXGRAMS) { 0645 qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS; 0646 } 0647 availableLanguages.insert(iterator.key()); 0648 } 0649 QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd()); 0650 knownLanguages.subtract(availableLanguages); 0651 if (!knownLanguages.isEmpty()) { 0652 qCDebug(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages; 0653 } 0654 } 0655 0656 QList<QChar::Script> GuessLanguagePrivate::findRuns(const QString &text) 0657 { 0658 QHash<QChar::Script, int> scriptCounts; 0659 0660 int totalCount = 0; 0661 0662 for (const QChar c : text) { 0663 const QChar::Script script = c.script(); 0664 0665 if (script == QChar::Script_Common || script == QChar::Script_Inherited) { 0666 continue; 0667 } 0668 0669 if (!c.isLetter()) { 0670 continue; 0671 } 0672 0673 scriptCounts[script]++; 0674 totalCount++; 0675 } 0676 0677 QList<QChar::Script> relevantScripts; 0678 0679 if (totalCount == 0) { 0680 return relevantScripts; 0681 } 0682 0683 if (scriptCounts.size() == 1) { 0684 return {scriptCounts.cbegin().key()}; 0685 } 0686 0687 for (auto it = scriptCounts.cbegin(); it != scriptCounts.cend(); ++it) { 0688 // return run types that used for 40% or more of the string 0689 const int scriptCount = it.value(); 0690 const auto currentScript = it.key(); 0691 if (scriptCount * 100 / totalCount >= 40) { 0692 relevantScripts << currentScript; 0693 // always return basic latin if found more than 15%. 0694 } else if (currentScript == QChar::Script_Latin && scriptCount * 100 / totalCount >= 15) { 0695 relevantScripts << currentScript; 0696 } 0697 } 0698 0699 return relevantScripts; 0700 } 0701 0702 QStringList GuessLanguagePrivate::identify(const QString &sample, const QList<QChar::Script> &scripts) 0703 { 0704 if (sample.size() < MIN_LENGTH) { 0705 return QStringList(); 0706 } 0707 0708 QStringList guesses; 0709 for (const QChar::Script script : scripts) { 0710 guesses.append(guessFromTrigrams(sample, s_scriptLanguages.values(script))); 0711 } 0712 0713 return guesses; 0714 } 0715 0716 QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages) 0717 { 0718 QStringList ret; 0719 0720 const QList<QString> sampleTrigrams = createOrderedModel(sample); 0721 0722 // Sort by score 0723 QMultiMap<int, QString> scores; 0724 for (const QString &language : languages) { 0725 if (s_knownModels.contains(language)) { 0726 scores.insert(distance(sampleTrigrams, s_knownModels[language]), language); 0727 } 0728 } 0729 0730 // Skip if either no results or best result is completely unknown (distance >= maxdistance) 0731 if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) { 0732 qCDebug(SONNET_LOG_CORE) << "No scores for" << sample; 0733 return ret; 0734 } 0735 0736 int counter = 0; 0737 double confidence = 0; 0738 0739 QMultiMapIterator<int, QString> it(scores); 0740 it.next(); 0741 0742 QString prevItem = it.value(); 0743 int prevScore = it.key(); 0744 0745 while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) { 0746 it.next(); 0747 counter++; 0748 confidence += (it.key() - prevScore) / (double)it.key(); 0749 ret += prevItem; 0750 prevItem = it.value(); 0751 prevScore = it.key(); 0752 } 0753 if (counter < m_maxItems && confidence < m_minConfidence) { 0754 ret += prevItem; 0755 } 0756 0757 return ret; 0758 } 0759 0760 QList<QString> GuessLanguagePrivate::createOrderedModel(const QString &content) 0761 { 0762 QHash<QString, int> trigramCounts; 0763 0764 // collect trigrams 0765 trigramCounts.reserve(content.size() - 2); 0766 for (int i = 0; i < (content.size() - 2); ++i) { 0767 QString tri = content.mid(i, 3).toLower(); 0768 trigramCounts[tri]++; 0769 } 0770 0771 // invert the map <freq, trigram> 0772 QList<QPair<int, QString>> trigramFrequencyList; 0773 trigramFrequencyList.reserve(trigramCounts.size()); 0774 0775 auto it = trigramCounts.constBegin(); 0776 for (; it != trigramCounts.constEnd(); ++it) { 0777 const QChar *data = it.key().constData(); 0778 bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace())); 0779 0780 if (!hasTwoSpaces) { 0781 const int freq = it.value(); 0782 const QString &trigram = it.key(); 0783 trigramFrequencyList.append({freq, trigram}); 0784 } 0785 } 0786 0787 // sort descending by frequency 0788 std::sort(trigramFrequencyList.begin(), trigramFrequencyList.end(), [](const QPair<int, QString> &a, const QPair<int, QString> &b) { 0789 return a.first > b.first; 0790 }); 0791 0792 QList<QString> orderedTrigrams; 0793 orderedTrigrams.reserve(trigramFrequencyList.size()); 0794 for (const auto &tri : std::as_const(trigramFrequencyList)) { 0795 orderedTrigrams.append(tri.second); 0796 } 0797 0798 return orderedTrigrams; 0799 } 0800 0801 int GuessLanguagePrivate::distance(const QList<QString> &model, const QHash<QString, int> &knownModel) 0802 { 0803 int counter = -1; 0804 int dist = 0; 0805 0806 for (const QString &trigram : model) { 0807 const int val = knownModel.value(trigram, -1); 0808 if (val != -1) { 0809 dist += qAbs(++counter - val); 0810 } else { 0811 dist += MAXGRAMS; 0812 } 0813 0814 if (counter == (MAXGRAMS - 1)) { 0815 break; 0816 } 0817 } 0818 0819 return dist; 0820 } 0821 0822 QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates) 0823 { 0824 // Try to see how many languages we can get spell checking for 0825 QList<QSharedPointer<SpellerPlugin>> spellers; 0826 for (const QString &lang : candidates) { 0827 if (!Loader::openLoader()->languages().contains(lang)) { 0828 qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang; 0829 continue; 0830 } 0831 QSharedPointer<SpellerPlugin> plugin = Loader::openLoader()->cachedSpeller(lang); 0832 if (!plugin.isNull()) { 0833 spellers.append(plugin); 0834 } 0835 } 0836 0837 // If there's no spell checkers, give up 0838 if (spellers.isEmpty()) { 0839 return QString(); 0840 } 0841 0842 QMap<QString, int> correctHits; 0843 0844 WordTokenizer tokenizer(sentence); 0845 while (tokenizer.hasNext()) { 0846 Token word = tokenizer.next(); 0847 if (!tokenizer.isSpellcheckable()) { 0848 continue; 0849 } 0850 0851 for (int i = 0; i < spellers.count(); ++i) { 0852 if (spellers[i]->isCorrect(word.toString())) { 0853 correctHits[spellers[i]->language()]++; 0854 } 0855 } 0856 } 0857 0858 if (correctHits.isEmpty()) { 0859 return QString(); 0860 } 0861 0862 QMap<QString, int>::const_iterator max = correctHits.constBegin(); 0863 for (QMap<QString, int>::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) { 0864 if (itr.value() > max.value()) { 0865 max = itr; 0866 } 0867 } 0868 return max.key(); 0869 } 0870 }