// File indexing completed on 2024-03-24 04:03:40
0001 /* This file is part of the KDE libraries 0002 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #ifndef GUESSLANGUAGE_H 0008 #define GUESSLANGUAGE_H 0009 0010 #include <QString> 0011 #include <QStringList> 0012 0013 #include "sonnetcore_export.h" 0014 0015 #include <memory> 0016 0017 namespace Sonnet 0018 { 0019 // Amount of trigrams in each file 0020 static const int MAXGRAMS = 300; 0021 0022 class GuessLanguagePrivate; 0023 0024 /** 0025 * @class Sonnet::GuessLanguage guesslanguage.h <Sonnet/GuessLanguage> 0026 * 0027 * @short GuessLanguage determines the language of a given text. 0028 * 0029 * GuessLanguage can determine the difference between ~75 languages for a given string. It is 0030 * based off a Perl script originally written by Maciej Ceglowski <maciej@ceglowski.com> 0031 * called Languid. His script used a 2 part heuristic to determine language. First the text 0032 * is checked for the scripts it contains, then for each set of languages using those 0033 * scripts a n-gram frequency model of a given language is compared to a model of the text. 0034 * The most similar language model is assumed to be the language. If no language is found 0035 * an empty string is returned. 0036 * 0037 * 0038 * @author Jacob Rideout <kde@jacobrideout.net> 0039 * @since 4.3 0040 */ 0041 class SONNETCORE_EXPORT GuessLanguage 0042 { 0043 public: 0044 /** Constructor 0045 * Creates a new GuessLanguage instance. If @p text is specified, 0046 * it sets the text to be checked. 0047 * @param text the text that is to be checked 0048 */ 0049 GuessLanguage(); 0050 0051 /** Destructor 0052 */ 0053 ~GuessLanguage(); 0054 0055 GuessLanguage(const GuessLanguage &) = delete; 0056 GuessLanguage &operator=(const GuessLanguage &) = delete; 0057 0058 /** 0059 * Sets limits to number of languages returned by identify(). 
The confidence for each language is computed 0060 * as difference between this and next language on the list normalized to 0-1 range. Reasonable value to get 0061 * fairly sure result is 0.1 . Default is returning best guess without caring about confidence - exactly 0062 * as after call to setLimits(1,0). 0063 * @param maxItems The list returned by identify() will never have more than maxItems item 0064 * @param minConfidence The list will have only enough items for their summary confidence equal 0065 * or exceed minConfidence. 0066 */ 0067 void setLimits(int maxItems, double minConfidence); 0068 0069 /** 0070 * Returns the 2 digit ISO 639-1 code for the language of the currently 0071 * set text and. Three digits are returned only in the case where a 2 digit 0072 * code does not exist. If @p text isn't empty, set the text to checked. 0073 * @param text to be identified 0074 * @return list of the presumed languages of the text, sorted by decreasing confidence. Empty list means 0075 * it is impossible to determine language with confidence required by setLimits 0076 */ 0077 QString identify(const QString &text, const QStringList &suggestions = QStringList()) const; 0078 0079 private: 0080 std::unique_ptr<GuessLanguagePrivate> const d; 0081 }; 0082 } 0083 0084 #endif