File indexing completed on 2024-04-28 11:48:59
0001 /* This file is part of the KDE libraries 0002 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #ifndef GUESSLANGUAGE_H 0008 #define GUESSLANGUAGE_H 0009 0010 #include <QString> 0011 #include <QStringList> 0012 0013 #include "sonnetcore_export.h" 0014 0015 namespace Sonnet 0016 { 0017 // Amount of trigrams in each file 0018 static const int MAXGRAMS = 300; 0019 0020 class GuessLanguagePrivate; 0021 0022 /** 0023 * @class Sonnet::GuessLanguage guesslanguage.h <Sonnet/GuessLanguage> 0024 * 0025 * @short GuessLanguage determines the language of a given text. 0026 * 0027 * GuessLanguage can determine the difference between ~75 languages for a given string. It is 0028 * based off a Perl script originally written by Maciej Ceglowski <maciej@ceglowski.com> 0029 * called Languid. His script used a 2 part heuristic to determine language. First the text 0030 * is checked for the scripts it contains, then for each set of languages using those 0031 * scripts a n-gram frequency model of a given language is compared to a model of the text. 0032 * The most similar language model is assumed to be the language. If no language is found 0033 * an empty string is returned. 0034 * 0035 * 0036 * @author Jacob Rideout <kde@jacobrideout.net> 0037 * @since 4.3 0038 */ 0039 class SONNETCORE_EXPORT GuessLanguage 0040 { 0041 public: 0042 /** Constructor 0043 * Creates a new GuessLanguage instance. If @p text is specified, 0044 * it sets the text to be checked. 0045 * @param text the text that is to be checked 0046 */ 0047 GuessLanguage(); 0048 0049 /** Destructor 0050 */ 0051 ~GuessLanguage(); 0052 0053 GuessLanguage(const GuessLanguage &) = delete; 0054 GuessLanguage &operator=(const GuessLanguage &) = delete; 0055 0056 /** 0057 * Sets limits to number of languages returned by identify(). The confidence for each language is computed 0058 * as difference between this and next language on the list normalized to 0-1 range. Reasonable value to get 0059 * fairly sure result is 0.1 . Default is returning best guess without caring about confidence - exactly 0060 * as after call to setLimits(1,0). 0061 * @param maxItems The list returned by identify() will never have more than maxItems item 0062 * @param minConfidence The list will have only enough items for their summary confidence equal 0063 * or exceed minConfidence. 0064 */ 0065 void setLimits(int maxItems, double minConfidence); 0066 0067 /** 0068 * Returns the 2 digit ISO 639-1 code for the language of the currently 0069 * set text and. Three digits are returned only in the case where a 2 digit 0070 * code does not exist. If @p text isn't empty, set the text to checked. 0071 * @param text to be identified 0072 * @return list of the presumed languages of the text, sorted by decreasing confidence. Empty list means 0073 * it is impossible to determine language with confidence required by setLimits 0074 */ 0075 QString identify(const QString &text, const QStringList &suggestions = QStringList()) const; 0076 0077 private: 0078 GuessLanguagePrivate *const d; 0079 }; 0080 } 0081 0082 #endif