File indexing completed on 2024-04-28 11:48:59

0001 /*  This file is part of the KDE libraries
0002     SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
0003 
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #ifndef GUESSLANGUAGE_H
0008 #define GUESSLANGUAGE_H
0009 
0010 #include <QString>
0011 #include <QStringList>
0012 
0013 #include "sonnetcore_export.h"
0014 
0015 namespace Sonnet
0016 {
0017 // Number of trigrams in each file
0018 static const int MAXGRAMS = 300;
0019 
0020 class GuessLanguagePrivate; // forward declaration; only referenced through the d pointer below
0021 
0022 /**
0023  * @class Sonnet::GuessLanguage guesslanguage.h <Sonnet/GuessLanguage>
0024  *
0025  * @short GuessLanguage determines the language of a given text.
0026  *
0027  * GuessLanguage can distinguish between ~75 languages for a given string. It is
0028  * based on a Perl script originally written by Maciej Ceglowski <maciej@ceglowski.com>
0029  * called Languid. His script used a two-part heuristic to determine the language. First the text
0030  * is checked for the scripts it contains, then for each set of languages using those
0031  * scripts an n-gram frequency model of a given language is compared to a model of the text.
0032  * The most similar language model is assumed to be the language. If no language is found
0033  * an empty string is returned.
0034  *
0035  *
0036  * @author Jacob Rideout <kde@jacobrideout.net>
0037  * @since 4.3
0038  */
0039 class SONNETCORE_EXPORT GuessLanguage
0040 {
0041 public:
0042     /** Constructor
0043      * Creates a new GuessLanguage instance. The constructor takes no
0044      * arguments; the text to be checked is passed to identify() instead.
0045      * Until setLimits() is called the documented default corresponds to setLimits(1,0).
0046      */
0047     GuessLanguage();
0048 
0049     /** Destructor
0050      */
0051     ~GuessLanguage();
0052 
0053     GuessLanguage(const GuessLanguage &) = delete; // instances cannot be copy-constructed
0054     GuessLanguage &operator=(const GuessLanguage &) = delete; // instances cannot be copy-assigned
0055 
0056     /**
0057      * Sets the limits (maximum item count and minimum confidence) used by identify(). The confidence for each
0058      * language is computed as the difference between this and the next language on the list, normalized to the
0059      * 0-1 range. A reasonable value to get a fairly sure result is 0.1. The default is to return the best guess
0060      * without caring about confidence - exactly as after a call to setLimits(1,0).
0061      * @param maxItems The list of languages will never have more than maxItems items
0062      * @param minConfidence The list will have only enough items for their summed confidence to equal or exceed
0063      * minConfidence. NOTE(review): identify() returns a single QString here - confirm how the list is used.
0064      */
0065     void setLimits(int maxItems, double minConfidence);
0066 
0067     /**
0068      * Identifies the language of @p text and returns its 2 letter ISO 639-1 code. A 3 letter
0069      * code is returned only in the case where a 2 letter code does not exist.
0070      * @param text the text to be identified
0071      * @param suggestions optional language hints, empty by default (TODO confirm exact use in the implementation)
0072      * @return code of the presumed language; an empty string if no language could be determined
0073      * with the confidence required by setLimits()
0074      */
0075     QString identify(const QString &text, const QStringList &suggestions = QStringList()) const;
0076 
0077 private:
0078     GuessLanguagePrivate *const d; // const pointer to the private data object
0079 };
0080 }
0081 
0082 #endif