File indexing completed on 2024-04-28 07:50:09

0001 /*  This file is part of the KDE libraries
0002     SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
0003 
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #ifndef GUESSLANGUAGE_H
0008 #define GUESSLANGUAGE_H
0009 
0010 #include <QString>
0011 #include <QStringList>
0012 
0013 #include "sonnetcore_export.h"
0014 
0015 #include <memory>
0016 
0017 namespace Sonnet
0018 {
0019 // Number of trigrams in each file. NOTE(review): presumably the per-language trigram model files read by the implementation - confirm.
0020 static const int MAXGRAMS = 300; // namespace-scope static const: every translation unit including this header gets its own copy
0021 // Forward declaration only: the type is not defined in this header; GuessLanguage holds one through its private member d.
0022 class GuessLanguagePrivate;
0023 
0024 /**
0025  * @class Sonnet::GuessLanguage guesslanguage.h <Sonnet/GuessLanguage>
0026  *
0027  * @short GuessLanguage determines the language of a given text.
0028  *
0029  * GuessLanguage can distinguish between ~75 languages for a given string. It is
0030  * based on a Perl script called Languid, originally written by Maciej Ceglowski
0031  * <maciej@ceglowski.com>. His script used a two-part heuristic to determine the language. First the text
0032  * is checked for the scripts it contains; then, for each set of languages using those
0033  * scripts, an n-gram frequency model of each language is compared to a model of the text.
0034  * The most similar language model is assumed to be the language. If no language is found,
0035  * an empty string is returned.
0036  *
0037  * Instances cannot be copied: copy construction and copy assignment are deleted.
0038  * @author Jacob Rideout <kde@jacobrideout.net>
0039  * @since 4.3
0040  */
0041 class SONNETCORE_EXPORT GuessLanguage
0042 {
0043 public:
0044     /** Constructor
0045      * Creates a new GuessLanguage instance. The constructor takes no arguments;
0046      * the text to be checked is passed to identify() instead.
0047      * @see identify()
0048      */
0049     GuessLanguage();
0050 
0051     /** Destructor. Only declared here: destroying the std::unique_ptr member d needs the complete GuessLanguagePrivate type, which this header does not define.
0052      */
0053     ~GuessLanguage();
0054 
0055     GuessLanguage(const GuessLanguage &) = delete; // not copy-constructible
0056     GuessLanguage &operator=(const GuessLanguage &) = delete; // not copy-assignable
0057 
0058     /**
0059      * Sets limits on the number of languages returned by identify(). The confidence for each language is computed
0060      * as the difference between this and the next language on the list, normalized to the 0-1 range. A reasonable value
0061      * for a fairly sure result is 0.1. By default the best guess is returned without regard to confidence, exactly
0062      * as after a call to setLimits(1, 0).
0063      * @param maxItems the list of candidate languages will never have more than @p maxItems items
0064      * @param minConfidence the list will have only as many items as needed for their combined confidence to equal
0065      * or exceed @p minConfidence. NOTE(review): identify() is declared to return a single QString, not a list - confirm how these limits apply.
0066      */
0067     void setLimits(int maxItems, double minConfidence);
0068 
0069     /**
0070      * Returns the two-letter ISO 639-1 code for the language of @p text.
0071      * A three-letter code is returned only in the case where a two-letter
0072      * code does not exist.
0073      * @param text the text to be identified
0074      * @param suggestions optional list, empty by default. NOTE(review): presumably candidate language codes; how it influences the result is not visible in this header - confirm.
0075      * @return the code of the presumed language; an empty string means the language could not be determined. NOTE(review): earlier wording described a list limited by setLimits(), but the declared return type is QString - confirm.
0076      */
0077     QString identify(const QString &text, const QStringList &suggestions = QStringList()) const;
0078 
0079 private:
0080     std::unique_ptr<GuessLanguagePrivate> const d; // sole owner of the GuessLanguagePrivate object; const, so the pointer itself is never reseated
0081 };
0082 }
0083 
0084 #endif