File indexing completed on 2024-04-21 04:00:55

0001 /*  This file is part of the KDE libraries
0002 
0003     SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>
0004 
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007 
0008 #ifndef ABSTRACTTOKENIZER_H
0009 #define ABSTRACTTOKENIZER_H
0010 
0011 #include "sonnetcore_export.h"
0012 #include <QString>
0013 
0014 #include <memory>
0015 
0016 namespace Sonnet
0017 {
0018 struct Token {
0019     QStringView token = nullptr;
0020     int positionInBuffer = -1;
0021 
0022     QString toString() const
0023     {
0024         return token.toString();
0025     }
0026 
0027     /**
0028      * @brief length of this token
0029      */
0030     Q_DECL_CONSTEXPR int length() const
0031     {
0032         return token.size();
0033     }
0034 
0035     /**
0036      * @brief position in buffer of which the @ref token is a view
0037      */
0038     Q_DECL_CONSTEXPR int position() const
0039     {
0040         return positionInBuffer;
0041     }
0042 
0043     Q_DECL_CONSTEXPR bool isNull() const
0044     {
0045         return token.isNull();
0046     }
0047 
0048     Q_DECL_CONSTEXPR bool isEmpty() const
0049     {
0050         return token.isEmpty();
0051     }
0052 
0053     Q_DECL_CONSTEXPR QChar at(qsizetype n) const
0054     {
0055         return token.at(n);
0056     }
0057 };
0058 
0059 /**
0060  * @short AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs.
0061  *
0062  * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled
0063  * after Java-style iterators. During tokenization buffer can be modified using provided replace() method.
0064  *
0065  * @since 4.3
0066  */
0067 class AbstractTokenizer
0068 {
0069 public:
0070     virtual ~AbstractTokenizer()
0071     {
0072     }
0073 
0074     /**
0075      * Sets text to tokenize. It also resets tokenizer state.
0076      */
0077     virtual void setBuffer(const QString &buffer = QString()) = 0;
0078     /**
0079      * Returns true if there is another token available.
0080      * @return true if another token is available, false if not.
0081      */
0082     virtual bool hasNext() const = 0;
0083 
0084     /**
0085      * Returns next token or null QString if there is none
0086      */
0087     virtual Token next() = 0;
0088 
0089     /** Returns content of currently tokenized buffer*/
0090     virtual QString buffer() const = 0;
0091 
0092     /**
0093      * Replace part of text in current buffer. Always use this function instead of directly
0094      * changing data in underlying buffer or tokenizer's internal state may become inconsistent.
0095      */
0096     virtual void replace(int position, int len, const QString &newWord) = 0;
0097 };
0098 
0099 class BreakTokenizerPrivate;
0100 
0101 /**
0102 @short WordTokenizer splits supplied buffer into individual words.
0103 
0104 WordTokenizer splits buffer into words according to rules from Unicode standard 5.1.
0105 If purpose is to check spelling, use isSpellcheckable() to determine if current word should be
0106 checked or ignored.
0107 
0108 Usage example:
0109 
0110 @code
0111 WordTokenizer t(buffer);
0112 Speller sp;
0113 while (t.hasNext()) {
0114     Token word=t.next();
0115     if (!t.isSpellcheckable()) continue;
0116     qDebug() << word.toString() << " " << sp.isCorrect(word.toString());
0117 }
0118 @endcode
0119 
0120 This example checks spelling of given buffer
0121  * @since 4.3
0122 */
0123 class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer
0124 {
0125 public:
0126     /**
0127      * Constructor for word tokenizer
0128      * @param buffer
0129      */
0130     WordTokenizer(const QString &buffer = QString());
0131     ~WordTokenizer() override;
0132 
0133     void setBuffer(const QString &buffer) override;
0134     bool hasNext() const override;
0135     Token next() override;
0136     QString buffer() const override;
0137     void replace(int position, int len, const QString &newWord) override;
0138 
0139     /** Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */
0140     bool isSpellcheckable() const;
0141 
0142     /** If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */
0143     void setIgnoreUppercase(bool val);
0144 
0145 private:
0146     SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const;
0147 
0148 private:
0149     std::unique_ptr<BreakTokenizerPrivate> const d;
0150 };
0151 
0152 /**
0153 @short SentenceTokenizer splits supplied buffer into individual sentences.
0154 
0155 SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1.
0156  * @since 4.3
0157 */
0158 class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer
0159 {
0160 public:
0161     SentenceTokenizer(const QString &buffer = QString());
0162     ~SentenceTokenizer() override;
0163     void setBuffer(const QString &buffer) override;
0164     bool hasNext() const override;
0165     Token next() override;
0166     QString buffer() const override;
0167     void replace(int position, int len, const QString &newWord) override;
0168 
0169 private:
0170     std::unique_ptr<BreakTokenizerPrivate> const d;
0171 };
0172 }
0173 #endif