src/core/tokenizer_p.h

0001 /*  This file is part of the KDE libraries
0002
0003     SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>
0004
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007
0008 #ifndef ABSTRACTTOKENIZER_H
0009 #define ABSTRACTTOKENIZER_H
0010
0011 #include "sonnetcore_export.h"
0012 #include <QString>
0013
0014 #include <memory>
0015
0016 namespace Sonnet
0017 {
0018 struct Token {
0019     QStringView token = nullptr;
0020     int positionInBuffer = -1;
0021
0022     QString toString() const
0023     {
0024         return token.toString();
0025     }
0026
0027     /**
0028      * @brief length of this token
0029      */
0030     Q_DECL_CONSTEXPR int length() const
0031     {
0032         return token.size();
0033     }
0034
0035     /**
0036      * @brief position in buffer of which the @ref token is a view
0037      */
0038     Q_DECL_CONSTEXPR int position() const
0039     {
0040         return positionInBuffer;
0041     }
0042
0043     Q_DECL_CONSTEXPR bool isNull() const
0044     {
0045         return token.isNull();
0046     }
0047
0048     Q_DECL_CONSTEXPR bool isEmpty() const
0049     {
0050         return token.isEmpty();
0051     }
0052
0053     Q_DECL_CONSTEXPR QChar at(qsizetype n) const
0054     {
0055         return token.at(n);
0056     }
0057 };
0058
0059 /**
0060  * @short AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs.
0061  *
0062  * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled
0063  * after Java-style iterators. During tokenization buffer can be modified using provided replace() method.
0064  *
0065  * @since 4.3
0066  */
0067 class AbstractTokenizer
0068 {
0069 public:
0070     virtual ~AbstractTokenizer()
0071     {
0072     }
0073
0074     /**
0075      * Sets text to tokenize. It also resets tokenizer state.
0076      */
0077     virtual void setBuffer(const QString &buffer = QString()) = 0;
0078     /**
0079      * Returns true if there is another token available.
0080      * @return true if another token is available, false if not.
0081      */
0082     virtual bool hasNext() const = 0;
0083
0084     /**
0085      * Returns next token or null QString if there is none
0086      */
0087     virtual Token next() = 0;
0088
0089     /** Returns content of currently tokenized buffer*/
0090     virtual QString buffer() const = 0;
0091
0092     /**
0093      * Replace part of text in current buffer. Always use this function instead of directly
0094      * changing data in underlying buffer or tokenizer's internal state may become inconsistent.
0095      */
0096     virtual void replace(int position, int len, const QString &newWord) = 0;
0097 };
0098
0099 class BreakTokenizerPrivate;
0100
0101 /**
0102 @short WordTokenizer splits supplied buffer into individual words.
0103
0104 WordTokenizer splits buffer into words according to rules from Unicode standard 5.1.
0105 If purpose is to check spelling, use isSpellcheckable() to determine if current word should be
0106 checked or ignored.
0107
0108 Usage example:
0109
0110 @code
0111 WordTokenizer t(buffer);
0112 Speller sp;
0113 while (t.hasNext()) {
0114     Token word=t.next();
0115     if (!t.isSpellcheckable()) continue;
0116     qDebug() << word.toString() << " " << sp.isCorrect(word.toString());
0117 }
0118 @endcode
0119
0120 This example checks spelling of given buffer
0121  * @since 4.3
0122 */
0123 class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer
0124 {
0125 public:
0126     /**
0127      * Constructor for word tokenizer
0128      * @param buffer
0129      */
0130     WordTokenizer(const QString &buffer = QString());
0131     ~WordTokenizer() override;
0132
0133     void setBuffer(const QString &buffer) override;
0134     bool hasNext() const override;
0135     Token next() override;
0136     QString buffer() const override;
0137     void replace(int position, int len, const QString &newWord) override;
0138
0139     /** Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */
0140     bool isSpellcheckable() const;
0141
0142     /** If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */
0143     void setIgnoreUppercase(bool val);
0144
0145 private:
0146     SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const;
0147
0148 private:
0149     std::unique_ptr<BreakTokenizerPrivate> const d;
0150 };
0151
0152 /**
0153 @short SentenceTokenizer splits supplied buffer into individual sentences.
0154
0155 SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1.
0156  * @since 4.3
0157 */
0158 class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer
0159 {
0160 public:
0161     SentenceTokenizer(const QString &buffer = QString());
0162     ~SentenceTokenizer() override;
0163     void setBuffer(const QString &buffer) override;
0164     bool hasNext() const override;
0165     Token next() override;
0166     QString buffer() const override;
0167     void replace(int position, int len, const QString &newWord) override;
0168
0169 private:
0170     std::unique_ptr<BreakTokenizerPrivate> const d;
0171 };
0172 }
0173 #endif