src/core/tokenizer_p.h

0001 /*  This file is part of the KDE libraries
0002
0003     SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>
0004
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007
0008 #ifndef ABSTRACTTOKENIZER_H
0009 #define ABSTRACTTOKENIZER_H
0010
0011 #include "sonnetcore_export.h"
0012 #include <QString>
0013
0014 namespace Sonnet
0015 {
0016 struct Token {
0017     QStringView token = nullptr;
0018     int positionInBuffer = -1;
0019
0020     QString toString() const
0021     {
0022         return token.toString();
0023     }
0024
0025     /**
0026      * @brief length of this token
0027      */
0028     Q_DECL_CONSTEXPR int length() const
0029     {
0030         return token.size();
0031     }
0032
0033     /**
0034      * @brief position in buffer of which the @ref token is a view
0035      */
0036     Q_DECL_CONSTEXPR int position() const
0037     {
0038         return positionInBuffer;
0039     }
0040
0041     Q_DECL_CONSTEXPR bool isNull() const
0042     {
0043         return token.isNull();
0044     }
0045
0046     Q_DECL_CONSTEXPR bool isEmpty() const
0047     {
0048         return token.isEmpty();
0049     }
0050
0051     Q_DECL_CONSTEXPR QChar at(qsizetype n) const
0052     {
0053         return token.at(n);
0054     }
0055 };
0056
0057 /**
0058  * @short AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs.
0059  *
0060  * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled
0061  * after Java-style iterators. During tokenization buffer can be modified using provided replace() method.
0062  *
0063  * @since 4.3
0064  */
0065 class AbstractTokenizer
0066 {
0067 public:
0068     virtual ~AbstractTokenizer()
0069     {
0070     }
0071
0072     /**
0073      * Sets text to tokenize. It also resets tokenizer state.
0074      */
0075     virtual void setBuffer(const QString &buffer = QString()) = 0;
0076     /**
0077      * Returns true if there is another token available.
0078      * @return true if another token is available, false if not.
0079      */
0080     virtual bool hasNext() const = 0;
0081
0082     /**
0083      * Returns next token or null QString if there is none
0084      */
0085     virtual Token next() = 0;
0086
0087     /** Returns content of currently tokenized buffer*/
0088     virtual QString buffer() const = 0;
0089
0090     /**
0091      * Replace part of text in current buffer. Always use this function instead of directly
0092      * changing data in underlying buffer or tokenizer's internal state may become inconsistent.
0093      */
0094     virtual void replace(int position, int len, const QString &newWord) = 0;
0095 };
0096
0097 class BreakTokenizerPrivate;
0098
0099 /**
0100 @short WordTokenizer splits supplied buffer into individual words.
0101
0102 WordTokenizer splits buffer into words according to rules from Unicode standard 5.1.
0103 If purpose is to check spelling, use isSpellcheckable() to determine if current word should be
0104 checked or ignored.
0105
0106 Usage example:
0107
0108 @code
0109 WordTokenizer t(buffer);
0110 Speller sp;
0111 while (t.hasNext()) {
0112     Token word=t.next();
0113     if (!t.isSpellcheckable()) continue;
0114     qDebug() << word.toString() << " " << sp.isCorrect(word.toString());
0115 }
0116 @endcode
0117
0118 This example checks spelling of given buffer
0119  * @since 4.3
0120 */
0121 class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer
0122 {
0123 public:
0124     /**
0125      * Constructor for word tokenizer
0126      * @param buffer
0127      */
0128     WordTokenizer(const QString &buffer = QString());
0129     ~WordTokenizer() override;
0130
0131     void setBuffer(const QString &buffer) override;
0132     bool hasNext() const override;
0133     Token next() override;
0134     QString buffer() const override;
0135     void replace(int position, int len, const QString &newWord) override;
0136
0137     /** Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */
0138     bool isSpellcheckable() const;
0139
0140     /** If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */
0141     void setIgnoreUppercase(bool val);
0142
0143 private:
0144     SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const;
0145     BreakTokenizerPrivate *const d;
0146 };
0147
0148 /**
0149 @short SentenceTokenizer splits supplied buffer into individual sentences.
0150
0151 SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1.
0152  * @since 4.3
0153 */
0154 class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer
0155 {
0156 public:
0157     SentenceTokenizer(const QString &buffer = QString());
0158     ~SentenceTokenizer() override;
0159     void setBuffer(const QString &buffer) override;
0160     bool hasNext() const override;
0161     Token next() override;
0162     QString buffer() const override;
0163     void replace(int position, int len, const QString &newWord) override;
0164
0165 private:
0166     BreakTokenizerPrivate *const d;
0167 };
0168 }
0169 #endif