File indexing completed on 2024-05-05 05:50:42

0001 /*
0002     SPDX-FileCopyrightText: 2020-2021 Klarälvdalens Datakonsult AB a KDAB Group company, info@kdab.com, author Marc Mutz <marc.mutz@kdab.com>
0003 
0004     This file is part of KDToolBox (https://github.com/KDAB/KDToolBox).
0005 
0006     SPDX-License-Identifier: MIT
0007 */
0008 
0009 #include "qstringtokenizer.h"
0010 #include "qstringalgorithms.h"
0011 
0012 /*!
0013     \class QStringTokenizer
0014     \brief The QStringTokenizer class splits strings into tokens along given separators
0015     \reentrant
0016 
0017     Splits a string into substrings wherever a given separator occurs,
0018     and returns a (lazy) list of those strings. If the separator does
0019     not match anywhere in the string, produces a single-element
0020     sequence containing this string.  If the separator is empty,
0021     QStringTokenizer produces an empty string, followed by each of the
0022     string's characters, followed by another empty string. The two
0023     enumerations Qt::SplitBehavior and Qt::CaseSensitivity further
0024     control the output.
0025 
0026     QStringTokenizer drives QStringView::tokenize(), but, at least with a
0027     recent compiler, you can use it directly, too:
0028 
0029     \code
0030     for (auto token : QStringTokenizer{string, separator})
0031         use(token);
0032     \endcode
0033 
0034     \note You should never, ever, name the template arguments of a
0035     QStringTokenizer explicitly.  If you can use C++17 Class Template
0036     Argument Deduction (CTAD), you may write
0037     \c{QStringTokenizer{string, separator}} (without template
0038     arguments).  If you can't use C++17 CTAD, you must use the
0039     QStringView::split() or QLatin1String::split() member functions
0040     and store the return value only in \c{auto} variables:
0041 
0042     \code
0043     auto result = string.split(sep);
0044     \endcode
0045 
0046     This is because the template arguments of QStringTokenizer have a
0047     very subtle dependency on the specific string and separator types
0048     with which they are constructed, and they don't usually
0049     correspond to the actual types passed.
0050 
0051     \section Lazy Sequences
0052 
0053     QStringTokenizer acts as a so-called lazy sequence, that is, each
0054     next element is only computed once you ask for it. Lazy sequences
0055     have the advantage that they only require O(1) memory. They have
0056     the disadvantage that, at least for QStringTokenizer, they only
0057     allow forward, not random-access, iteration.
0058 
0059     The intended use-case is that you just plug it into a ranged for loop:
0060 
0061     \code
0062     for (auto token : QStringTokenizer{string, separator})
0063         use(token);
0064     \endcode
0065 
0066     or a C++20 ranged algorithm:
0067 
0068     \code
0069     std::ranges::for_each(QStringTokenizer{string, separator},
0070                           [] (auto token) { use(token); });
0071     \endcode
0072 
0073     \section End Sentinel
0074 
0075     The QStringTokenizer iterators cannot be used with classical STL
0076     algorithms, because those require iterator/iterator pairs, while
0077     QStringTokenizer uses sentinels, that is, it uses a different
0078     type, QStringTokenizer::sentinel, to mark the end of the
0079     range. This improves performance, because the sentinel is an empty
0080     type. Sentinels are supported from C++17 (for ranged for)
0081     and C++20 (for algorithms using the new ranges library).
0082 
0083     QStringTokenizer falls back to a non-sentinel end iterator
0084     implementation if the compiler doesn't support separate types for
0085     begin and end iterators in ranged for loops
0086     (\link{https://wg21.link/P0184}{P0184}), in which case traditional
0087     STL algorithms will \em appear to be supported, but as you migrate
0088     to a compiler that supports P0184, such code will break.  We
0089     recommend using only the C++20 \c{std::ranges} algorithms, or, if
0090     you're stuck on C++14/17 for the time being,
0091     \link{https://github.com/ericniebler/range-v3}{Eric Niebler's
0092     Ranges v3 library}, which has the same semantics as the C++20
0093     \c{std::ranges} library.
0094 
0095     \section Temporaries
0096 
0097     QStringTokenizer is very carefully designed to avoid dangling
0098     references. If you construct a tokenizer from a temporary string
0099     (an rvalue), that argument is stored internally, so the referenced
0100     data isn't deleted before it is tokenized:
0101 
0102     \code
0103     auto tok = QStringTokenizer{widget.text(), u','};
0104     // return value of `widget.text()` is destroyed, but content was moved into `tok`
0105     for (auto e : tok)
0106        use(e);
0107     \endcode
0108 
0109     If you pass named objects (lvalues), then QStringTokenizer does
0110     not store a copy. You are responsible for keeping the named object's
0111     data around for longer than the tokenizer operates on it:
0112 
0113     \code
0114     auto text = widget.text();
0115     auto tok = QStringTokenizer{text, u','};
0116     text.clear();      // destroy content of `text`
0117     for (auto e : tok) // ERROR: `tok` references deleted data!
0118         use(e);
0119     \endcode
0120 
0121     \sa QStringView::split(), QLatin1String::split(), Qt::SplitBehavior, Qt::CaseSensitivity
0122 */
0123 
0124 /*!
0125     \typedef QStringTokenizer::value_type
0126 
0127     Alias for \c{const QStringView} or \c{const QLatin1String},
0128     depending on the tokenizer's \c Haystack template argument.
0129 */
0130 
0131 /*!
0132     \typedef QStringTokenizer::difference_type
0133 
0134     Alias for qsizetype.
0135 */
0136 
0137 /*!
0138     \typedef QStringTokenizer::size_type
0139 
0140     Alias for qsizetype.
0141 */
0142 
0143 /*!
0144     \typedef QStringTokenizer::reference
0145 
0146     Alias for \c{value_type &}.
0147 
0148     QStringTokenizer does not support mutable references, so this is
0149     the same as const_reference.
0150 */
0151 
0152 /*!
0153     \typedef QStringTokenizer::const_reference
0154 
0155     Alias for \c{value_type &}.
0156 */
0157 
0158 /*!
0159     \typedef QStringTokenizer::pointer
0160 
0161     Alias for \c{value_type *}.
0162 
0163     QStringTokenizer does not support mutable iterators, so this is
0164     the same as const_pointer.
0165 */
0166 
0167 /*!
0168     \typedef QStringTokenizer::const_pointer
0169 
0170     Alias for \c{value_type *}.
0171 */
0172 
0173 /*!
0174     \typedef QStringTokenizer::iterator
0175 
0176     This typedef provides an STL-style const iterator for
0177     QStringTokenizer.
0178 
0179     QStringTokenizer does not support mutable iterators, so this is
0180     the same as const_iterator.
0181 
0182     \sa const_iterator
0183 */
0184 
0185 /*!
0186     \typedef QStringTokenizer::const_iterator
0187 
0188     This typedef provides an STL-style const iterator for
0189     QStringTokenizer.
0190 
0191     \sa iterator
0192 */
0193 
0194 /*!
0195     \typedef QStringTokenizer::sentinel
0196 
0197     This typedef provides an STL-style sentinel for
0198     QStringTokenizer::iterator and QStringTokenizer::const_iterator.
0199 
0200     \sa const_iterator
0201 */
0202 
0203 /*!
0204     \fn QStringTokenizer(Haystack haystack, String needle, Qt::CaseSensitivity cs, Qt::SplitBehavior sb)
0205     \fn QStringTokenizer(Haystack haystack, String needle, Qt::SplitBehavior sb, Qt::CaseSensitivity cs)
0206 
0207     Constructs a string tokenizer that splits the string \a haystack
0208     into substrings wherever \a needle occurs, and allows iteration
0209     over those strings as they are found. If \a needle does not match
0210     anywhere in \a haystack, a single element containing \a haystack
0211     is produced.
0212 
0213     \a cs specifies whether \a needle should be matched case
0214     sensitively or case insensitively.
0215 
0216     If \a sb is Qt::SkipEmptyParts, empty entries don't
0217     appear in the result. By default, empty entries are included.
0218 
0219     \sa QStringView::split(), QLatin1String::split(), Qt::CaseSensitivity, Qt::SplitBehavior
0220 */
0221 
0222 /*!
0223     \fn QStringTokenizer::const_iterator QStringTokenizer::begin() const
0224 
0225     Returns a const \l{STL-style iterators}{STL-style iterator}
0226     pointing to the first token in the list.
0227 
0228     \sa end(), cbegin()
0229 */
0230 
0231 /*!
0232     \fn QStringTokenizer::const_iterator QStringTokenizer::cbegin() const
0233 
0234     Same as begin().
0235 
0236     \sa cend(), begin()
0237 */
0238 
0239 /*!
0240     \fn QStringTokenizer::sentinel QStringTokenizer::end() const
0241 
0242     Returns a const \l{STL-style iterators}{STL-style sentinel}
0243     pointing to the imaginary token after the last token in the list.
0244 
0245     \sa begin(), cend()
0246 */
0247 
0248 /*!
0249     \fn QStringTokenizer::sentinel QStringTokenizer::cend() const
0250 
0251     Same as end().
0252 
0253     \sa cbegin(), end()
0254 */
0255 
0256 /*!
0257     \fn QStringTokenizer::toContainer(Container &&c) const &
0258 
0259     Convenience method to convert the lazy sequence into a
0260     (typically) random-access container.
0261 
0262     This function is only available if \c Container has a \c value_type
0263     matching this tokenizer's value_type.
0264 
0265     If you pass in a named container (an lvalue), then that container
0266     is filled, and a reference to it is returned.
0267 
0268     If you pass in a temporary container (an rvalue, incl. the default
0269     argument), then that container is filled, and returned by value.
0270 
0271     \code
0272     // assuming tok's value_type is QStringView, then...
0273     auto tok = QStringTokenizer{~~~};
0274     // ... rac1 is a QVector:
0275     auto rac1 = tok.toContainer();
0276     // ... rac2 is std::pmr::vector<QStringView>:
0277     auto rac2 = tok.toContainer<std::pmr::vector<QStringView>>();
0278     auto rac3 = QVarLengthArray<QStringView, 12>{};
0279     // appends the token sequence produced by tok to rac3
0280     //  and returns a reference to rac3 (which we ignore here):
0281     tok.toContainer(rac3);
0282     \endcode
0283 
0284     This gives you maximum flexibility in how you want the sequence to
0285     be stored.
0286 */
0287 
0288 /*!
0289     \fn QStringTokenizer::toContainer(Container &&c) const &&
0290     \overload
0291 
0292     In addition to the constraints on the lvalue-this overload, this
0293     rvalue-this overload is only available when this QStringTokenizer
0294     does not store the haystack internally, as this could create a
0295     container full of dangling references:
0296 
0297     \code
0298     auto tokens = QStringTokenizer{widget.text(), u','}.toContainer();
0299     // ERROR: cannot call toContainer() on rvalue
0300     // 'tokens' references the data of the copy of widget.text()
0301     // stored inside the QStringTokenizer, which has since been deleted
0302     \endcode
0303 
0304     To fix, store the QStringTokenizer in a named variable:
0305 
0306     \code
0307     auto tokenizer = QStringTokenizer{widget.text(), u','};
0308     auto tokens = tokenizer.toContainer();
0309     // OK: the copy of widget.text() stored in 'tokenizer' keeps the data
0310     // referenced by 'tokens' alive.
0311     \endcode
0312 
0313     You can force this function into existence by passing a view instead:
0314 
0315     \code
0316     func(QStringTokenizer{QStringView{widget.text()}, u','}.toContainer());
0317     // OK: compiler keeps widget.text() around until after func() has executed
0318     \endcode
0319 */
0320 
0321 /*!
0322     \fn qTokenize(Haystack &&haystack, Needle &&needle, Flags...flags)
0323     \relates QStringTokenizer
0324 
0325     Factory function for QStringTokenizer. You can use this function
0326     if your compiler doesn't, yet, support C++17 Class Template
0327     Argument Deduction (CTAD), but we recommend direct use of
0328     QStringTokenizer with CTAD instead.
0329 */