File indexing completed on 2024-05-12 15:43:36

0001 /*
0002  *  This file is part of the KDE libraries
0003  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
0004  *  Copyright (C) 2004 Apple Computer, Inc.
0005  *
0006  *  This library is free software; you can redistribute it and/or
0007  *  modify it under the terms of the GNU Library General Public
0008  *  License as published by the Free Software Foundation; either
0009  *  version 2 of the License, or (at your option) any later version.
0010  *
0011  *  This library is distributed in the hope that it will be useful,
0012  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
0013  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0014  *  Library General Public License for more details.
0015  *
0016  *  You should have received a copy of the GNU Library General Public License
0017  *  along with this library; see the file COPYING.LIB.  If not, write to
0018  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0019  *  Boston, MA 02110-1301, USA.
0020  *
0021  */
0022 
0023 #ifndef _KJS_USTRING_H_
0024 #define _KJS_USTRING_H_
0025 
0026 #include "kjs/global.h"
0027 
0028 #include <wtf/AlwaysInline.h>
0029 #include <wtf/FastMalloc.h>
0030 #include <wtf/RefPtr.h>
0031 #include <wtf/PassRefPtr.h>
0032 #include <wtf/Vector.h>
0033 
0034 #include <assert.h>
0035 #include "collector.h"
0036 #if HAVE_STDINT_H
0037 #include <stdint.h>
0038 #endif
0039 
0040 /* On some ARM platforms GCC won't pack structures by default so sizeof(UChar)
0041    will end up being != 2 which causes crashes since the code depends on that. */
0042 #if defined(WTF_COMPILER_GCC) && PLATFORM(FORCE_PACK)
0043 #define PACK_STRUCT __attribute__((packed))
0044 #else
0045 #define PACK_STRUCT
0046 #endif
0047 
0048 /**
0049  * @internal
0050  */
0051 namespace DOM
0052 {
0053 class DOMString;
0054 }
0055 class QString;
0056 class QConstString;
0057 
0058 namespace KJS
0059 {
0060 
0061 class UString;
0062 
0063 /**
0064  * @short Unicode character.
0065  *
0066  * UChar represents a 16 bit Unicode character. Its internal data
0067  * representation is compatible to XChar2b and QChar. It's therefore
0068  * possible to exchange data with X and Qt with shallow copies.
0069  */
0070 struct KJS_EXPORT UChar {
0071     /**
0072      * Construct a character with uninitialized value.
0073      */
0074     UChar();
0075     /**
0076      * Construct a character with the value denoted by the arguments.
0077      * @param h higher byte
0078      * @param l lower byte
0079      */
0080     UChar(unsigned char h, unsigned char l);
0081     /**
0082      * Construct a character with the given value.
0083      * @param u 16 bit Unicode value
0084      */
0085     UChar(char u);
0086     UChar(unsigned char u);
0087     UChar(unsigned short u);
0088     /**
0089      * @return The higher byte of the character.
0090      */
0091     unsigned char high() const
0092     {
0093         return static_cast<unsigned char>(uc >> 8);
0094     }
0095 
0096     /**
0097      * @return The lower byte of the character.
0098      */
0099     unsigned char low() const
0100     {
0101         return static_cast<unsigned char>(uc);
0102     }
0103 
0104     /**
0105      * @return the 16 bit Unicode value of the character
0106      */
0107     unsigned short unicode() const
0108     {
0109         return uc;
0110     }
0111 
0112     unsigned short uc;
0113 } PACK_STRUCT;
0114 
0115 inline UChar::UChar() { }
0116 inline UChar::UChar(unsigned char h, unsigned char l) : uc(h << 8 | l) { }
0117 inline UChar::UChar(char u) : uc((unsigned char)u) { }
0118 inline UChar::UChar(unsigned char u) : uc(u) { }
0119 inline UChar::UChar(unsigned short u) : uc(u) { }
0120 
0121 /**
0122  * @short 8 bit char based string class
0123  */
0124 class KJS_EXPORT CString
0125 {
0126 public:
0127     CString() : data(nullptr), length(0) { }
0128     CString(const char *c);
0129     CString(const char *c, size_t len);
0130     CString(const CString &);
0131 
0132     ~CString();
0133 
0134     CString &operator=(const char *c);
0135     CString &operator=(const CString &);
0136 
0137     size_t size() const
0138     {
0139         return length;
0140     }
0141     const char *c_str() const
0142     {
0143         return data;
0144     }
0145 private:
0146     char *data;
0147     size_t length;
0148 };
0149 
0150 /**
0151  * @short Unicode string class
0152  */
0153 class KJS_EXPORT UString
0154 {
0155     KJS_EXPORT friend bool operator==(const UString &, const UString &);
0156 
0157 public:
0158     /**
0159      * @internal
0160      */
0161     struct KJS_EXPORT Rep {
0162 
0163         static PassRefPtr<Rep> create(UChar *d, int l);
0164         static PassRefPtr<Rep> createCopying(const UChar *d, int l);
0165         static PassRefPtr<Rep> create(PassRefPtr<Rep> base, int offset, int length);
0166 
0167         void destroy();
0168 
0169         bool baseIsSelf() const
0170         {
0171             return baseString == this;
0172         }
0173         UChar *data() const
0174         {
0175             return baseString->buf + baseString->preCapacity + offset;
0176         }
0177         int size() const
0178         {
0179             return len;
0180         }
0181 
0182         unsigned hash() const
0183         {
0184             if (_hash == 0) {
0185                 _hash = computeHash(data(), len);
0186             } return _hash;
0187         }
0188         unsigned computedHash() const
0189         {
0190             assert(_hash);    // fast path for Identifiers
0191             return _hash;
0192         }
0193         static unsigned computeHash(const UChar *, int length);
0194         static unsigned computeHash(const char *s, int length);
0195         static unsigned computeHash(const char *);
0196 
0197         Rep *ref()
0198         {
0199             ++rc;
0200             return this;
0201         }
0202         ALWAYS_INLINE void deref()
0203         {
0204             if (--rc == 0) {
0205                 destroy();
0206             }
0207         }
0208 
0209         // unshared data
0210         int offset;
0211         int len;
0212         int rc;
0213         mutable unsigned _hash;
0214         bool isIdentifier;
0215         UString::Rep *baseString;
0216         size_t reportedCost;
0217 
0218         // potentially shared data
0219         UChar *buf;
0220         int usedCapacity;
0221         int capacity;
0222         int usedPreCapacity;
0223         int preCapacity;
0224 
0225         static Rep null;
0226         static Rep empty;
0227     };
0228 
0229 public:
0230     /**
0231      * Constructs a null string.
0232      */
0233     UString();
0234     /**
0235      * Constructs an empty string.
0236      */
0237     enum Empty { empty };
0238     UString(Empty);
0239     /**
0240      * Constructs a string from the single character c.
0241      */
0242     explicit UString(char c);
0243     /**
0244      * Constructs a string from a classical zero determined char string.
0245      */
0246     UString(const char *c);
0247     UString(const char *c, size_t length);
0248     /**
0249      * Constructs a string from an array of Unicode characters of the specified
0250      * length.
0251      */
0252     UString(const UChar *c, int length);
0253     /**
0254      * If copy is false the string data will be adopted.
0255      * That means that the data will NOT be copied and the pointer will
0256      * be deleted when the UString object is modified or destroyed.
0257      * Behaviour defaults to a deep copy if copy is true.
0258      */
0259     UString(UChar *c, int length, bool copy);
0260     /**
0261      * Copy constructor. Makes a shallow copy only.
0262      */
0263     UString(const UString &s) : m_rep(s.m_rep) {}
0264     UString &operator=(const UString &s) { m_rep = s.m_rep; return *this; }
0265 
0266     UString(const Vector<UChar> &buffer);
0267 
0268     /**
0269      * Convenience declaration only ! You'll be on your own to write the
0270      * implementation for a construction from QString.
0271      *
0272      * Note: feel free to contact me if you want to see a dummy header for
0273      * your favorite FooString class here !
0274      */
0275     KJS_EXTERNAL_EXPORT UString(const QString &);
0276     /**
0277      * Convenience declaration only ! See UString(const QString&).
0278      */
0279     KJS_EXTERNAL_EXPORT UString(const DOM::DOMString &);
0280 
0281     /**
0282      * Concatenation constructor. Makes operator+ more efficient.
0283      */
0284     UString(const UString &, const UString &);
0285     /**
0286      * Destructor.
0287      */
0288     ~UString() {}
0289 
0290     /**
0291      * Constructs a string from an int.
0292      */
0293     static UString from(int i);
0294     /**
0295      * Constructs a string from an unsigned int.
0296      */
0297     static UString from(unsigned int u);
0298     /**
0299      * Constructs a string from a long int.
0300      */
0301     static UString from(long u);
0302     /**
0303      * Constructs a string from a double.
0304      */
0305     static UString from(double d);
0306 
0307     static bool equal(const UString::Rep *a, const UString::Rep *b);
0308 
0309     struct Range {
0310     public:
0311         Range(int pos, int len) : position(pos), length(len) {}
0312         Range() {}
0313         int position;
0314         int length;
0315     };
0316 
0317     UString spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const;
0318 
0319     /**
0320      * Append another string.
0321      */
0322     UString &append(const UString &subStr, int subPos, int subLength = -1);
0323     UString &append(const UString &t);
0324     UString &append(const char *t);
0325     UString &append(const char *t, int tSize);
0326     UString &append(unsigned short);
0327     UString &append(char c)
0328     {
0329         return append(static_cast<unsigned short>(static_cast<unsigned char>(c)));
0330     }
0331     UString &append(UChar c)
0332     {
0333         return append(c.uc);
0334     }
0335 
0336     /**
0337      * @return The string converted to the 8-bit string type CString().
0338      */
0339     CString cstring() const;
0340     /**
0341      * Convert the Unicode string to plain ASCII chars chopping of any higher
0342      * bytes. This method should only be used for *debugging* purposes as it
0343      * is neither Unicode safe nor free from side effects. In order not to
0344      * waste any memory the char buffer is static and *shared* by all UString
0345      * instances.
0346      */
0347     char *ascii() const;
0348 
0349     /**
0350      * Convert the string to UTF-8, assuming it is UTF-16 encoded.
0351      * Since this function is tolerant of badly formed UTF-16, it can create UTF-8
0352      * strings that are invalid because they have characters in the range
0353      * U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
0354      * be otherwise valid.
0355      */
0356     CString UTF8String() const;
0357 
0358     /**
0359      * @see UString(const QString&).
0360      */
0361     KJS_EXTERNAL_EXPORT DOM::DOMString domString() const;
0362     /**
0363      * @see UString(const QString&).
0364      */
0365     KJS_EXTERNAL_EXPORT QString qstring() const;
0366     /**
0367      * @see UString(const QString&).
0368      */
0369     KJS_EXTERNAL_EXPORT QConstString qconststring() const;
0370 
0371     /**
0372      * Assignment operator.
0373      */
0374     UString &operator=(const char *c);
0375     UString &operator=(Empty);
0376     /**
0377      * Appends the specified string.
0378      */
0379     UString &operator+=(const UString &s)
0380     {
0381         return append(s);
0382     }
0383     UString &operator+=(const char *s)
0384     {
0385         return append(s);
0386     }
0387 
0388     /**
0389      * @return A pointer to the internal Unicode data.
0390      */
0391     const UChar *data() const
0392     {
0393         return m_rep->data();
0394     }
0395     /**
0396      * @return True if null.
0397      */
0398     bool isNull() const
0399     {
0400         return (m_rep == &Rep::null);
0401     }
0402     /**
0403      * @return True if null or zero length.
0404      */
0405     bool isEmpty() const
0406     {
0407         return (!m_rep->len);
0408     }
0409     /**
0410      * Use this if you want to make sure that this string is a plain ASCII
0411      * string. For example, if you don't want to lose any information when
0412      * using cstring() or ascii().
0413      *
0414      * @return True if the string doesn't contain any non-ASCII characters.
0415      */
0416     bool is8Bit() const;
0417     /**
0418      * @return The length of the string.
0419      */
0420     int size() const
0421     {
0422         return m_rep->size();
0423     }
0424     /**
0425      * Const character at specified position.
0426      */
0427     const UChar operator[](int pos) const;
0428     /**
0429      * Attempts an conversion to a number. Apart from floating point numbers,
0430      * the algorithm will recognize hexadecimal representations (as
0431      * indicated by a 0x or 0X prefix) and +/- Infinity.
0432      * Returns NaN if the conversion failed.
0433      * @param tolerateTrailingJunk if true, toDouble can tolerate garbage after the number.
0434      * @param tolerateEmptyString if false, toDouble will turn an empty string into NaN rather than 0.
0435      */
0436     double toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const;
0437     double toDouble(bool tolerateTrailingJunk) const;
0438     double toDouble() const;
0439 
0440     /**
0441      * Attempts an conversion to a 32-bit integer. ok will be set
0442      * according to the success.
0443      */
0444     uint32_t toStrictUInt32(bool *ok = nullptr) const;
0445 
0446     /**
0447      * Attempts an conversion to an array index. The "ok" boolean will be set
0448      * to true if it is a valid array index according to the rule from
0449      * ECMA 15.2 about what an array index is. It must exactly match the string
0450      * form of an unsigned integer, and be less than 2^32 - 1.
0451      */
0452     unsigned toArrayIndex(bool *ok = nullptr) const;
0453 
0454     /**
0455      * @return Position of first occurrence of f starting at position pos.
0456      * -1 if the search was not successful.
0457      */
0458     int find(const UString &f, int pos = 0) const;
0459     int find(UChar, int pos = 0) const;
0460     /**
0461      * @return Position of first occurrence of f searching backwards from
0462      * position pos.
0463      * -1 if the search was not successful.
0464      */
0465     int rfind(const UString &f, int pos) const;
0466     int rfind(UChar, int pos) const;
0467     /**
0468      * @return The sub string starting at position pos and length len.
0469      */
0470     UString substr(int pos = 0, int len = -1) const;
0471     /**
0472      * Static instance of a null string.
0473      */
0474     static const UString &null();
0475 
0476     /**
0477      * Maximum permitted string length
0478      * @since 5.20
0479      */
0480     static size_t maxUChars();
0481 
0482     Rep *rep() const
0483     {
0484         return m_rep.get();
0485     }
0486     UString(PassRefPtr<Rep> r) : m_rep(r)
0487     {
0488         assert(m_rep);
0489     }
0490     void copyForWriting();
0491 
0492     size_t cost() const;
0493 private:
0494     size_t expandedSize(size_t size, size_t otherSize) const;
0495     int usedCapacity() const;
0496     int usedPreCapacity() const;
0497     void expandCapacity(int requiredLength);
0498     void expandPreCapacity(int requiredPreCap);
0499     void set(const char *c, int len);
0500 
0501     RefPtr<Rep> m_rep;
0502 };
0503 
0504 KJS_EXPORT inline bool operator==(const UChar &c1, const UChar &c2)
0505 {
0506     return (c1.uc == c2.uc);
0507 }
0508 KJS_EXPORT bool operator==(const UString &s1, const UString &s2);
0509 KJS_EXPORT inline bool operator!=(const UString &s1, const UString &s2)
0510 {
0511     return !KJS::operator==(s1, s2);
0512 }
0513 KJS_EXPORT bool operator<(const UString &s1, const UString &s2);
0514 KJS_EXPORT bool operator==(const UString &s1, const char *s2);
0515 KJS_EXPORT inline bool operator!=(const UString &s1, const char *s2)
0516 {
0517     return !KJS::operator==(s1, s2);
0518 }
0519 KJS_EXPORT inline bool operator==(const char *s1, const UString &s2)
0520 {
0521     return operator==(s2, s1);
0522 }
0523 KJS_EXPORT inline bool operator!=(const char *s1, const UString &s2)
0524 {
0525     return !KJS::operator==(s1, s2);
0526 }
0527 KJS_EXPORT bool operator==(const CString &s1, const CString &s2);
0528 KJS_EXPORT inline UString operator+(const UString &s1, const UString &s2)
0529 {
0530     return UString(s1, s2);
0531 }
0532 
0533 KJS_EXPORT int compare(const UString &, const UString &);
0534 
0535 // Given a first byte, gives the length of the UTF-8 sequence it begins.
0536 // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
0537 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
0538 int UTF8SequenceLength(char);
0539 
0540 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
0541 // Only allows Unicode characters (U-00000000 to U-0010FFFF).
0542 // Returns -1 if the sequence is not valid (including presence of extra bytes).
0543 int decodeUTF8Sequence(const char *);
0544 
0545 KJS_EXPORT inline UString::UString()
0546     : m_rep(&Rep::null)
0547 {
0548 }
0549 
0550 // Rule from ECMA 15.2 about what an array index is.
0551 // Must exactly match string form of an unsigned integer, and be less than 2^32 - 1.
0552 inline unsigned UString::toArrayIndex(bool *ok) const
0553 {
0554     unsigned i = toStrictUInt32(ok);
0555     if (ok && i >= 0xFFFFFFFFU) {
0556         *ok = false;
0557     }
0558     return i;
0559 }
0560 
0561 // We'd rather not do shared substring append for small strings, since
0562 // this runs too much risk of a tiny initial string holding down a
0563 // huge buffer.
0564 // FIXME: this should be size_t but that would cause warnings until we
0565 // fix UString sizes to be size_t instead of int
0566 static const int minShareSize = Collector::minExtraCostSize / sizeof(UChar);
0567 
0568 inline size_t UString::cost() const
0569 {
0570     size_t capacity = (m_rep->baseString->capacity + m_rep->baseString->preCapacity) * sizeof(UChar);
0571     size_t reportedCost = m_rep->baseString->reportedCost;
0572     ASSERT(capacity >= reportedCost);
0573 
0574     size_t capacityDelta = capacity - reportedCost;
0575 
0576     if (capacityDelta < static_cast<size_t>(minShareSize)) {
0577         return 0;
0578     }
0579 
0580     m_rep->baseString->reportedCost = capacity;
0581     return capacityDelta;
0582 }
0583 
0584 } // namespace
0585 
0586 namespace WTF
0587 {
0588 
0589 template<typename T> struct DefaultHash;
0590 template<typename T> struct StrHash;
0591 
0592 template<> struct StrHash<KJS::UString::Rep *> {
0593     static unsigned hash(const KJS::UString::Rep *key)
0594     {
0595         return key->hash();
0596     }
0597     static bool equal(const KJS::UString::Rep *a, const KJS::UString::Rep *b)
0598     {
0599         return KJS::UString::equal(a, b);
0600     }
0601     static const bool safeToCompareToEmptyOrDeleted = false;
0602 };
0603 
0604 template<> struct DefaultHash<KJS::UString::Rep *> {
0605     typedef StrHash<KJS::UString::Rep *> Hash;
0606 };
0607 } // namespace WTF
0608 
0609 #endif