File indexing completed on 2024-05-12 15:43:36
0001 /* 0002 * This file is part of the KDE libraries 0003 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) 0004 * Copyright (C) 2004 Apple Computer, Inc. 0005 * 0006 * This library is free software; you can redistribute it and/or 0007 * modify it under the terms of the GNU Library General Public 0008 * License as published by the Free Software Foundation; either 0009 * version 2 of the License, or (at your option) any later version. 0010 * 0011 * This library is distributed in the hope that it will be useful, 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0014 * Library General Public License for more details. 0015 * 0016 * You should have received a copy of the GNU Library General Public License 0017 * along with this library; see the file COPYING.LIB. If not, write to 0018 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0019 * Boston, MA 02110-1301, USA. 0020 * 0021 */ 0022 0023 #ifndef _KJS_USTRING_H_ 0024 #define _KJS_USTRING_H_ 0025 0026 #include "kjs/global.h" 0027 0028 #include <wtf/AlwaysInline.h> 0029 #include <wtf/FastMalloc.h> 0030 #include <wtf/RefPtr.h> 0031 #include <wtf/PassRefPtr.h> 0032 #include <wtf/Vector.h> 0033 0034 #include <assert.h> 0035 #include "collector.h" 0036 #if HAVE_STDINT_H 0037 #include <stdint.h> 0038 #endif 0039 0040 /* On some ARM platforms GCC won't pack structures by default so sizeof(UChar) 0041 will end up being != 2 which causes crashes since the code depends on that. */ 0042 #if defined(WTF_COMPILER_GCC) && PLATFORM(FORCE_PACK) 0043 #define PACK_STRUCT __attribute__((packed)) 0044 #else 0045 #define PACK_STRUCT 0046 #endif 0047 0048 /** 0049 * @internal 0050 */ 0051 namespace DOM 0052 { 0053 class DOMString; 0054 } 0055 class QString; 0056 class QConstString; 0057 0058 namespace KJS 0059 { 0060 0061 class UString; 0062 0063 /** 0064 * @short Unicode character. 0065 * 0066 * UChar represents a 16 bit Unicode character. Its internal data 0067 * representation is compatible to XChar2b and QChar. It's therefore 0068 * possible to exchange data with X and Qt with shallow copies. 0069 */ 0070 struct KJS_EXPORT UChar { 0071 /** 0072 * Construct a character with uninitialized value. 0073 */ 0074 UChar(); 0075 /** 0076 * Construct a character with the value denoted by the arguments. 0077 * @param h higher byte 0078 * @param l lower byte 0079 */ 0080 UChar(unsigned char h, unsigned char l); 0081 /** 0082 * Construct a character with the given value. 0083 * @param u 16 bit Unicode value 0084 */ 0085 UChar(char u); 0086 UChar(unsigned char u); 0087 UChar(unsigned short u); 0088 /** 0089 * @return The higher byte of the character. 0090 */ 0091 unsigned char high() const 0092 { 0093 return static_cast<unsigned char>(uc >> 8); 0094 } 0095 0096 /** 0097 * @return The lower byte of the character. 0098 */ 0099 unsigned char low() const 0100 { 0101 return static_cast<unsigned char>(uc); 0102 } 0103 0104 /** 0105 * @return the 16 bit Unicode value of the character 0106 */ 0107 unsigned short unicode() const 0108 { 0109 return uc; 0110 } 0111 0112 unsigned short uc; 0113 } PACK_STRUCT; 0114 0115 inline UChar::UChar() { } 0116 inline UChar::UChar(unsigned char h, unsigned char l) : uc(h << 8 | l) { } 0117 inline UChar::UChar(char u) : uc((unsigned char)u) { } 0118 inline UChar::UChar(unsigned char u) : uc(u) { } 0119 inline UChar::UChar(unsigned short u) : uc(u) { } 0120 0121 /** 0122 * @short 8 bit char based string class 0123 */ 0124 class KJS_EXPORT CString 0125 { 0126 public: 0127 CString() : data(nullptr), length(0) { } 0128 CString(const char *c); 0129 CString(const char *c, size_t len); 0130 CString(const CString &); 0131 0132 ~CString(); 0133 0134 CString &operator=(const char *c); 0135 CString &operator=(const CString &); 0136 0137 size_t size() const 0138 { 0139 return length; 0140 } 0141 const char *c_str() const 0142 { 0143 return data; 0144 } 0145 private: 0146 char *data; 0147 size_t length; 0148 }; 0149 0150 /** 0151 * @short Unicode string class 0152 */ 0153 class KJS_EXPORT UString 0154 { 0155 KJS_EXPORT friend bool operator==(const UString &, const UString &); 0156 0157 public: 0158 /** 0159 * @internal 0160 */ 0161 struct KJS_EXPORT Rep { 0162 0163 static PassRefPtr<Rep> create(UChar *d, int l); 0164 static PassRefPtr<Rep> createCopying(const UChar *d, int l); 0165 static PassRefPtr<Rep> create(PassRefPtr<Rep> base, int offset, int length); 0166 0167 void destroy(); 0168 0169 bool baseIsSelf() const 0170 { 0171 return baseString == this; 0172 } 0173 UChar *data() const 0174 { 0175 return baseString->buf + baseString->preCapacity + offset; 0176 } 0177 int size() const 0178 { 0179 return len; 0180 } 0181 0182 unsigned hash() const 0183 { 0184 if (_hash == 0) { 0185 _hash = computeHash(data(), len); 0186 } return _hash; 0187 } 0188 unsigned computedHash() const 0189 { 0190 assert(_hash); // fast path for Identifiers 0191 return _hash; 0192 } 0193 static unsigned computeHash(const UChar *, int length); 0194 static unsigned computeHash(const char *s, int length); 0195 static unsigned computeHash(const char *); 0196 0197 Rep *ref() 0198 { 0199 ++rc; 0200 return this; 0201 } 0202 ALWAYS_INLINE void deref() 0203 { 0204 if (--rc == 0) { 0205 destroy(); 0206 } 0207 } 0208 0209 // unshared data 0210 int offset; 0211 int len; 0212 int rc; 0213 mutable unsigned _hash; 0214 bool isIdentifier; 0215 UString::Rep *baseString; 0216 size_t reportedCost; 0217 0218 // potentially shared data 0219 UChar *buf; 0220 int usedCapacity; 0221 int capacity; 0222 int usedPreCapacity; 0223 int preCapacity; 0224 0225 static Rep null; 0226 static Rep empty; 0227 }; 0228 0229 public: 0230 /** 0231 * Constructs a null string. 0232 */ 0233 UString(); 0234 /** 0235 * Constructs an empty string. 0236 */ 0237 enum Empty { empty }; 0238 UString(Empty); 0239 /** 0240 * Constructs a string from the single character c. 0241 */ 0242 explicit UString(char c); 0243 /** 0244 * Constructs a string from a classical zero determined char string. 0245 */ 0246 UString(const char *c); 0247 UString(const char *c, size_t length); 0248 /** 0249 * Constructs a string from an array of Unicode characters of the specified 0250 * length. 0251 */ 0252 UString(const UChar *c, int length); 0253 /** 0254 * If copy is false the string data will be adopted. 0255 * That means that the data will NOT be copied and the pointer will 0256 * be deleted when the UString object is modified or destroyed. 0257 * Behaviour defaults to a deep copy if copy is true. 0258 */ 0259 UString(UChar *c, int length, bool copy); 0260 /** 0261 * Copy constructor. Makes a shallow copy only. 0262 */ 0263 UString(const UString &s) : m_rep(s.m_rep) {} 0264 UString &operator=(const UString &s) { m_rep = s.m_rep; return *this; } 0265 0266 UString(const Vector<UChar> &buffer); 0267 0268 /** 0269 * Convenience declaration only ! You'll be on your own to write the 0270 * implementation for a construction from QString. 0271 * 0272 * Note: feel free to contact me if you want to see a dummy header for 0273 * your favorite FooString class here ! 0274 */ 0275 KJS_EXTERNAL_EXPORT UString(const QString &); 0276 /** 0277 * Convenience declaration only ! See UString(const QString&). 0278 */ 0279 KJS_EXTERNAL_EXPORT UString(const DOM::DOMString &); 0280 0281 /** 0282 * Concatenation constructor. Makes operator+ more efficient. 0283 */ 0284 UString(const UString &, const UString &); 0285 /** 0286 * Destructor. 0287 */ 0288 ~UString() {} 0289 0290 /** 0291 * Constructs a string from an int. 0292 */ 0293 static UString from(int i); 0294 /** 0295 * Constructs a string from an unsigned int. 0296 */ 0297 static UString from(unsigned int u); 0298 /** 0299 * Constructs a string from a long int. 0300 */ 0301 static UString from(long u); 0302 /** 0303 * Constructs a string from a double. 0304 */ 0305 static UString from(double d); 0306 0307 static bool equal(const UString::Rep *a, const UString::Rep *b); 0308 0309 struct Range { 0310 public: 0311 Range(int pos, int len) : position(pos), length(len) {} 0312 Range() {} 0313 int position; 0314 int length; 0315 }; 0316 0317 UString spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const; 0318 0319 /** 0320 * Append another string. 0321 */ 0322 UString &append(const UString &subStr, int subPos, int subLength = -1); 0323 UString &append(const UString &t); 0324 UString &append(const char *t); 0325 UString &append(const char *t, int tSize); 0326 UString &append(unsigned short); 0327 UString &append(char c) 0328 { 0329 return append(static_cast<unsigned short>(static_cast<unsigned char>(c))); 0330 } 0331 UString &append(UChar c) 0332 { 0333 return append(c.uc); 0334 } 0335 0336 /** 0337 * @return The string converted to the 8-bit string type CString(). 0338 */ 0339 CString cstring() const; 0340 /** 0341 * Convert the Unicode string to plain ASCII chars chopping of any higher 0342 * bytes. This method should only be used for *debugging* purposes as it 0343 * is neither Unicode safe nor free from side effects. In order not to 0344 * waste any memory the char buffer is static and *shared* by all UString 0345 * instances. 0346 */ 0347 char *ascii() const; 0348 0349 /** 0350 * Convert the string to UTF-8, assuming it is UTF-16 encoded. 0351 * Since this function is tolerant of badly formed UTF-16, it can create UTF-8 0352 * strings that are invalid because they have characters in the range 0353 * U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to 0354 * be otherwise valid. 0355 */ 0356 CString UTF8String() const; 0357 0358 /** 0359 * @see UString(const QString&). 0360 */ 0361 KJS_EXTERNAL_EXPORT DOM::DOMString domString() const; 0362 /** 0363 * @see UString(const QString&). 0364 */ 0365 KJS_EXTERNAL_EXPORT QString qstring() const; 0366 /** 0367 * @see UString(const QString&). 0368 */ 0369 KJS_EXTERNAL_EXPORT QConstString qconststring() const; 0370 0371 /** 0372 * Assignment operator. 0373 */ 0374 UString &operator=(const char *c); 0375 UString &operator=(Empty); 0376 /** 0377 * Appends the specified string. 0378 */ 0379 UString &operator+=(const UString &s) 0380 { 0381 return append(s); 0382 } 0383 UString &operator+=(const char *s) 0384 { 0385 return append(s); 0386 } 0387 0388 /** 0389 * @return A pointer to the internal Unicode data. 0390 */ 0391 const UChar *data() const 0392 { 0393 return m_rep->data(); 0394 } 0395 /** 0396 * @return True if null. 0397 */ 0398 bool isNull() const 0399 { 0400 return (m_rep == &Rep::null); 0401 } 0402 /** 0403 * @return True if null or zero length. 0404 */ 0405 bool isEmpty() const 0406 { 0407 return (!m_rep->len); 0408 } 0409 /** 0410 * Use this if you want to make sure that this string is a plain ASCII 0411 * string. For example, if you don't want to lose any information when 0412 * using cstring() or ascii(). 0413 * 0414 * @return True if the string doesn't contain any non-ASCII characters. 0415 */ 0416 bool is8Bit() const; 0417 /** 0418 * @return The length of the string. 0419 */ 0420 int size() const 0421 { 0422 return m_rep->size(); 0423 } 0424 /** 0425 * Const character at specified position. 0426 */ 0427 const UChar operator[](int pos) const; 0428 /** 0429 * Attempts an conversion to a number. Apart from floating point numbers, 0430 * the algorithm will recognize hexadecimal representations (as 0431 * indicated by a 0x or 0X prefix) and +/- Infinity. 0432 * Returns NaN if the conversion failed. 0433 * @param tolerateTrailingJunk if true, toDouble can tolerate garbage after the number. 0434 * @param tolerateEmptyString if false, toDouble will turn an empty string into NaN rather than 0. 0435 */ 0436 double toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const; 0437 double toDouble(bool tolerateTrailingJunk) const; 0438 double toDouble() const; 0439 0440 /** 0441 * Attempts an conversion to a 32-bit integer. ok will be set 0442 * according to the success. 0443 */ 0444 uint32_t toStrictUInt32(bool *ok = nullptr) const; 0445 0446 /** 0447 * Attempts an conversion to an array index. The "ok" boolean will be set 0448 * to true if it is a valid array index according to the rule from 0449 * ECMA 15.2 about what an array index is. It must exactly match the string 0450 * form of an unsigned integer, and be less than 2^32 - 1. 0451 */ 0452 unsigned toArrayIndex(bool *ok = nullptr) const; 0453 0454 /** 0455 * @return Position of first occurrence of f starting at position pos. 0456 * -1 if the search was not successful. 0457 */ 0458 int find(const UString &f, int pos = 0) const; 0459 int find(UChar, int pos = 0) const; 0460 /** 0461 * @return Position of first occurrence of f searching backwards from 0462 * position pos. 0463 * -1 if the search was not successful. 0464 */ 0465 int rfind(const UString &f, int pos) const; 0466 int rfind(UChar, int pos) const; 0467 /** 0468 * @return The sub string starting at position pos and length len. 0469 */ 0470 UString substr(int pos = 0, int len = -1) const; 0471 /** 0472 * Static instance of a null string. 0473 */ 0474 static const UString &null(); 0475 0476 /** 0477 * Maximum permitted string length 0478 * @since 5.20 0479 */ 0480 static size_t maxUChars(); 0481 0482 Rep *rep() const 0483 { 0484 return m_rep.get(); 0485 } 0486 UString(PassRefPtr<Rep> r) : m_rep(r) 0487 { 0488 assert(m_rep); 0489 } 0490 void copyForWriting(); 0491 0492 size_t cost() const; 0493 private: 0494 size_t expandedSize(size_t size, size_t otherSize) const; 0495 int usedCapacity() const; 0496 int usedPreCapacity() const; 0497 void expandCapacity(int requiredLength); 0498 void expandPreCapacity(int requiredPreCap); 0499 void set(const char *c, int len); 0500 0501 RefPtr<Rep> m_rep; 0502 }; 0503 0504 KJS_EXPORT inline bool operator==(const UChar &c1, const UChar &c2) 0505 { 0506 return (c1.uc == c2.uc); 0507 } 0508 KJS_EXPORT bool operator==(const UString &s1, const UString &s2); 0509 KJS_EXPORT inline bool operator!=(const UString &s1, const UString &s2) 0510 { 0511 return !KJS::operator==(s1, s2); 0512 } 0513 KJS_EXPORT bool operator<(const UString &s1, const UString &s2); 0514 KJS_EXPORT bool operator==(const UString &s1, const char *s2); 0515 KJS_EXPORT inline bool operator!=(const UString &s1, const char *s2) 0516 { 0517 return !KJS::operator==(s1, s2); 0518 } 0519 KJS_EXPORT inline bool operator==(const char *s1, const UString &s2) 0520 { 0521 return operator==(s2, s1); 0522 } 0523 KJS_EXPORT inline bool operator!=(const char *s1, const UString &s2) 0524 { 0525 return !KJS::operator==(s1, s2); 0526 } 0527 KJS_EXPORT bool operator==(const CString &s1, const CString &s2); 0528 KJS_EXPORT inline UString operator+(const UString &s1, const UString &s2) 0529 { 0530 return UString(s1, s2); 0531 } 0532 0533 KJS_EXPORT int compare(const UString &, const UString &); 0534 0535 // Given a first byte, gives the length of the UTF-8 sequence it begins. 0536 // Returns 0 for bytes that are not legal starts of UTF-8 sequences. 0537 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). 0538 int UTF8SequenceLength(char); 0539 0540 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. 0541 // Only allows Unicode characters (U-00000000 to U-0010FFFF). 0542 // Returns -1 if the sequence is not valid (including presence of extra bytes). 0543 int decodeUTF8Sequence(const char *); 0544 0545 KJS_EXPORT inline UString::UString() 0546 : m_rep(&Rep::null) 0547 { 0548 } 0549 0550 // Rule from ECMA 15.2 about what an array index is. 0551 // Must exactly match string form of an unsigned integer, and be less than 2^32 - 1. 0552 inline unsigned UString::toArrayIndex(bool *ok) const 0553 { 0554 unsigned i = toStrictUInt32(ok); 0555 if (ok && i >= 0xFFFFFFFFU) { 0556 *ok = false; 0557 } 0558 return i; 0559 } 0560 0561 // We'd rather not do shared substring append for small strings, since 0562 // this runs too much risk of a tiny initial string holding down a 0563 // huge buffer. 0564 // FIXME: this should be size_t but that would cause warnings until we 0565 // fix UString sizes to be size_t instead of int 0566 static const int minShareSize = Collector::minExtraCostSize / sizeof(UChar); 0567 0568 inline size_t UString::cost() const 0569 { 0570 size_t capacity = (m_rep->baseString->capacity + m_rep->baseString->preCapacity) * sizeof(UChar); 0571 size_t reportedCost = m_rep->baseString->reportedCost; 0572 ASSERT(capacity >= reportedCost); 0573 0574 size_t capacityDelta = capacity - reportedCost; 0575 0576 if (capacityDelta < static_cast<size_t>(minShareSize)) { 0577 return 0; 0578 } 0579 0580 m_rep->baseString->reportedCost = capacity; 0581 return capacityDelta; 0582 } 0583 0584 } // namespace 0585 0586 namespace WTF 0587 { 0588 0589 template<typename T> struct DefaultHash; 0590 template<typename T> struct StrHash; 0591 0592 template<> struct StrHash<KJS::UString::Rep *> { 0593 static unsigned hash(const KJS::UString::Rep *key) 0594 { 0595 return key->hash(); 0596 } 0597 static bool equal(const KJS::UString::Rep *a, const KJS::UString::Rep *b) 0598 { 0599 return KJS::UString::equal(a, b); 0600 } 0601 static const bool safeToCompareToEmptyOrDeleted = false; 0602 }; 0603 0604 template<> struct DefaultHash<KJS::UString::Rep *> { 0605 typedef StrHash<KJS::UString::Rep *> Hash; 0606 }; 0607 } // namespace WTF 0608 0609 #endif