src/xml/dom_stringimpl.cpp

0001 /**
0002  * This file is part of the DOM implementation for KDE.
0003  *
0004  * Copyright (C) 1999-2003 Lars Knoll (knoll@kde.org)
0005  *           (C) 1999 Antti Koivisto (koivisto@kde.org)
0006  *           (C) 2001-2003 Dirk Mueller ( mueller@kde.org )
0007  *           (C) 2002, 2004 Apple Computer, Inc.
0008  *
0009  * This library is free software; you can redistribute it and/or
0010  * modify it under the terms of the GNU Library General Public
0011  * License as published by the Free Software Foundation; either
0012  * version 2 of the License, or (at your option) any later version.
0013  *
0014  * This library is distributed in the hope that it will be useful,
0015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
0016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0017  * Library General Public License for more details.
0018  *
0019  * You should have received a copy of the GNU Library General Public License
0020  * along with this library; see the file COPYING.LIB.  If not, write to
0021  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0022  * Boston, MA 02110-1301, USA.
0023  *
0024  */
0025
0026 #include "dom_stringimpl.h"
0027
0028 #include <string.h>
0029 #include <QMutableStringListIterator>
0030 #include "misc/AtomicString.h"
0031
0032 using namespace DOM;
0033 using namespace khtml;
0034
0035 DOMStringImpl::DOMStringImpl(const char *str) : m_hash(0), m_inTable(0), m_shallowCopy(0)
0036 {
0037     if (str && *str) {
0038         l = strlen(str);
0039         s = QT_ALLOC_QCHAR_VEC(l);
0040         int i = l;
0041         QChar *ptr = s;
0042         while (i--) {
0043             *ptr++ = *str++;
0044         }
0045     } else {
0046         s = QT_ALLOC_QCHAR_VEC(1);    // crash protection
0047         s[0] = 0x0; // == QChar::null;
0048         l = 0;
0049     }
0050 }
0051
0052 DOMStringImpl::DOMStringImpl(const char *str, uint len) : m_hash(0), m_inTable(0), m_shallowCopy(0)
0053 {
0054     if (str && *str) {
0055         l = len;
0056         s = QT_ALLOC_QCHAR_VEC(l);
0057         int i = l;
0058         QChar *ptr = s;
0059         while (i--) {
0060             *ptr++ = *str++;
0061         }
0062     } else {
0063         s = QT_ALLOC_QCHAR_VEC(1);    // crash protection
0064         s[0] = 0x0; // == QChar::null;
0065         l = 0;
0066     }
0067 }
0068
0069 DOMStringImpl::DOMStringImpl(const char *str, unsigned len/*gth*/, unsigned hash) : m_hash(hash), m_inTable(true), m_shallowCopy(0)
0070 {
0071     if (str && *str) {
0072         l = len;
0073         s = QT_ALLOC_QCHAR_VEC(l);
0074         int i = l;
0075         QChar *ptr = s;
0076         while (i--) {
0077             *ptr++ = *str++;
0078         }
0079     } else {
0080         s = QT_ALLOC_QCHAR_VEC(1);    // crash protection
0081         s[0] = 0x0; // == QChar::null;
0082         l = 0;
0083     }
0084 }
0085
0086 DOMStringImpl::~DOMStringImpl()
0087 {
0088     if (m_shallowCopy) {
0089         return;
0090     }
0091     if (m_inTable) {
0092         khtml::AtomicString::remove(this);
0093     }
0094     if (s) {
0095         QT_DELETE_QCHAR_VEC(s);
0096     }
0097 }
0098
0099 // FIXME: should be a cached flag maybe.
0100 bool DOMStringImpl::containsOnlyWhitespace() const
0101 {
0102     if (!s) {
0103         return true;
0104     }
0105
0106     for (uint i = 0; i < l; i++) {
0107         QChar c = s[i];
0108         if (c.unicode() <= 0x7F) {
0109             if (c.unicode() > ' ') {
0110                 return false;
0111             }
0112         } else {
0113             if (c.direction() != QChar::DirWS) {
0114                 return false;
0115             }
0116         }
0117     }
0118     return true;
0119 }
0120
0121 void DOMStringImpl::append(DOMStringImpl *str)
0122 {
0123     if (str && str->l != 0) {
0124         int newlen = l + str->l;
0125         QChar *c = QT_ALLOC_QCHAR_VEC(newlen);
0126         memcpy(c, s, l * sizeof(QChar));
0127         memcpy(c + l, str->s, str->l * sizeof(QChar));
0128         if (s) {
0129             QT_DELETE_QCHAR_VEC(s);
0130         }
0131         s = c;
0132         l = newlen;
0133     }
0134 }
0135
0136 void DOMStringImpl::insert(DOMStringImpl *str, unsigned int pos)
0137 {
0138     if (pos > l) {
0139         append(str);
0140         return;
0141     }
0142     if (str && str->l != 0) {
0143         int newlen = l + str->l;
0144         QChar *c = QT_ALLOC_QCHAR_VEC(newlen);
0145         memcpy(c, s, pos * sizeof(QChar));
0146         memcpy(c + pos, str->s, str->l * sizeof(QChar));
0147         memcpy(c + pos + str->l, s + pos, (l - pos)*sizeof(QChar));
0148         if (s) {
0149             QT_DELETE_QCHAR_VEC(s);
0150         }
0151         s = c;
0152         l = newlen;
0153     }
0154 }
0155
0156 void DOMStringImpl::truncate(int len)
0157 {
0158     if (len > (int)l) {
0159         return;
0160     }
0161
0162     int nl = len < 1 ? 1 : len;
0163     QChar *c = QT_ALLOC_QCHAR_VEC(nl);
0164     memcpy(c, s, nl * sizeof(QChar));
0165     if (s) {
0166         QT_DELETE_QCHAR_VEC(s);
0167     }
0168     s = c;
0169     l = len;
0170 }
0171
0172 void DOMStringImpl::remove(unsigned int pos, int len)
0173 {
0174     if (pos >= l) {
0175         return;
0176     }
0177     if (pos + len > l) {
0178         len = l - pos;
0179     }
0180
0181     uint newLen = l - len;
0182     QChar *c = QT_ALLOC_QCHAR_VEC(newLen);
0183     memcpy(c, s, pos * sizeof(QChar));
0184     memcpy(c + pos, s + pos + len, (l - len - pos)*sizeof(QChar));
0185     if (s) {
0186         QT_DELETE_QCHAR_VEC(s);
0187     }
0188     s = c;
0189     l = newLen;
0190 }
0191
0192 DOMStringImpl *DOMStringImpl::split(unsigned int pos)
0193 {
0194     if (pos >= l) {
0195         return new DOMStringImpl();
0196     }
0197
0198     uint newLen = l - pos;
0199     DOMStringImpl *str = new DOMStringImpl(s + pos, newLen);
0200     truncate(pos);
0201     return str;
0202 }
0203
0204 DOMStringImpl *DOMStringImpl::substring(unsigned int pos, unsigned int len)
0205 {
0206     if (pos >= l) {
0207         return new DOMStringImpl();
0208     }
0209     if (len == UINT_MAX || pos + len > l) {
0210         len = l - pos;
0211     }
0212
0213     return new DOMStringImpl(s + pos, len);
0214 }
0215
0216 // Collapses white-space according to CSS 2.1 rules
0217 DOMStringImpl *DOMStringImpl::collapseWhiteSpace(bool preserveLF, bool preserveWS)
0218 {
0219     if (preserveLF && preserveWS) {
0220         return this;
0221     }
0222
0223     // Notice we are likely allocating more space than needed (worst case)
0224     QChar *n = QT_ALLOC_QCHAR_VEC(l);
0225
0226     unsigned int pos = 0;
0227     bool collapsing = false;   // collapsing white-space
0228     bool collapsingLF = false; // collapsing around linefeed
0229     bool changedLF = false;
0230     for (unsigned int i = 0; i < l; i++) {
0231         ushort ch = s[i].unicode();
0232
0233         // We act on \r as we would on \n because CSS uses it to indicate new-line
0234         if (ch == '\r') {
0235             ch = '\n';
0236         } else
0237             // ### The XML parser lets \t through, for now treat them as spaces
0238             if (ch == '\t') {
0239                 ch = ' ';
0240             }
0241
0242         if (!preserveLF && ch == '\n') {
0243             // ### Not strictly correct according to CSS3 text-module.
0244             // - In ideographic languages linefeed should be ignored
0245             // - and in Thai and Khmer it should be treated as a zero-width space
0246             ch = ' '; // Treat as space
0247             changedLF = true;
0248         }
0249
0250         if (collapsing) {
0251             if (ch == ' ') {
0252                 continue;
0253             }
0254             if (ch == '\n') {
0255                 collapsingLF = true;
0256                 continue;
0257             }
0258
0259             n[pos++] = (collapsingLF) ? QLatin1Char('\n') : QLatin1Char(' ');
0260             collapsing = false;
0261             collapsingLF = false;
0262         } else if (!preserveWS && ch == ' ') {
0263             collapsing = true;
0264             continue;
0265         } else if (!preserveWS && ch == '\n') {
0266             collapsing = true;
0267             collapsingLF = true;
0268             continue;
0269         }
0270
0271         n[pos++] = ch;
0272     }
0273     if (collapsing) {
0274         n[pos++] = ((collapsingLF) ? QLatin1Char('\n') : QLatin1Char(' '));
0275     }
0276
0277     if (pos == l && !changedLF) {
0278         QT_DELETE_QCHAR_VEC(n);
0279         return this;
0280     } else {
0281         DOMStringImpl *out = new DOMStringImpl();
0282         out->s = n;
0283         out->l = pos;
0284
0285         return out;
0286     }
0287 }
0288
0289 static Length parseLength(const QChar *s, unsigned int l)
0290 {
0291     if (l == 0) {
0292         return Length(1, Relative);
0293     }
0294
0295     unsigned i = 0;
0296     while (i < l && s[i].isSpace()) {
0297         ++i;
0298     }
0299     if (i < l && (s[i] == '+' || s[i] == '-')) {
0300         ++i;
0301     }
0302     while (i < l && s[i].isDigit()) {
0303         ++i;
0304     }
0305
0306     bool ok;
0307     int r = QString::fromRawData(s, i).toInt(&ok);
0308
0309     /* Skip over any remaining digits, we are not that accurate (5.5% => 5%) */
0310     while (i < l && (s[i].isDigit() || s[i] == '.')) {
0311         ++i;
0312     }
0313
0314     /* IE Quirk: Skip any whitespace (20 % => 20%) */
0315     while (i < l && s[i].isSpace()) {
0316         ++i;
0317     }
0318
0319     if (ok) {
0320         if (i == l) {
0321             return Length(r, Fixed);
0322         } else {
0323             const QChar *next = s + i;
0324
0325             if (*next == '%') {
0326                 return Length(static_cast<double>(r), Percent);
0327             }
0328
0329             if (*next == '*') {
0330                 return Length(r, Relative);
0331             }
0332         }
0333         return Length(r, Fixed);
0334     } else {
0335         if (i < l) {
0336             const QChar *next = s + i;
0337
0338             if (*next == '*') {
0339                 return Length(1, Relative);
0340             }
0341
0342             if (*next == '%') {
0343                 return Length(1, Relative);
0344             }
0345         }
0346     }
0347     return Length(0, Relative);
0348 }
0349
0350 khtml::Length *DOMStringImpl::toCoordsArray(int &len) const
0351 {
0352     QString str(s, l);
0353     for (unsigned int i = 0; i < l; i++) {
0354         QChar cc = s[i];
0355         if (cc > '9' || (cc < '0' && cc != '-' && cc != '*' && cc != '.')) {
0356             str[i] = ' ';
0357         }
0358     }
0359     str = str.simplified();
0360
0361     len = str.count(' ') + 1;
0362     khtml::Length *r = new khtml::Length[len];
0363
0364     int j = 0;
0365     int pos = 0;
0366     int pos2;
0367
0368     while ((pos2 = str.indexOf(QLatin1Char(' '), pos)) != -1) {
0369         r[j++] = parseLength((QChar *) str.unicode() + pos, pos2 - pos);
0370         pos = pos2 + 1;
0371     }
0372     r[j] = parseLength((QChar *) str.unicode() + pos, str.length() - pos);
0373
0374     return r;
0375 }
0376
0377 khtml::Length *DOMStringImpl::toLengthArray(int &len) const
0378 {
0379     QString str(s, l);
0380     str = str.simplified();
0381
0382     len = str.count(QLatin1Char(',')) + 1;
0383
0384     // If we have no commas, we have no array.
0385     if (len == 1) {
0386         return nullptr;
0387     }
0388
0389     khtml::Length *r = new khtml::Length[len];
0390
0391     int i = 0;
0392     int pos = 0;
0393     int pos2;
0394
0395     while ((pos2 = str.indexOf(QLatin1Char(','), pos)) != -1) {
0396         r[i++] = parseLength((QChar *) str.unicode() + pos, pos2 - pos);
0397         pos = pos2 + 1;
0398     }
0399
0400     /* IE Quirk: If the last comma is the last char skip it and reduce len by one */
0401     if (str.length() - pos > 0) {
0402         r[i] = parseLength((QChar *) str.unicode() + pos, str.length() - pos);
0403     } else {
0404         len--;
0405     }
0406
0407     return r;
0408 }
0409
0410 bool DOMStringImpl::isLower() const
0411 {
0412     unsigned int i;
0413     for (i = 0; i < l; i++)
0414         if (s[i].toLower() != s[i]) {
0415             return false;
0416         }
0417     return true;
0418 }
0419
0420 DOMStringImpl *DOMStringImpl::lower() const
0421 {
0422     DOMStringImpl *c = new DOMStringImpl;
0423     if (!l) {
0424         return c;
0425     }
0426
0427     c->s = QT_ALLOC_QCHAR_VEC(l);
0428     c->l = l;
0429
0430     for (unsigned int i = 0; i < l; i++) {
0431         c->s[i] = s[i].toLower();
0432     }
0433
0434     return c;
0435 }
0436
0437 DOMStringImpl *DOMStringImpl::upper() const
0438 {
0439     DOMStringImpl *c = new DOMStringImpl;
0440     if (!l) {
0441         return c;
0442     }
0443
0444     c->s = QT_ALLOC_QCHAR_VEC(l);
0445     c->l = l;
0446
0447     for (unsigned int i = 0; i < l; i++) {
0448         c->s[i] = s[i].toUpper();
0449     }
0450
0451     return c;
0452 }
0453
0454 DOMStringImpl *DOMStringImpl::capitalize(bool noFirstCap) const
0455 {
0456     bool canCapitalize = !noFirstCap;
0457     DOMStringImpl *c = new DOMStringImpl;
0458     if (!l) {
0459         return c;
0460     }
0461
0462     c->s = QT_ALLOC_QCHAR_VEC(l);
0463     c->l = l;
0464
0465     for (unsigned int i = 0; i < l; i++) {
0466         if (s[i].isLetterOrNumber() && canCapitalize) {
0467             c->s[i] = s[i].toUpper();
0468             canCapitalize = false;
0469         } else {
0470             c->s[i] = s[i];
0471             if (s[i].isSpace()) {
0472                 canCapitalize = true;
0473             }
0474         }
0475     }
0476
0477     return c;
0478 }
0479
0480 QString DOMStringImpl::string() const
0481 {
0482     return QString(s, l);
0483 }
0484
0485 int DOMStringImpl::toInt(bool *ok) const
0486 {
0487     // match \s*[+-]?\d*
0488     unsigned i = 0;
0489     while (i < l && s[i].isSpace()) {
0490         ++i;
0491     }
0492     if (i < l && (s[i] == '+' || s[i] == '-')) {
0493         ++i;
0494     }
0495     while (i < l && s[i].isDigit()) {
0496         ++i;
0497     }
0498
0499     return QString::fromRawData(s, i).toInt(ok);
0500 }
0501
0502 float DOMStringImpl::toFloat(bool *ok) const
0503 {
0504     return QString::fromRawData(s, l).toFloat(ok);
0505 }
0506
0507 bool DOMStringImpl::endsWith(DOMStringImpl *str, CaseSensitivity cs) const
0508 {
0509     if (l < str->l) {
0510         return false;
0511     }
0512     const QChar *a = s + l - 1;
0513     const QChar *b = str->s + str->l - 1;
0514     int i = str->l;
0515     if (cs == CaseSensitive) {
0516         while (i--) {
0517             if (*a != *b) {
0518                 return false;
0519             }
0520             a--, b--;
0521         }
0522     } else {
0523         while (i--) {
0524             if (a->toLower() != b->toLower()) {
0525                 return false;
0526             }
0527             a--, b--;
0528         }
0529     }
0530     return true;
0531 }
0532
0533 bool DOMStringImpl::startsWith(DOMStringImpl *str, CaseSensitivity cs) const
0534 {
0535     if (l < str->l) {
0536         return false;
0537     }
0538     const QChar *a = s;
0539     const QChar *b = str->s;
0540     int i = str->l;
0541     if (cs == CaseSensitive) {
0542         while (i--) {
0543             if (*a != *b) {
0544                 return false;
0545             }
0546             a++, b++;
0547         }
0548     } else {
0549         while (i--) {
0550             if (a->toLower() != b->toLower()) {
0551                 return false;
0552             }
0553             a++, b++;
0554         }
0555     }
0556     return true;
0557 }
0558
0559 DOMStringImpl *DOMStringImpl::substring(unsigned pos, unsigned len) const
0560 {
0561     if (pos >= l) {
0562         return nullptr;
0563     }
0564     if (len > l - pos) {
0565         len = l - pos;
0566     }
0567     return new DOMStringImpl(s + pos, len);
0568 }
0569
0570 static const unsigned short amp[] = {'&', 'a', 'm', 'p', ';'};
0571 static const unsigned short lt[] =  {'&', 'l', 't', ';'};
0572 static const unsigned short gt[] =  {'&', 'g', 't', ';'};
0573 static const unsigned short nbsp[] =  {'&', 'n', 'b', 's', 'p', ';'};
0574
0575 DOMStringImpl *DOMStringImpl::escapeHTML()
0576 {
0577     unsigned outL = 0;
0578     for (unsigned int i = 0; i < l; ++i) {
0579         if (s[i] == '&') {
0580             outL += 5;    //&amp;
0581         } else if (s[i] == '<' || s[i] == '>') {
0582             outL += 4;    //&gt;/&lt;
0583         } else if (s[i] == QChar::Nbsp) {
0584             outL += 6;    //&nbsp;
0585         } else {
0586             ++outL;
0587         }
0588     }
0589     if (outL == l) {
0590         return this;
0591     }
0592
0593     DOMStringImpl *toRet = new DOMStringImpl();
0594     toRet->s = QT_ALLOC_QCHAR_VEC(outL);
0595     toRet->l = outL;
0596
0597     unsigned outP = 0;
0598     for (unsigned int i = 0; i < l; ++i) {
0599         if (s[i] == '&') {
0600             memcpy(&toRet->s[outP], amp, sizeof(amp));
0601             outP += 5;
0602         } else if (s[i] == '<') {
0603             memcpy(&toRet->s[outP], lt, sizeof(lt));
0604             outP += 4;
0605         } else if (s[i] == '>') {
0606             memcpy(&toRet->s[outP], gt, sizeof(gt));
0607             outP += 4;
0608         } else if (s[i] == QChar::Nbsp) {
0609             memcpy(&toRet->s[outP], nbsp, sizeof(nbsp));
0610             outP += 6;
0611         } else {
0612             toRet->s[outP] = s[i];
0613             ++outP;
0614         }
0615     }
0616     return toRet;
0617 }
0618
0619 enum NoFoldTag    { DoNotFold };
0620 enum FoldLowerTag { FoldLower };
0621 enum FoldUpperTag { FoldUpper };
0622
0623 static inline
0624 unsigned short foldChar(unsigned short c, NoFoldTag)
0625 {
0626     return c;
0627 }
0628
0629 static inline
0630 unsigned short foldChar(unsigned short c, FoldLowerTag)
0631 {
0632     // ### fast path for first ones?
0633     return QChar::toLower(c);
0634 }
0635
0636 static inline
0637 unsigned short foldChar(unsigned short c, FoldUpperTag)
0638 {
0639     // ### fast path for first ones?
0640     return QChar::toUpper(c);
0641 }
0642
0643 // Paul Hsieh's SuperFastHash
0644 // http://www.azillionmonkeys.com/qed/hash.html
0645
0646 // Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
0647 // or anything like that.
0648 const unsigned PHI = 0x9e3779b9U;
0649
0650 template<typename FoldTag>
0651 static unsigned calcHash(const QChar *s, unsigned l, FoldTag foldMode)
0652 {
0653     // Note: this is originally from KJS
0654     unsigned hash = PHI;
0655     unsigned tmp;
0656
0657     int rem = l & 1;
0658     l >>= 1;
0659
0660     // Main loop
0661     for (; l > 0; l--) {
0662         hash += foldChar(s[0].unicode(), foldMode);
0663         tmp = (foldChar(s[1].unicode(), foldMode) << 11) ^ hash;
0664         hash = (hash << 16) ^ tmp;
0665         s += 2;
0666         hash += hash >> 11;
0667     }
0668
0669     // Handle end case
0670     if (rem) {
0671         hash += foldChar(s[0].unicode(), foldMode);
0672         hash ^= hash << 11;
0673         hash += hash >> 17;
0674     }
0675
0676     // Force "avalanching" of final 127 bits
0677     hash ^= hash << 3;
0678     hash += hash >> 5;
0679     hash ^= hash << 2;
0680     hash += hash >> 15;
0681     hash ^= hash << 10;
0682
0683     // this avoids ever returning a hash code of 0, since that is used to
0684     // signal "hash not computed yet", using a value that is likely to be
0685     // effectively the same as 0 when the low bits are masked
0686     if (hash == 0) {
0687         hash = 0x80000000;
0688     }
0689
0690     return hash;
0691 }
0692
0693 unsigned DOMStringImpl::hash() const
0694 {
0695     if (m_hash != 0) {
0696         return m_hash;
0697     }
0698
0699     return m_hash = calcHash(s, l, DoNotFold);
0700 }
0701
0702 unsigned DOMStringImpl::lowerHash() const
0703 {
0704     return calcHash(s, l, FoldLower);
0705 }
0706
0707 unsigned DOMStringImpl::upperHash() const
0708 {
0709     return calcHash(s, l, FoldUpper);
0710 }
0711
0712 unsigned DOMStringImpl::computeHash(const QChar *str, unsigned int length)
0713 {
0714     return calcHash(str, length, DoNotFold);
0715 }
0716
0717 DOMStringImpl *DOMStringImpl::empty()
0718 {
0719     static DOMString e("");
0720     return e.implementation();
0721 }
0722
0723 bool DOM::strcasecmp(const DOMStringImpl *a, const DOMStringImpl *b)
0724 {
0725     if (!(a && b)) {
0726         return (a && a->l) || (b && b->l);
0727     }
0728     if (a->l != b->l) {
0729         return true;
0730     }
0731     QChar *ai = a->s;
0732     QChar *bi = b->s;
0733     int l = a->l;
0734     while (l--) {
0735         if (*ai != *bi && ai->toLower() != bi->toLower()) {
0736             return true;
0737         }
0738         ++ai, ++bi;
0739     }
0740     return false;
0741 }
0742