src/misc/StringHash.h

0001 /*
0002  * Copyright (C) 2006, 2007, 2008 Apple Inc. All rights reserved
0003  *
0004  * This library is free software; you can redistribute it and/or
0005  * modify it under the terms of the GNU Library General Public
0006  * License as published by the Free Software Foundation; either
0007  * version 2 of the License, or (at your option) any later version.
0008  *
0009  * This library is distributed in the hope that it will be useful,
0010  * but WITHOUT ANY WARRANTY; without even the implied warranty of
0011  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0012  * Library General Public License for more details.
0013  *
0014  * You should have received a copy of the GNU Library General Public License
0015  * along with this library; see the file COPYING.LIB.  If not, write to
0016  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0017  * Boston, MA 02110-1301, USA.
0018  *
0019  */
0020
0021 #ifndef StringHash_h
0022 #define StringHash_h
0023
0024 #include "AtomicStringImpl.h"
0025 #include "dom/dom_string.h"
0026 #include "xml/dom_stringimpl.h"
0027 #include <wtf/HashTraits.h>
0028
0029 using DOM::DOMString;
0030 using DOM::DOMStringImpl;
0031
0032 namespace khtml
0033 {
0034
0035 // FIXME: We should really figure out a way to put the computeHash function that's
0036 // currently a member function of DOMStringImpl into this file so we can be a little
0037 // closer to having all the nearly-identical hash functions in one place.
0038
0039 struct StringHash {
0040     static unsigned hash(DOMStringImpl *key)
0041     {
0042         return key->hash();
0043     }
0044     static bool equal(DOMStringImpl *a, DOMStringImpl *b)
0045     {
0046         if (a == b) {
0047             return true;
0048         }
0049         if (!a || !b) {
0050             return false;
0051         }
0052
0053         unsigned aLength = a->length();
0054         unsigned bLength = b->length();
0055         if (aLength != bLength) {
0056             return false;
0057         }
0058
0059         const uint32_t *aChars = reinterpret_cast<const uint32_t *>(a->unicode());
0060         const uint32_t *bChars = reinterpret_cast<const uint32_t *>(b->unicode());
0061
0062         unsigned halfLength = aLength >> 1;
0063         for (unsigned i = 0; i != halfLength; ++i)
0064             if (*aChars++ != *bChars++) {
0065                 return false;
0066             }
0067
0068         if (aLength & 1 && *reinterpret_cast<const uint16_t *>(aChars) != *reinterpret_cast<const uint16_t *>(bChars)) {
0069             return false;
0070         }
0071
0072         return true;
0073     }
0074
0075     static unsigned hash(const RefPtr<DOMStringImpl> &key)
0076     {
0077         return key->hash();
0078     }
0079     static bool equal(const RefPtr<DOMStringImpl> &a, const RefPtr<DOMStringImpl> &b)
0080     {
0081         return equal(a.get(), b.get());
0082     }
0083
0084     static unsigned hash(const DOMString &key)
0085     {
0086         return key.implementation()->hash();
0087     }
0088     static bool equal(const DOMString &a, const DOMString &b)
0089     {
0090         return equal(a.implementation(), b.implementation());
0091     }
0092
0093     static const bool safeToCompareToEmptyOrDeleted = false;
0094 };
0095
0096 class CaseFoldingHash
0097 {
0098 private:
0099     // Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
0100     static const unsigned PHI = 0x9e3779b9U;
0101 public:
0102     // Paul Hsieh's SuperFastHash
0103     // http://www.azillionmonkeys.com/qed/hash.html
0104     static unsigned hash(const QChar *data, unsigned length)
0105     {
0106         unsigned l = length;
0107         const QChar *s = data;
0108         uint32_t hash = PHI;
0109         uint32_t tmp;
0110
0111         int rem = l & 1;
0112         l >>= 1;
0113
0114         // Main loop.
0115         for (; l > 0; l--) {
0116             hash += s[0].toCaseFolded().unicode();
0117             tmp = (s[1].toCaseFolded().unicode() << 11) ^ hash;
0118             hash = (hash << 16) ^ tmp;
0119             s += 2;
0120             hash += hash >> 11;
0121         }
0122
0123         // Handle end case.
0124         if (rem) {
0125             hash += s[0].toCaseFolded().unicode();
0126             hash ^= hash << 11;
0127             hash += hash >> 17;
0128         }
0129
0130         // Force "avalanching" of final 127 bits.
0131         hash ^= hash << 3;
0132         hash += hash >> 5;
0133         hash ^= hash << 2;
0134         hash += hash >> 15;
0135         hash ^= hash << 10;
0136
0137         // This avoids ever returning a hash code of 0, since that is used to
0138         // signal "hash not computed yet", using a value that is likely to be
0139         // effectively the same as 0 when the low bits are masked.
0140         hash |= !hash << 31;
0141
0142         return hash;
0143     }
0144
0145     static unsigned hash(DOMStringImpl *str)
0146     {
0147         return hash(str->unicode(), str->length());
0148     }
0149
0150     static unsigned hash(const char *str, unsigned length)
0151     {
0152         // This hash is designed to work on 16-bit chunks at a time. But since the normal case
0153         // (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
0154         // were 16-bit chunks, which will give matching results.
0155
0156         unsigned l = length;
0157         const char *s = str;
0158         uint32_t hash = PHI;
0159         uint32_t tmp;
0160
0161         int rem = l & 1;
0162         l >>= 1;
0163
0164         // Main loop
0165         for (; l > 0; l--) {
0166             hash += QChar::toCaseFolded((unsigned int)s[0]);
0167             tmp = (QChar::toCaseFolded((unsigned int)s[1]) << 11) ^ hash;
0168             hash = (hash << 16) ^ tmp;
0169             s += 2;
0170             hash += hash >> 11;
0171         }
0172
0173         // Handle end case
0174         if (rem) {
0175             hash += QChar::toCaseFolded((unsigned int)s[0]);
0176             hash ^= hash << 11;
0177             hash += hash >> 17;
0178         }
0179
0180         // Force "avalanching" of final 127 bits
0181         hash ^= hash << 3;
0182         hash += hash >> 5;
0183         hash ^= hash << 2;
0184         hash += hash >> 15;
0185         hash ^= hash << 10;
0186
0187         // this avoids ever returning a hash code of 0, since that is used to
0188         // signal "hash not computed yet", using a value that is likely to be
0189         // effectively the same as 0 when the low bits are masked
0190         hash |= !hash << 31;
0191
0192         return hash;
0193     }
0194
0195     static bool equal(DOMStringImpl *a, DOMStringImpl *b)
0196     {
0197         if (a == b) {
0198             return true;
0199         }
0200         if (!a || !b) {
0201             return false;
0202         }
0203         unsigned length = a->length();
0204         if (length != b->length()) {
0205             return false;
0206         }
0207         for (unsigned i = 0; i < length; ++i)
0208             if (a->unicode()[i].toCaseFolded() != b->unicode()[i].toCaseFolded()) {
0209                 return false;
0210             }
0211         return true;
0212     }
0213
0214     static unsigned hash(const RefPtr<DOMStringImpl> &key)
0215     {
0216         return hash(key.get());
0217     }
0218
0219     static bool equal(const RefPtr<DOMStringImpl> &a, const RefPtr<DOMStringImpl> &b)
0220     {
0221         return equal(a.get(), b.get());
0222     }
0223
0224     static unsigned hash(const DOMString &key)
0225     {
0226         return hash(key.implementation());
0227     }
0228     static bool equal(const DOMString &a, const DOMString &b)
0229     {
0230         return equal(a.implementation(), b.implementation());
0231     }
0232
0233     static const bool safeToCompareToEmptyOrDeleted = false;
0234 };
0235
0236 // This hash can be used in cases where the key is a hash of a string, but we don't
0237 // want to store the string. It's not really specific to string hashing, but all our
0238 // current uses of it are for strings.
0239 struct AlreadyHashed : IntHash<unsigned> {
0240     static unsigned hash(unsigned key)
0241     {
0242         return key;
0243     }
0244
0245     // To use a hash value as a key for a hash table, we need to eliminate the
0246     // "deleted" value, which is negative one. That could be done by changing
0247     // the string hash function to never generate negative one, but this works
0248     // and is still relatively efficient.
0249     static unsigned avoidDeletedValue(unsigned hash)
0250     {
0251         ASSERT(hash);
0252         unsigned newHash = hash | (!(hash + 1) << 31);
0253         ASSERT(newHash);
0254         ASSERT(newHash != 0xFFFFFFFF);
0255         return newHash;
0256     }
0257 };
0258
0259 }
0260
0261 namespace WTF
0262 {
0263
0264 /*template<> struct HashTraits<DOM::DOMString> : GenericHashTraits<DOM::DOMString> {
0265     static const bool emptyValueIsZero = true;
0266     static void constructDeletedValue(DOM::DOMString& slot) { new (&slot) DOM::DOMString(HashTableDeletedValue); }
0267     static bool isDeletedValue(const DOM::DOMString& slot) { return slot.isHashTableDeletedValue(); }
0268 };*/
0269
0270 }
0271
0272 #endif
0273