third-party/ad-block/hash_set.h

0001 /*
0002     SPDX-License-Identifier: MPL-2.0
0003 */
0004
0005 /* Copyright (c) 2015 Brian R. Bondy. Distributed under the MPL2 license.
0006  * This Source Code Form is subject to the terms of the Mozilla Public
0007  * License, v. 2.0. If a copy of the MPL was not distributed with this
0008  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
0009
0010 // You should probably not be using this.  This is only useful in environments
0011 // without std lib and having specific serialization and memory requirements.
0012 // Instead consider using `hash_set` which is a more generic implementation
0013 // with templates.
0014
0015 #ifndef HASH_SET_H_
0016 #define HASH_SET_H_
0017
0018 #include <stdint.h>
0019 #include <math.h>
0020 #include <string.h>
0021 #include <stdio.h>
0022 #include <vector>
0023
0024 #include "./base.h"
0025 #include "./hash_item.h"
0026
0027 template<class T>
0028 class HashSet {
0029  public:
0030   typedef uint64_t (*HashSetFnPtr)(const T *hash_item);
0031
0032   /**
0033    * @param bucket_count The number of buckets to create for the hash set
0034    * @param multi_set Allow multiple items with the same hash to be added.
0035    */
0036   explicit HashSet(uint32_t bucket_count, bool multi_set)
0037       : multi_set_(multi_set) {
0038     Init(bucket_count);
0039   }
0040
0041   ~HashSet() {
0042     Cleanup();
0043   }
0044
0045   /**
0046    * Adds the specified data if it doesn't exist
0047    * A copy of the data will be created, so the memory used to do the add
0048    * doesn't need to stick around.
0049    *
0050    * @param item_to_add The node to add
0051    * @param update_if_exists true if the item should be updated if it is already there
0052    *   false if the add should fail if it is alraedy there. If multi_set is true and
0053    *   upate_if_exists is false than the same hash'ed item will be added again.
0054    * @return true if the data was added
0055    */
0056   bool Add(const T &item_to_add, bool update_if_exists = true) {
0057     uint64_t hash = item_to_add.GetHash();
0058     HashItem<T> *hash_item = buckets_[hash % bucket_count_];
0059     if (!hash_item) {
0060       hash_item = new HashItem<T>();
0061       hash_item->hash_item_storage_ = new T(item_to_add);
0062       buckets_[hash % bucket_count_] = hash_item;
0063       size_++;
0064       return true;
0065     }
0066
0067     while (true) {
0068       if (*hash_item->hash_item_storage_ == item_to_add) {
0069         if (update_if_exists) {
0070           hash_item->hash_item_storage_->Update(item_to_add);
0071           return false;
0072         } else if (!multi_set_) {
0073           return false;
0074         }
0075       }
0076       if (!hash_item->next_) {
0077         HashItem<T> *created_hash_item = new HashItem<T>();
0078         created_hash_item->hash_item_storage_ = new T(item_to_add);
0079         hash_item->next_ = created_hash_item;
0080         break;
0081       }
0082       hash_item = hash_item->next_;
0083     }
0084
0085     size_++;
0086     return true;
0087   }
0088
0089   /**
0090    * Determines if the specified data exists in the set or not`
0091    * @param data_to_check The data to check
0092    * @return true if the data found
0093    */
0094   bool Exists(const T &data_to_check) {
0095     uint64_t hash = data_to_check.GetHash();
0096     HashItem<T> *hash_item = buckets_[hash % bucket_count_];
0097     if (!hash_item) {
0098       return false;
0099     }
0100
0101     while (hash_item) {
0102       if (*hash_item->hash_item_storage_ == data_to_check) {
0103         return true;
0104       }
0105       hash_item = hash_item->next_;
0106     }
0107
0108     return false;
0109   }
0110
0111   /**
0112    * Determines if the specified data exists in the set or not`
0113    * @param data_to_check The data to check
0114    * @return true if the data found
0115    */
0116   size_t GetMatchingCount(const T &data_to_check) {
0117     size_t count = 0;
0118     uint64_t hash = data_to_check.GetHash();
0119     HashItem<T> *hash_item = buckets_[hash % bucket_count_];
0120     if (!hash_item) {
0121       return count;
0122     }
0123
0124     while (hash_item) {
0125       if (*hash_item->hash_item_storage_ == data_to_check) {
0126         count++;
0127       }
0128       hash_item = hash_item->next_;
0129     }
0130
0131     return count;
0132   }
0133
0134   /**
0135    * Finds the specific data in the hash set.
0136    * This is useful because sometimes it contains more context
0137    * than the object used for the lookup.
0138    * @param data_to_check The data to check
0139    * @return The data stored in the hash set or nullptr if none is found.
0140    */
0141   T * Find(const T &data_to_check) {
0142     uint64_t hash = data_to_check.GetHash();
0143     HashItem<T> *hash_item = buckets_[hash % bucket_count_];
0144     if (!hash_item) {
0145       return nullptr;
0146     }
0147
0148     while (hash_item) {
0149       if (*hash_item->hash_item_storage_ == data_to_check) {
0150         return hash_item->hash_item_storage_;
0151       }
0152       hash_item = hash_item->next_;
0153     }
0154
0155     return nullptr;
0156   }
0157
0158   /**
0159    * Finds the specific data in the hash set.
0160    * This is useful because sometimes it contains more context
0161    * than the object used for the lookup.
0162    * @param data_to_check The data to check
0163    * @return The data stored in the hash set or nullptr if none is found.
0164    */
0165   void FindAll(const T &data_to_check, std::vector<T*> *result) {
0166     uint64_t hash = data_to_check.GetHash();
0167     HashItem<T> *hash_item = buckets_[hash % bucket_count_];
0168     if (!hash_item) {
0169       return;
0170     }
0171
0172     while (hash_item) {
0173       if (*hash_item->hash_item_storage_ == data_to_check) {
0174         result->push_back(hash_item->hash_item_storage_);
0175       }
0176       hash_item = hash_item->next_;
0177     }
0178   }
0179
0180   /**
0181    * Removes the specific data in the hash set.
0182    * @param data_to_check The data to remove
0183    * @return true if an item matching the data was removed
0184    */
0185   bool Remove(const T &data_to_check) {
0186     uint64_t hash = data_to_check.GetHash();
0187     HashItem<T> *hash_item = buckets_[hash % bucket_count_];
0188     if (!hash_item) {
0189       return false;
0190     }
0191
0192     HashItem<T> *last_item = nullptr;
0193     while (hash_item) {
0194       if (*hash_item->hash_item_storage_ == data_to_check) {
0195         if (last_item) {
0196           last_item->next_ = hash_item->next_;
0197           delete hash_item;
0198         } else {
0199           buckets_[hash % bucket_count_] = hash_item->next_;
0200           delete hash_item;
0201         }
0202         size_--;
0203         return true;
0204       }
0205
0206       last_item = hash_item;
0207       hash_item = hash_item->next_;
0208     }
0209
0210     return false;
0211   }
0212
0213
0214   /**
0215    * Obtains the number of items in the Hash Set
0216    */
0217   uint32_t GetSize() {
0218     return size_;
0219   }
0220
0221   /**
0222    * Serializes the parsed data and bloom filter data into a single buffer.
0223    * @param size The size is returned in the out parameter if it's needed to
0224    * write to a file.
0225    * @return The returned buffer should be deleted by the caller.
0226    */
0227   char * Serialize(uint32_t *size) {
0228      *size = 0;
0229      *size += SerializeBuckets(nullptr);
0230      char *buffer = new char[*size];
0231      memset(buffer, 0, *size);
0232      SerializeBuckets(buffer);
0233      return buffer;
0234   }
0235
0236   /**
0237    * Deserializes the buffer.
0238    * Memory passed in will be used by this instance directly without copying
0239    * it in.
0240    * @param buffer The serialized data to deserialize
0241    * @param buffer_size the size of the buffer to deserialize
0242    * @return true if the operation was successful
0243    */
0244   bool Deserialize(char *buffer, uint32_t buffer_size) {
0245     Cleanup();
0246     uint32_t pos = 0;
0247     if (!HasNewlineBefore(buffer, buffer_size)) {
0248       return false;
0249     }
0250
0251     uint32_t multi_set = 0;
0252     sscanf(buffer + pos, "%x,%x", &bucket_count_, &multi_set);
0253     multi_set_ = multi_set != 0;
0254     buckets_ = new HashItem<T> *[bucket_count_];
0255     memset(buckets_, 0, sizeof(HashItem<T>*) * bucket_count_);
0256     pos += static_cast<uint32_t>(strlen(buffer + pos)) + 1;
0257     if (pos >= buffer_size) {
0258       return false;
0259     }
0260     for (uint32_t i = 0; i < bucket_count_; i++) {
0261       HashItem<T> *last_hash_item = nullptr;
0262       while (*(buffer + pos) != '\0') {
0263         if (pos >= buffer_size) {
0264           return false;
0265         }
0266
0267         HashItem<T> *hash_item = new HashItem<T>();
0268         hash_item->hash_item_storage_ = new T();
0269         uint32_t deserialize_size =
0270           hash_item->hash_item_storage_->Deserialize(buffer + pos,
0271               buffer_size - pos);
0272         pos += deserialize_size;
0273         if (pos >= buffer_size || deserialize_size == 0) {
0274           return false;
0275         }
0276
0277         size_++;
0278
0279         if (last_hash_item) {
0280           last_hash_item->next_ = hash_item;
0281         } else {
0282           buckets_[i] = hash_item;
0283         }
0284         last_hash_item = hash_item;
0285       }
0286       pos++;
0287     }
0288     return true;
0289   }
0290
0291   /**
0292    * Clears the HashSet back to the original dimensions but
0293    * with no data.
0294    */
0295   void Clear() {
0296     auto old_bucket_count = bucket_count_;
0297     Cleanup();
0298     Init(old_bucket_count);
0299   }
0300
0301   /**
0302    * Returns true if the hash_set is a multi_set
0303    */
0304   bool IsMultiSet() const {
0305     return multi_set_;
0306   }
0307
0308  private:
0309   bool HasNewlineBefore(char *buffer, uint32_t buffer_size) {
0310     char *p = buffer;
0311     for (uint32_t i = 0; i < buffer_size; ++i) {
0312       if (*p == '\0')
0313         return true;
0314       p++;
0315     }
0316     return false;
0317   }
0318
0319   void Init(uint32_t num_buckets) {
0320     bucket_count_ = num_buckets;
0321     buckets_ = nullptr;
0322     size_ = 0;
0323     if (bucket_count_ != 0) {
0324       buckets_ = new HashItem<T>*[bucket_count_];
0325       memset(buckets_, 0, sizeof(HashItem<T>*) * bucket_count_);
0326     }
0327   }
0328
0329   void Cleanup() {
0330     if (buckets_) {
0331       for (uint32_t i = 0; i < bucket_count_; i++) {
0332         HashItem<T> *hash_item = buckets_[i];
0333         while (hash_item) {
0334           HashItem<T> *temp_hash_item = hash_item;
0335           hash_item = hash_item->next_;
0336           delete temp_hash_item;
0337         }
0338       }
0339       delete[] buckets_;
0340       buckets_ = nullptr;
0341       bucket_count_ = 0;
0342       size_ = 0;
0343     }
0344   }
0345
0346   uint32_t SerializeBuckets(char *buffer) {
0347     uint32_t total_size = 0;
0348     char sz[512];
0349     total_size += 1 + snprintf(sz, sizeof(sz), "%x,%x",
0350         bucket_count_, multi_set_ ? 1 : 0);
0351     if (buffer) {
0352       memcpy(buffer, sz, total_size);
0353     }
0354     for (uint32_t i = 0; i < bucket_count_; i++) {
0355       HashItem<T> *hash_item = buckets_[i];
0356       while (hash_item) {
0357         if (buffer) {
0358           total_size +=
0359             hash_item->hash_item_storage_->Serialize(buffer + total_size);
0360         } else {
0361           total_size += hash_item->hash_item_storage_->Serialize(nullptr);
0362         }
0363         hash_item = hash_item->next_;
0364       }
0365       if (buffer) {
0366         buffer[total_size] = '\0';
0367       }
0368       // Second null terminator to show next bucket
0369       total_size++;
0370     }
0371     return total_size;
0372   }
0373
0374  protected:
0375   bool multi_set_;
0376   uint32_t bucket_count_;
0377   HashItem<T> **buckets_;
0378   uint32_t size_;
0379 };
0380
0381 #endif  // HASH_SET_H_