File indexing completed on 2024-11-24 04:54:36
0001 /* 0002 SPDX-License-Identifier: MPL-2.0 0003 */ 0004 0005 /* Copyright (c) 2015 Brian R. Bondy. Distributed under the MPL2 license. 0006 * This Source Code Form is subject to the terms of the Mozilla Public 0007 * License, v. 2.0. If a copy of the MPL was not distributed with this 0008 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 0009 0010 // You should probably not be using this. This is only useful in environments 0011 // without std lib and having specific serialization and memory requirements. 0012 // Instead consider using `hash_set` which is a more generic implementation 0013 // with templates. 0014 0015 #ifndef HASH_SET_H_ 0016 #define HASH_SET_H_ 0017 0018 #include <stdint.h> 0019 #include <math.h> 0020 #include <string.h> 0021 #include <stdio.h> 0022 #include <vector> 0023 0024 #include "./base.h" 0025 #include "./hash_item.h" 0026 0027 template<class T> 0028 class HashSet { 0029 public: 0030 typedef uint64_t (*HashSetFnPtr)(const T *hash_item); 0031 0032 /** 0033 * @param bucket_count The number of buckets to create for the hash set 0034 * @param multi_set Allow multiple items with the same hash to be added. 0035 */ 0036 explicit HashSet(uint32_t bucket_count, bool multi_set) 0037 : multi_set_(multi_set) { 0038 Init(bucket_count); 0039 } 0040 0041 ~HashSet() { 0042 Cleanup(); 0043 } 0044 0045 /** 0046 * Adds the specified data if it doesn't exist 0047 * A copy of the data will be created, so the memory used to do the add 0048 * doesn't need to stick around. 0049 * 0050 * @param item_to_add The node to add 0051 * @param update_if_exists true if the item should be updated if it is already there 0052 * false if the add should fail if it is alraedy there. If multi_set is true and 0053 * upate_if_exists is false than the same hash'ed item will be added again. 0054 * @return true if the data was added 0055 */ 0056 bool Add(const T &item_to_add, bool update_if_exists = true) { 0057 uint64_t hash = item_to_add.GetHash(); 0058 HashItem<T> *hash_item = buckets_[hash % bucket_count_]; 0059 if (!hash_item) { 0060 hash_item = new HashItem<T>(); 0061 hash_item->hash_item_storage_ = new T(item_to_add); 0062 buckets_[hash % bucket_count_] = hash_item; 0063 size_++; 0064 return true; 0065 } 0066 0067 while (true) { 0068 if (*hash_item->hash_item_storage_ == item_to_add) { 0069 if (update_if_exists) { 0070 hash_item->hash_item_storage_->Update(item_to_add); 0071 return false; 0072 } else if (!multi_set_) { 0073 return false; 0074 } 0075 } 0076 if (!hash_item->next_) { 0077 HashItem<T> *created_hash_item = new HashItem<T>(); 0078 created_hash_item->hash_item_storage_ = new T(item_to_add); 0079 hash_item->next_ = created_hash_item; 0080 break; 0081 } 0082 hash_item = hash_item->next_; 0083 } 0084 0085 size_++; 0086 return true; 0087 } 0088 0089 /** 0090 * Determines if the specified data exists in the set or not` 0091 * @param data_to_check The data to check 0092 * @return true if the data found 0093 */ 0094 bool Exists(const T &data_to_check) { 0095 uint64_t hash = data_to_check.GetHash(); 0096 HashItem<T> *hash_item = buckets_[hash % bucket_count_]; 0097 if (!hash_item) { 0098 return false; 0099 } 0100 0101 while (hash_item) { 0102 if (*hash_item->hash_item_storage_ == data_to_check) { 0103 return true; 0104 } 0105 hash_item = hash_item->next_; 0106 } 0107 0108 return false; 0109 } 0110 0111 /** 0112 * Determines if the specified data exists in the set or not` 0113 * @param data_to_check The data to check 0114 * @return true if the data found 0115 */ 0116 size_t GetMatchingCount(const T &data_to_check) { 0117 size_t count = 0; 0118 uint64_t hash = data_to_check.GetHash(); 0119 HashItem<T> *hash_item = buckets_[hash % bucket_count_]; 0120 if (!hash_item) { 0121 return count; 0122 } 0123 0124 while (hash_item) { 0125 if (*hash_item->hash_item_storage_ == data_to_check) { 0126 count++; 0127 } 0128 hash_item = hash_item->next_; 0129 } 0130 0131 return count; 0132 } 0133 0134 /** 0135 * Finds the specific data in the hash set. 0136 * This is useful because sometimes it contains more context 0137 * than the object used for the lookup. 0138 * @param data_to_check The data to check 0139 * @return The data stored in the hash set or nullptr if none is found. 0140 */ 0141 T * Find(const T &data_to_check) { 0142 uint64_t hash = data_to_check.GetHash(); 0143 HashItem<T> *hash_item = buckets_[hash % bucket_count_]; 0144 if (!hash_item) { 0145 return nullptr; 0146 } 0147 0148 while (hash_item) { 0149 if (*hash_item->hash_item_storage_ == data_to_check) { 0150 return hash_item->hash_item_storage_; 0151 } 0152 hash_item = hash_item->next_; 0153 } 0154 0155 return nullptr; 0156 } 0157 0158 /** 0159 * Finds the specific data in the hash set. 0160 * This is useful because sometimes it contains more context 0161 * than the object used for the lookup. 0162 * @param data_to_check The data to check 0163 * @return The data stored in the hash set or nullptr if none is found. 0164 */ 0165 void FindAll(const T &data_to_check, std::vector<T*> *result) { 0166 uint64_t hash = data_to_check.GetHash(); 0167 HashItem<T> *hash_item = buckets_[hash % bucket_count_]; 0168 if (!hash_item) { 0169 return; 0170 } 0171 0172 while (hash_item) { 0173 if (*hash_item->hash_item_storage_ == data_to_check) { 0174 result->push_back(hash_item->hash_item_storage_); 0175 } 0176 hash_item = hash_item->next_; 0177 } 0178 } 0179 0180 /** 0181 * Removes the specific data in the hash set. 0182 * @param data_to_check The data to remove 0183 * @return true if an item matching the data was removed 0184 */ 0185 bool Remove(const T &data_to_check) { 0186 uint64_t hash = data_to_check.GetHash(); 0187 HashItem<T> *hash_item = buckets_[hash % bucket_count_]; 0188 if (!hash_item) { 0189 return false; 0190 } 0191 0192 HashItem<T> *last_item = nullptr; 0193 while (hash_item) { 0194 if (*hash_item->hash_item_storage_ == data_to_check) { 0195 if (last_item) { 0196 last_item->next_ = hash_item->next_; 0197 delete hash_item; 0198 } else { 0199 buckets_[hash % bucket_count_] = hash_item->next_; 0200 delete hash_item; 0201 } 0202 size_--; 0203 return true; 0204 } 0205 0206 last_item = hash_item; 0207 hash_item = hash_item->next_; 0208 } 0209 0210 return false; 0211 } 0212 0213 0214 /** 0215 * Obtains the number of items in the Hash Set 0216 */ 0217 uint32_t GetSize() { 0218 return size_; 0219 } 0220 0221 /** 0222 * Serializes the parsed data and bloom filter data into a single buffer. 0223 * @param size The size is returned in the out parameter if it's needed to 0224 * write to a file. 0225 * @return The returned buffer should be deleted by the caller. 0226 */ 0227 char * Serialize(uint32_t *size) { 0228 *size = 0; 0229 *size += SerializeBuckets(nullptr); 0230 char *buffer = new char[*size]; 0231 memset(buffer, 0, *size); 0232 SerializeBuckets(buffer); 0233 return buffer; 0234 } 0235 0236 /** 0237 * Deserializes the buffer. 0238 * Memory passed in will be used by this instance directly without copying 0239 * it in. 0240 * @param buffer The serialized data to deserialize 0241 * @param buffer_size the size of the buffer to deserialize 0242 * @return true if the operation was successful 0243 */ 0244 bool Deserialize(char *buffer, uint32_t buffer_size) { 0245 Cleanup(); 0246 uint32_t pos = 0; 0247 if (!HasNewlineBefore(buffer, buffer_size)) { 0248 return false; 0249 } 0250 0251 uint32_t multi_set = 0; 0252 sscanf(buffer + pos, "%x,%x", &bucket_count_, &multi_set); 0253 multi_set_ = multi_set != 0; 0254 buckets_ = new HashItem<T> *[bucket_count_]; 0255 memset(buckets_, 0, sizeof(HashItem<T>*) * bucket_count_); 0256 pos += static_cast<uint32_t>(strlen(buffer + pos)) + 1; 0257 if (pos >= buffer_size) { 0258 return false; 0259 } 0260 for (uint32_t i = 0; i < bucket_count_; i++) { 0261 HashItem<T> *last_hash_item = nullptr; 0262 while (*(buffer + pos) != '\0') { 0263 if (pos >= buffer_size) { 0264 return false; 0265 } 0266 0267 HashItem<T> *hash_item = new HashItem<T>(); 0268 hash_item->hash_item_storage_ = new T(); 0269 uint32_t deserialize_size = 0270 hash_item->hash_item_storage_->Deserialize(buffer + pos, 0271 buffer_size - pos); 0272 pos += deserialize_size; 0273 if (pos >= buffer_size || deserialize_size == 0) { 0274 return false; 0275 } 0276 0277 size_++; 0278 0279 if (last_hash_item) { 0280 last_hash_item->next_ = hash_item; 0281 } else { 0282 buckets_[i] = hash_item; 0283 } 0284 last_hash_item = hash_item; 0285 } 0286 pos++; 0287 } 0288 return true; 0289 } 0290 0291 /** 0292 * Clears the HashSet back to the original dimensions but 0293 * with no data. 0294 */ 0295 void Clear() { 0296 auto old_bucket_count = bucket_count_; 0297 Cleanup(); 0298 Init(old_bucket_count); 0299 } 0300 0301 /** 0302 * Returns true if the hash_set is a multi_set 0303 */ 0304 bool IsMultiSet() const { 0305 return multi_set_; 0306 } 0307 0308 private: 0309 bool HasNewlineBefore(char *buffer, uint32_t buffer_size) { 0310 char *p = buffer; 0311 for (uint32_t i = 0; i < buffer_size; ++i) { 0312 if (*p == '\0') 0313 return true; 0314 p++; 0315 } 0316 return false; 0317 } 0318 0319 void Init(uint32_t num_buckets) { 0320 bucket_count_ = num_buckets; 0321 buckets_ = nullptr; 0322 size_ = 0; 0323 if (bucket_count_ != 0) { 0324 buckets_ = new HashItem<T>*[bucket_count_]; 0325 memset(buckets_, 0, sizeof(HashItem<T>*) * bucket_count_); 0326 } 0327 } 0328 0329 void Cleanup() { 0330 if (buckets_) { 0331 for (uint32_t i = 0; i < bucket_count_; i++) { 0332 HashItem<T> *hash_item = buckets_[i]; 0333 while (hash_item) { 0334 HashItem<T> *temp_hash_item = hash_item; 0335 hash_item = hash_item->next_; 0336 delete temp_hash_item; 0337 } 0338 } 0339 delete[] buckets_; 0340 buckets_ = nullptr; 0341 bucket_count_ = 0; 0342 size_ = 0; 0343 } 0344 } 0345 0346 uint32_t SerializeBuckets(char *buffer) { 0347 uint32_t total_size = 0; 0348 char sz[512]; 0349 total_size += 1 + snprintf(sz, sizeof(sz), "%x,%x", 0350 bucket_count_, multi_set_ ? 1 : 0); 0351 if (buffer) { 0352 memcpy(buffer, sz, total_size); 0353 } 0354 for (uint32_t i = 0; i < bucket_count_; i++) { 0355 HashItem<T> *hash_item = buckets_[i]; 0356 while (hash_item) { 0357 if (buffer) { 0358 total_size += 0359 hash_item->hash_item_storage_->Serialize(buffer + total_size); 0360 } else { 0361 total_size += hash_item->hash_item_storage_->Serialize(nullptr); 0362 } 0363 hash_item = hash_item->next_; 0364 } 0365 if (buffer) { 0366 buffer[total_size] = '\0'; 0367 } 0368 // Second null terminator to show next bucket 0369 total_size++; 0370 } 0371 return total_size; 0372 } 0373 0374 protected: 0375 bool multi_set_; 0376 uint32_t bucket_count_; 0377 HashItem<T> **buckets_; 0378 uint32_t size_; 0379 }; 0380 0381 #endif // HASH_SET_H_