File indexing completed on 2024-11-17 04:55:17
0001 /* 0002 SPDX-License-Identifier: MPL-2.0 0003 */ 0004 0005 /* Copyright (c) 2015 Brian R. Bondy. Distributed under the MPL2 license. 0006 * This Source Code Form is subject to the terms of the Mozilla Public 0007 * License, v. 2.0. If a copy of the MPL was not distributed with this 0008 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 0009 0010 #ifndef BLOOMFILTER_H_ 0011 #define BLOOMFILTER_H_ 0012 0013 #include <math.h> 0014 #include <stdint.h> 0015 #include "./hashFn.h" 0016 #include "./base.h" 0017 0018 static HashFn h1(13); 0019 static HashFn h2(17); 0020 static HashFn h3(31); 0021 static HashFn h4(41); 0022 static HashFn h5(53); 0023 static HashFn defaultHashFns[5] = {h1, h2, h3, h4, h5}; 0024 0025 0026 /** 0027 * Implements a Bloom Filter using Rabin Karp for char* buffer lookups 0028 */ 0029 class BloomFilter { 0030 public: 0031 BloomFilter(unsigned int bitsPerElement = 10, 0032 unsigned int estimatedNumElements = 50000, 0033 HashFn hashFns[] = defaultHashFns, 0034 int numHashFns = sizeof(defaultHashFns)/sizeof(defaultHashFns[0])); 0035 BloomFilter(const char *buffer, int byteBufferSize, 0036 HashFn hashFns[] = defaultHashFns, 0037 int numHashFns = sizeof(defaultHashFns)/sizeof(defaultHashFns[0])); 0038 virtual ~BloomFilter(); 0039 // Sets the specified bit in the buffer 0040 void setBit(unsigned int bitLocation); 0041 // Checks if the specified bit is set in the buffer 0042 bool isBitSet(unsigned int bitLocation); 0043 // Adds the specified buffer to the bloom filter 0044 void add(const char *input, int len); 0045 void add(const char *sz); 0046 // Empty the Bloom Filter 0047 void clear(); 0048 0049 /** 0050 * Checks whether an element probably exists in the set, or definitely 0051 * doesn't. 0052 * @param sz Either a string to check for existance or an array of the 0053 * string's char codes. The main reason why you'd want to pass in a char 0054 * code array is because passing a string will use JS directly to get 0055 * the char codes which is very inneficient compared to calling into C++ 0056 * code to get it and then making the call. 0057 * 0058 * Returns true if the element probably exists in the set 0059 * Returns false if the element definitely does not exist in the set 0060 */ 0061 bool exists(const char *input, int len); 0062 bool exists(const char *sz); 0063 0064 /** 0065 * Checks if any substring of length substringLength probably exists or 0066 * definitely doesn't. If false is returned then no substring of the 0067 * specified string of the specified length is in the bloom filter 0068 * 0069 * @param data The substring or char array to check substrings on. 0070 */ 0071 bool substringExists(const char *data, int dataLen, int substringLength); 0072 bool substringExists(const char *sz, int substringLength); 0073 0074 /** 0075 * Obtains the buffer used as the bloom filter data 0076 */ 0077 const char * getBuffer() { 0078 return buffer; 0079 } 0080 0081 /** 0082 * Obtains the Bloom Filter's buffer size in bytes 0083 */ 0084 int getByteBufferSize() { 0085 return byteBufferSize; 0086 } 0087 0088 private: 0089 HashFn *hashFns; 0090 uint64_t *lastHashes; 0091 int numHashFns; 0092 unsigned int byteBufferSize; 0093 unsigned int bitBufferSize; 0094 char *buffer; 0095 0096 /** 0097 * Obtains the hashes for the specified charCodes 0098 * See "Rabin fingerprint" in 0099 * https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm 0100 * for more information. 0101 * 0102 * @param charCodes An array of the char codes to use for the hash 0103 * @param lastHashes Input and output for the last hash value 0104 * function for a faster computation. Must be called with lastCharCode but 0105 * can be nullptr otherwise. 0106 * 0107 * @param newHashses fills in the corresponding new hashes, can be the same 0108 * as lastHashes 0109 * @param lastCharCode if specified, it will pass the last char code 0110 * to the hashing function for a faster computation. Must be called 0111 * with lastHashes. 0112 */ 0113 void getHashesForCharCodes(const char *input, int inputLen, 0114 uint64_t *lastHashes, uint64_t *newHashes, unsigned char lastCharCode); 0115 }; 0116 0117 #endif // BLOOMFILTER_H_