File indexing completed on 2024-11-17 04:55:17

0001 /*
0002     SPDX-License-Identifier: MPL-2.0
0003 */
0004 
0005 /* Copyright (c) 2015 Brian R. Bondy. Distributed under the MPL2 license.
0006  * This Source Code Form is subject to the terms of the Mozilla Public
0007  * License, v. 2.0. If a copy of the MPL was not distributed with this
0008  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
0009 
0010 #ifndef BLOOMFILTER_H_
0011 #define BLOOMFILTER_H_
0012 
0013 #include <math.h>
0014 #include <stdint.h>
0015 #include "./hashFn.h"
0016 #include "./base.h"
0017 
0018 static HashFn h1(13);
0019 static HashFn h2(17);
0020 static HashFn h3(31);
0021 static HashFn h4(41);
0022 static HashFn h5(53);
0023 static HashFn defaultHashFns[5] = {h1, h2, h3, h4, h5};
0024 
0025 
0026 /**
0027  * Implements a Bloom Filter using Rabin Karp for char* buffer lookups
0028  */
0029 class BloomFilter {
0030  public:
0031     BloomFilter(unsigned int bitsPerElement = 10,
0032         unsigned int estimatedNumElements = 50000,
0033         HashFn hashFns[] = defaultHashFns,
0034         int numHashFns = sizeof(defaultHashFns)/sizeof(defaultHashFns[0]));
0035     BloomFilter(const char *buffer, int byteBufferSize,
0036         HashFn hashFns[] = defaultHashFns,
0037         int numHashFns = sizeof(defaultHashFns)/sizeof(defaultHashFns[0]));
0038     virtual ~BloomFilter();
0039     // Sets the specified bit in the buffer
0040     void setBit(unsigned int bitLocation);
0041     // Checks if the specified bit is set in the buffer
0042     bool isBitSet(unsigned int bitLocation);
0043     // Adds the specified buffer to the bloom filter
0044     void add(const char *input, int len);
0045     void add(const char *sz);
0046     // Empty the Bloom Filter
0047     void clear();
0048 
0049     /**
0050      * Checks whether an element probably exists in the set, or definitely
0051      * doesn't.
0052      * @param sz Either a string to check for existance or an array of the
0053      *   string's char codes. The main reason why you'd want to pass in a char
0054      *   code array is because passing a string will use JS directly to get
0055      *   the char codes which is very inneficient compared to calling into C++
0056      *   code to get it and then making the call.
0057      *
0058      * Returns true if the element probably exists in the set
0059      * Returns false if the element definitely does not exist in the set
0060      */
0061     bool exists(const char *input, int len);
0062     bool exists(const char *sz);
0063 
0064     /**
0065      * Checks if any substring of length substringLength probably exists or
0066      * definitely doesn't. If false is returned then no substring of the
0067      * specified string of the specified length is in the bloom filter
0068      *
0069      * @param data The substring or char array to check substrings on.
0070      */
0071     bool substringExists(const char *data, int dataLen, int substringLength);
0072     bool substringExists(const char *sz, int substringLength);
0073 
0074     /**
0075      * Obtains the buffer used as the bloom filter data
0076      */
0077     const char * getBuffer() {
0078       return buffer;
0079     }
0080 
0081     /**
0082      * Obtains the Bloom Filter's buffer size in bytes
0083      */
0084     int getByteBufferSize() {
0085       return byteBufferSize;
0086     }
0087 
0088  private:
0089   HashFn *hashFns;
0090   uint64_t *lastHashes;
0091   int numHashFns;
0092   unsigned int byteBufferSize;
0093   unsigned int bitBufferSize;
0094   char *buffer;
0095 
0096   /**
0097    * Obtains the hashes for the specified charCodes
0098    * See "Rabin fingerprint" in
0099    * https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
0100    * for more information.
0101    *
0102    * @param charCodes An array of the char codes to use for the hash
0103    * @param lastHashes Input and output for the last hash value
0104    * function for a faster computation.  Must be called with lastCharCode but
0105    * can be nullptr otherwise.
0106    *
0107    * @param newHashses fills in the corresponding new hashes, can be the same
0108    *  as lastHashes
0109    * @param lastCharCode if specified, it will pass the last char code
0110    *  to the hashing function for a faster computation. Must be called
0111    *  with lastHashes.
0112    */
0113   void getHashesForCharCodes(const char *input, int inputLen,
0114       uint64_t *lastHashes, uint64_t *newHashes, unsigned char lastCharCode);
0115 };
0116 
0117 #endif  // BLOOMFILTER_H_