use crate::lsm_tree::types::FuzzyHash;
use anyhow::{Error, Result};
use bit_vec::BitVec;
use rustc_hash::FxHasher;
use std::hash::{Hash as _, Hasher as _};
use std::marker::PhantomData;

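/// Deterministically generates `num_nonces` nonces from `seed`.  The same inputs always yield the
/// same sequence, so a reader can regenerate the hash nonces that a writer used (see the
/// `hash_nonces_are_stable` test).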
fn generate_hash_nonces(seed: u64, num_nonces: usize) -> Vec<u64> {
    use rand::rngs::SmallRng;
    use rand::{Rng as _, SeedableRng as _};
    let mut output = Vec::with_capacity(num_nonces);
    let mut rng = SmallRng::seed_from_u64(seed);
    for _ in 0..num_nonces {
        let nonce: u64 = rng.gen();
        output.push(nonce);
    }
    output
}

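/// Estimates bloom filter parameters for `num_items` entries, returning `(size_in_bits,
/// num_hashes)`.  The bit count follows the standard sizing formula for a target false-positive
/// rate `p` (here 0.1%), `m = -n * ln(p) / (ln 2)^2`, rounded up to a power of two of at least 8.
/// The number of hash functions `k` is then the smallest value for which the expected
/// false-positive rate `(1 - e^(-kn/m))^k` drops below the target.  For example, 5000 items yield
/// a 131072-bit (16 KiB) filter with 4 hashes (see the `estimated_params` test).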
fn estimate_params(num_items: usize) -> (usize, usize) {
    if num_items == 0 {
        return (0, 0);
    }
    const TARGET_FP_RATE: f64 = 0.001;
    use std::f64::consts::LN_2;

    let n = num_items as f64;
    let bits = ((n * TARGET_FP_RATE.ln()) / (-LN_2 * LN_2)).ceil() as u64;
    let bits = std::cmp::max(bits, 8).next_power_of_two();
    let m = bits as f64;

    let mut k: f64 = 0.0;
    let mut fp_rate = 1.0;
    while fp_rate > TARGET_FP_RATE {
        k += 1.0;
        fp_rate = (1f64 - (-k / (m / n)).exp()).powf(k);
    }
    (bits as usize, k.round() as usize)
}

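/// Read-only view of a serialized bloom filter.  Queries can return false positives but never
/// false negatives for values inserted by the corresponding [`BloomFilterWriter`].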
pub struct BloomFilterReader<V> {
    data: BitVec,
    hash_nonces: Vec<u64>,
    _type: PhantomData<V>,
}

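/// Summary statistics for a loaded bloom filter: `size` is the filter's size in bytes,
/// `num_nonces` the number of hash functions, and `fill_percentage` the proportion of set bits
/// (0-100).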
pub struct BloomFilterStats {
    pub size: usize,
    pub num_nonces: usize,
    pub fill_percentage: usize,
}

impl<V: FuzzyHash> BloomFilterReader<V> {
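    /// Reconstructs a filter from its serialized bit array.  `seed` and `num_nonces` must match
    /// the values used by the [`BloomFilterWriter`] that produced `buf` (see
    /// [`BloomFilterWriter::seed`] and [`BloomFilterWriter::num_nonces`]); otherwise lookups will
    /// use different hash functions than the writer did.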
    pub fn read(buf: &[u8], seed: u64, num_nonces: usize) -> Result<Self, Error> {
        Ok(Self {
            data: BitVec::from_bytes(buf),
            hash_nonces: generate_hash_nonces(seed, num_nonces),
            _type: PhantomData::default(),
        })
    }

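    /// Returns true if `value` may have been inserted and false if it definitely was not.
    /// `FuzzyHash` can yield several hashes for one value (e.g. one per partition of a
    /// range-based key such as `AllocatorKey`); the filter reports a possible match if any of
    /// them hits.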
    pub fn maybe_contains(&self, value: &V) -> bool {
        let mut num = 0;
        for hash in value.fuzzy_hash() {
            if self.maybe_contains_inner(hash) {
                return true;
            }
            num += 1;
            debug_assert!(num < 4, "Too many hash partitions");
        }
        false
    }

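    /// Checks a single hash partition.  Each nonce is written into the running FxHasher (seeded
    /// with the value's fuzzy hash) and the resulting digest selects one bit; the value may be
    /// present only if every selected bit is set.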
    fn maybe_contains_inner(&self, initial_hash: u64) -> bool {
        let mut hasher = FxHasher::default();
        initial_hash.hash(&mut hasher);
        for nonce in &self.hash_nonces {
            hasher.write_u64(*nonce);
            let idx = hasher.finish() as usize % self.data.len();
            if !self.data.get(idx).unwrap() {
                return false;
            }
        }
        true
    }

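    /// Returns summary statistics for this filter.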
    pub fn stats(&self) -> BloomFilterStats {
        BloomFilterStats {
            size: self.data.len().div_ceil(8),
            num_nonces: self.hash_nonces.len(),
            fill_percentage: self.compute_fill_percentage(),
        }
    }

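    /// Returns the percentage of bits that are set, rounded up, or 0 for an empty filter.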
    fn compute_fill_percentage(&self) -> usize {
        if self.data.is_empty() {
            return 0;
        }
        (100 * self.data.iter().filter(|x| *x).count()).div_ceil(self.data.len())
    }

    #[cfg(test)]
    fn new_empty(num_items: usize) -> Self {
        let (bits, num_nonces) = estimate_params(num_items);
        Self {
            data: BitVec::from_elem(bits, false),
            hash_nonces: generate_hash_nonces(0, num_nonces),
            _type: PhantomData::default(),
        }
    }

    #[cfg(test)]
    fn new_full(num_items: usize) -> Self {
        let (bits, num_nonces) = estimate_params(num_items);
        Self {
            data: BitVec::from_elem(bits, true),
            hash_nonces: generate_hash_nonces(0, num_nonces),
            _type: PhantomData::default(),
        }
    }
}

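/// Builder for a bloom filter.  Values are added with [`insert`](Self::insert) and the bit array
/// is serialized with [`write`](Self::write); a [`BloomFilterReader`] created with the same seed
/// and nonce count then answers [`maybe_contains`](BloomFilterReader::maybe_contains) queries
/// against it.
///
/// A rough usage sketch, mirroring the `test_serde` round trip below (`i32` stands in for any
/// type implementing `FuzzyHash`, as in the tests):
///
/// ```ignore
/// let mut writer = BloomFilterWriter::new(/* seed */ 0, /* num_items */ 1);
/// writer.insert(&42i32);
/// let num_nonces = writer.num_nonces();
/// let mut buf = vec![];
/// writer.write(&mut std::io::Cursor::new(&mut buf)).expect("write failed");
/// let reader = BloomFilterReader::read(&buf[..], /* seed */ 0, num_nonces).expect("read failed");
/// assert!(reader.maybe_contains(&42i32));
/// ```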
pub struct BloomFilterWriter<V> {
    data: BitVec,
    seed: u64,
    hash_nonces: Vec<u64>,
    _type: PhantomData<V>,
}

impl<V: FuzzyHash> BloomFilterWriter<V> {
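    /// Creates an empty filter sized (via `estimate_params`) for roughly `num_items` insertions.
    /// `seed` determines the hash nonces and must be passed to [`BloomFilterReader::read`] when
    /// the serialized filter is loaded again.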
    pub fn new(seed: u64, num_items: usize) -> Self {
        let (bits, num_nonces) = estimate_params(num_items);
        Self {
            data: BitVec::from_elem(bits, false),
            seed,
            hash_nonces: generate_hash_nonces(seed, num_nonces),
            _type: PhantomData::default(),
        }
    }

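    /// Returns the serialized size in bytes.  The bit count chosen by `estimate_params` is always
    /// a power of two of at least 8, so the division is exact.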
    pub fn serialized_size(&self) -> usize {
        self.data.len() / 8
    }

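    /// Returns the seed the hash nonces were generated from.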
    pub fn seed(&self) -> u64 {
        self.seed
    }

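    /// Returns the number of hash nonces, i.e. the number of hash functions used per lookup.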
    pub fn num_nonces(&self) -> usize {
        self.hash_nonces.len()
    }

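    /// Serializes the filter's bit array to `writer`.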
    pub fn write<W>(&self, writer: &mut W) -> Result<(), Error>
    where
        W: std::io::Write,
    {
        Ok(writer.write_all(&self.data.to_bytes()[..])?)
    }

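    /// Adds `value` to the filter, setting the bits for every hash that its `fuzzy_hash`
    /// implementation yields.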
    pub fn insert(&mut self, value: &V) {
        for hash in value.fuzzy_hash() {
            self.insert_inner(hash);
        }
    }

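    /// Sets one bit per hash nonce, using the same derivation as
    /// `BloomFilterReader::maybe_contains_inner` so reads and writes agree on bit positions.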
    fn insert_inner(&mut self, initial_hash: u64) {
        let mut hasher = FxHasher::default();
        initial_hash.hash(&mut hasher);
        for nonce in &self.hash_nonces {
            hasher.write_u64(*nonce);
            let idx = hasher.finish() as usize % self.data.len();
            self.data.set(idx, true);
        }
    }
}

#[cfg(test)]
impl<T> From<BloomFilterWriter<T>> for BloomFilterReader<T> {
    fn from(writer: BloomFilterWriter<T>) -> Self {
        Self { data: writer.data, hash_nonces: writer.hash_nonces, _type: PhantomData::default() }
    }
}

#[cfg(test)]
mod tests {
    use crate::lsm_tree::bloom_filter::{
        estimate_params, generate_hash_nonces, BloomFilterReader, BloomFilterWriter,
    };
    use crate::object_store::allocator::AllocatorKey;

    #[test]
    fn estimated_params() {
        assert_eq!(estimate_params(0), (0, 0));
        assert_eq!(estimate_params(1), (16, 6));
        assert_eq!(estimate_params(5000), (131072, 4));
        assert_eq!(estimate_params(50000), (1048576, 4));
        assert_eq!(estimate_params(5_000_000), (134217728, 4));
    }

    #[test]
    fn hash_nonces_are_stable() {
        let expected: [u64; 16] = [
            15601892068231251798,
            4249550343631144082,
            11170505403239506035,
            3141899402926357684,
            10455158105512296971,
            1544954577306875038,
            7546141422416683882,
            10809374736430664972,
            14270886949990153586,
            12890306703619732195,
            10531031577317334640,
            2369458071968706433,
            8554654157920588268,
            4945022529079265586,
            4849687277068177250,
            11824207122193630909,
        ];
        for i in 1..=expected.len() {
            let generated = generate_hash_nonces(0xfeedbad, i);
            assert_eq!(&generated[..], &expected[..i]);
        }
    }

    const TEST_KEYS: [i32; 4] = [0, 65535, i32::MAX, i32::MIN];

    #[test]
    fn test_empty() {
        let filter = BloomFilterReader::new_empty(TEST_KEYS.len());
        for key in &TEST_KEYS {
            assert!(!filter.maybe_contains(key));
        }
    }

    #[test]
    fn test_full() {
        let filter = BloomFilterReader::new_full(TEST_KEYS.len());
        for key in &TEST_KEYS {
            assert!(filter.maybe_contains(key));
        }
    }

    #[test]
    fn test_insert() {
        for key in &TEST_KEYS {
            let mut filter = BloomFilterWriter::new(0, TEST_KEYS.len());
            filter.insert(key);
            let filter = BloomFilterReader::from(filter);
            assert!(filter.maybe_contains(key));
        }
    }

    #[test]
    fn test_range_key() {
        let mut filter = BloomFilterWriter::new(0, 2);
        filter.insert(&AllocatorKey { device_range: 0..2097152 });
        filter.insert(&AllocatorKey { device_range: 4194304..4194305 });
        let filter = BloomFilterReader::from(filter);

        assert!(filter.maybe_contains(&AllocatorKey { device_range: 0..1 }));
        assert!(filter.maybe_contains(&AllocatorKey { device_range: 2097151..2097152 }));
        assert!(!filter.maybe_contains(&AllocatorKey { device_range: 2097152..2097153 }));
        assert!(!filter.maybe_contains(&AllocatorKey { device_range: 3145727..3145728 }));
        assert!(filter.maybe_contains(&AllocatorKey { device_range: 4193404..4194305 }));

        assert!(filter.maybe_contains(&AllocatorKey { device_range: 0..2097153 }));
        assert!(filter.maybe_contains(&AllocatorKey { device_range: 2097152..4194305 }));
        assert!(!filter.maybe_contains(&AllocatorKey { device_range: 104857600..104857601 }));
    }

    #[test]
    fn test_serde() {
        for key in &TEST_KEYS {
            let mut filter = BloomFilterWriter::new(0, 1);
            filter.insert(key);
            let num_nonces = filter.num_nonces();
            let mut buf = vec![];
            {
                let mut cursor = std::io::Cursor::new(&mut buf);
                filter.write(&mut cursor).expect("write failed");
            }
            let filter = BloomFilterReader::read(&buf[..], 0, num_nonces).expect("read failed");
            assert!(filter.maybe_contains(key));
        }
    }
}