fxfs/lsm_tree/
bloom_filter.rs1use crate::lsm_tree::types::FuzzyHash;
6use anyhow::{Error, Result};
7use bit_vec::BitVec;
8use std::marker::PhantomData;
9
10fn estimate_params(num_items: usize) -> (usize, usize) {
43 if num_items == 0 {
44 return (0, 0);
45 }
46 const TARGET_FP_RATE: f64 = 0.001;
47 use std::f64::consts::LN_2;
48
49 let n = num_items as f64;
50 let bits = ((n * TARGET_FP_RATE.ln()) / (-LN_2 * LN_2)).ceil() as u64;
51 let bits = std::cmp::max(bits, 8).next_power_of_two();
53 let m = bits as f64;
54
55 let mut k: f64 = 0.0;
57 let mut fp_rate = 1.0;
58 while fp_rate > TARGET_FP_RATE {
59 k += 1.0;
60 fp_rate = (1f64 - (-k / (m / n)).exp()).powf(k);
61 }
62 (bits as usize, k.round() as usize)
63}
64
65pub struct BloomFilterReader<V> {
68 data: BitVec,
69 seed: u64,
70 num_hashes: usize,
71 _type: PhantomData<V>,
72}
73
74pub struct BloomFilterStats {
75 pub size: usize,
76 pub num_hashes: usize,
77 pub fill_percentage: usize,
79}
80
81impl<V: FuzzyHash> BloomFilterReader<V> {
82 pub fn read(buf: &[u8], seed: u64, num_hashes: usize) -> Result<Self, Error> {
85 Ok(Self { data: BitVec::from_bytes(buf), seed, num_hashes, _type: PhantomData::default() })
86 }
87
88 pub fn maybe_contains(&self, value: &V) -> bool {
91 let mut num = 0;
92 for hash in value.fuzzy_hash() {
93 if self.maybe_contains_inner(hash) {
94 return true;
95 }
96 num += 1;
97 debug_assert!(num < 4, "Too many hash partitions");
98 }
99 false
100 }
101
102 fn maybe_contains_inner(&self, initial_hash: u64) -> bool {
103 for i in 0..self.num_hashes as u64 {
104 let seed = self.seed.wrapping_add(i);
105 let hash = crate::stable_hash::stable_hash_seeded(initial_hash, seed);
106 let idx = hash as usize % self.data.len();
107 if !self.data.get(idx).unwrap() {
108 return false;
109 }
110 }
111 true
112 }
113
114 pub fn stats(&self) -> BloomFilterStats {
117 BloomFilterStats {
118 size: self.data.len().div_ceil(8),
119 num_hashes: self.num_hashes,
120 fill_percentage: self.compute_fill_percentage(),
121 }
122 }
123
124 fn compute_fill_percentage(&self) -> usize {
126 if self.data.is_empty() {
127 return 0;
128 }
129 (100 * self.data.iter().filter(|x| *x).count()).div_ceil(self.data.len())
130 }
131
132 #[cfg(test)]
133 fn new_empty(num_items: usize) -> Self {
134 let (bits, num_hashes) = estimate_params(num_items);
135 Self {
136 data: BitVec::from_elem(bits, false),
137 seed: 0,
138 num_hashes,
139 _type: PhantomData::default(),
140 }
141 }
142
143 #[cfg(test)]
144 fn new_full(num_items: usize) -> Self {
145 let (bits, num_hashes) = estimate_params(num_items);
146 Self {
147 data: BitVec::from_elem(bits, true),
148 seed: 0,
149 num_hashes,
150 _type: PhantomData::default(),
151 }
152 }
153}
154
155pub struct BloomFilterWriter<V> {
158 data: BitVec,
159 seed: u64,
160 num_hashes: usize,
161 _type: PhantomData<V>,
162}
163
164impl<V: FuzzyHash> BloomFilterWriter<V> {
165 pub fn new(seed: u64, num_items: usize) -> Self {
171 let (bits, num_hashes) = estimate_params(num_items);
172 Self {
173 data: BitVec::from_elem(bits, false),
174 seed,
175 num_hashes,
176 _type: PhantomData::default(),
177 }
178 }
179
180 pub fn serialized_size(&self) -> usize {
182 self.data.len() / 8
183 }
184
185 pub fn seed(&self) -> u64 {
186 self.seed
187 }
188
189 pub fn num_hashes(&self) -> usize {
190 self.num_hashes
191 }
192
193 pub fn write<W>(&self, writer: &mut W) -> Result<(), Error>
194 where
195 W: std::io::Write,
196 {
197 Ok(writer.write_all(&self.data.to_bytes()[..])?)
198 }
199
200 pub fn insert(&mut self, value: &V) {
201 for hash in value.fuzzy_hash() {
202 self.insert_inner(hash);
203 }
204 }
205
206 fn insert_inner(&mut self, initial_hash: u64) {
207 for i in 0..self.num_hashes as u64 {
208 let seed = self.seed.wrapping_add(i);
209 let hash = crate::stable_hash::stable_hash_seeded(initial_hash, seed);
210 let idx = hash as usize % self.data.len();
211 self.data.set(idx, true);
212 }
213 }
214
215 #[cfg(test)]
218 pub(crate) fn clear(&mut self) {
219 self.data.clear();
220 }
221}
222
223#[cfg(test)]
224impl<T> From<BloomFilterWriter<T>> for BloomFilterReader<T> {
225 fn from(writer: BloomFilterWriter<T>) -> Self {
226 Self {
227 data: writer.data,
228 seed: writer.seed,
229 num_hashes: writer.num_hashes,
230 _type: PhantomData::default(),
231 }
232 }
233}
234
235#[cfg(test)]
236mod tests {
237 use crate::lsm_tree::bloom_filter::{BloomFilterReader, BloomFilterWriter, estimate_params};
238 use crate::object_store::allocator::AllocatorKey;
239
240 #[test]
241 fn estimated_params() {
242 assert_eq!(estimate_params(0), (0, 0));
245 assert_eq!(estimate_params(1), (16, 6));
246 assert_eq!(estimate_params(5000), (131072, 4));
247 assert_eq!(estimate_params(50000), (1048576, 4));
248 assert_eq!(estimate_params(5_000_000), (134217728, 4));
249 }
250
251 const TEST_KEYS: [i32; 4] = [0, 65535, i32::MAX, i32::MIN];
252
253 #[test]
254 fn test_empty() {
255 let filter = BloomFilterReader::new_empty(TEST_KEYS.len());
256 for key in &TEST_KEYS {
257 assert!(!filter.maybe_contains(key));
258 }
259 }
260
261 #[test]
262 fn test_full() {
263 let filter = BloomFilterReader::new_full(TEST_KEYS.len());
264 for key in &TEST_KEYS {
265 assert!(filter.maybe_contains(key));
266 }
267 }
268
269 #[test]
270 fn test_insert() {
271 for key in &TEST_KEYS {
272 let mut filter = BloomFilterWriter::new(0, TEST_KEYS.len());
274 filter.insert(key);
275 let filter = BloomFilterReader::from(filter);
276 assert!(filter.maybe_contains(key));
277 }
278 }
279
280 #[test]
281 fn test_range_key() {
282 let mut filter = BloomFilterWriter::new(0, 2);
283 filter.insert(&AllocatorKey { device_range: 0..2097152 });
284 filter.insert(&AllocatorKey { device_range: 4194304..4194305 });
285 let filter = BloomFilterReader::from(filter);
286
287 assert!(filter.maybe_contains(&AllocatorKey { device_range: 0..1 }));
288 assert!(filter.maybe_contains(&AllocatorKey { device_range: 2097151..2097152 }));
289 assert!(!filter.maybe_contains(&AllocatorKey { device_range: 2097152..2097153 }));
290 assert!(!filter.maybe_contains(&AllocatorKey { device_range: 3145727..3145728 }));
291 assert!(filter.maybe_contains(&AllocatorKey { device_range: 4193404..4194305 }));
292
293 assert!(filter.maybe_contains(&AllocatorKey { device_range: 0..2097153 }));
294 assert!(filter.maybe_contains(&AllocatorKey { device_range: 2097152..4194305 }));
295 assert!(!filter.maybe_contains(&AllocatorKey { device_range: 104857600..104857601 }));
296 }
297
298 #[test]
299 fn test_serde() {
300 for key in &TEST_KEYS {
301 let mut filter = BloomFilterWriter::new(0, 1);
303 filter.insert(key);
304 let num_hashes = filter.num_hashes();
305 let mut buf = vec![];
306 {
307 let mut cursor = std::io::Cursor::new(&mut buf);
308 filter.write(&mut cursor).expect("write failed");
309 }
310 let filter = BloomFilterReader::read(&buf[..], 0, num_hashes).expect("read failed");
311 assert!(filter.maybe_contains(key));
312 }
313 }
314}