fxfs/object_store/allocator/
strategy.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! A strategy tracks free space and decides where allocations are to be made.
6//!
7//! Note that strategy *excludes*:
8//!
9//!   * get/set byte limits
10//!   * reservation tracking of future allocations.
11//!   * deallocated but not yet usable regions (awaiting flush).
12//!   * Any TRIM-specific logic
13//!   * Volume Deletion
14//!
15//! These sorts of higher-level concepts should be implemented in `Allocator`.
16//!
17//! Strategies should be concerned only with selecting which free regions of disk to hand out.
18
19use crate::object_store::FxfsError;
20use anyhow::{Context, Error};
21use std::collections::{btree_map, BTreeMap, BTreeSet};
22use std::fmt::Debug;
23use std::ops::Range;
24
/// Holds the offsets of extents of a given length.
#[derive(Debug, Default)]
struct SizeBucket {
    /// True if this bucket has overflowed, indicating that there may be more extents of this
    /// size on disk if we look again.
    overflow: bool,
    /// Start offsets of the free extents tracked in RAM that share this bucket's length.
    offsets: BTreeSet<u64>,
}
33
/// Tests (in allocator.rs) may set max extent size to low values for other reasons.
/// We want to make sure that we never go below a value that is big enough for superblocks
/// so the value we use here is chosen to be larger than the value allocator.rs calculates and
/// a convenient power of two.
const DEFAULT_MAX_EXTENT_SIZE: u64 = 2 << 20;

/// The largest size bucket is a catch-all for extents this size and up so we
/// explicitly keep more than we would otherwise.
const NUM_MAX_SIZED_EXTENTS_TO_KEEP: u64 = 512;

/// Returns the maximum number of free ranges we keep in RAM for a given range length.
///
/// Shorter extents are far more common, so the cap follows a reciprocal curve (4MiB / len).
/// The one exception is the `DEFAULT_MAX_EXTENT_SIZE` bucket: it is a catch-all covering a
/// wide range of allocation sizes, so it gets a deliberately generous fixed cap
/// (`NUM_MAX_SIZED_EXTENTS_TO_KEEP`).
fn max_entries(len: u64) -> usize {
    match len {
        // With a 4kiB block size up to 2MiB (i.e. 512 possible lengths), the per-length
        // caps sum to roughly 6748 entries in total.
        l if l < DEFAULT_MAX_EXTENT_SIZE => (4 << 20) / l as usize,
        _ => NUM_MAX_SIZED_EXTENTS_TO_KEEP as usize,
    }
}
58
/// An allocation strategy that returns the smallest extent that is large enough to hold the
/// requested allocation, or the next best if no extent is big enough.
///
/// This strategy either leads to a perfect fit of a free extent to an allocation or a small
/// amount of free space being left over, creating even smaller fragments of free space.
///
/// This tendency to create smaller fragments of free space starts to affect file fragmentation
/// when filesystem use approaches capacity and there are nothing but small fragments of free
/// space remaining.
#[derive(Debug, Default)]
pub struct BestFit {
    /// Holds free storage ranges for the filesystem ordered by length, then start.
    /// Maps extent length -> bucket of start offsets for extents of that length.
    ranges: BTreeMap<u64, SizeBucket>,
    /// A secondary index used to speed up coalescing of 'free' calls. Keyed by end -> start.
    /// Must be kept consistent with `ranges` at all times.
    by_end: BTreeMap<u64, u64>,
}
75
76impl BestFit {
77    /// Tries to assign a set of contiguous `bytes` and returns the range, removing it
78    /// from the pool of available bytes and returning it.
79    ///
80    /// If insufficient contiguous space is available, the largest available range will
81    /// be returned. If no bytes are available, None will be returned.
82    ///
83    /// There are no special requirements on alignment of `bytes` but the caller is generally
84    /// encouraged to align to device block size.
85    pub fn allocate(&mut self, bytes: u64) -> Result<Range<u64>, FxfsError> {
86        let mut result = self.ranges.range_mut(bytes..).next();
87        if result.is_none() {
88            // Insufficient space. Return the biggest range we have.
89            result = self.ranges.iter_mut().rev().next();
90        }
91        if let Some((&size, size_bucket)) = result {
92            if size_bucket.offsets.is_empty() && size_bucket.overflow {
93                return Err(FxfsError::NotFound);
94            }
95            debug_assert!(!size_bucket.offsets.is_empty());
96            let offset = size_bucket.offsets.pop_first().unwrap();
97            if size_bucket.offsets.is_empty() && !size_bucket.overflow {
98                self.ranges.remove(&size);
99            }
100            self.by_end.remove(&(offset + size));
101            if size > bytes {
102                self.free(offset + bytes..offset + size).expect("give extra back");
103                Ok(offset..offset + bytes)
104            } else {
105                Ok(offset..offset + size)
106            }
107        } else {
108            Err(FxfsError::NoSpace)
109        }
110    }
111
    /// This is effectively an optimized path for "remove" followed by "free" on the same range.
    /// Returns true if this resulted in changes to overall free space.
    ///
    /// Errors are propagated from the underlying `free` calls.
    pub fn force_free(&mut self, range: Range<u64>) -> Result<bool, Error> {
        // Walk the tracked free extents intersecting `range` (`by_end` is keyed end -> start,
        // so scanning from `range.start + 1` yields every extent ending after `range.start`)
        // and collect the gaps between them. The gaps are the only parts of `range` that are
        // not already free.
        let mut to_add = Vec::new();
        let mut last = range.start;
        for (&end, &offset) in self.by_end.range(range.start + 1..) {
            if offset >= range.end {
                // This extent starts beyond our range; nothing further can intersect.
                break;
            }
            if offset > last {
                to_add.push(last..offset);
            }
            last = end;
            assert!(end > range.start);
        }
        // Tail gap after the last intersecting free extent (if any).
        if last < range.end {
            to_add.push(last..range.end);
        }
        Ok(if to_add.is_empty() {
            false
        } else {
            // Freeing each gap coalesces it with the neighbouring free extents.
            for range in to_add {
                self.free(range)?;
            }
            true
        })
    }
139
140    /// Removes all free extents in the given range.
141    pub fn remove(&mut self, range: Range<u64>) {
142        let mut to_add = Vec::new();
143        let mut to_remove = Vec::new();
144        for (&end, &offset) in self.by_end.range(range.start + 1..) {
145            if offset >= range.end {
146                break;
147            }
148            assert!(end > range.start);
149            to_remove.push(offset..end);
150            if offset < range.start {
151                to_add.push(offset..range.start);
152            }
153            if end > range.end {
154                to_add.push(range.end..end);
155            }
156        }
157        for range in to_remove {
158            self.remove_range(range);
159        }
160        for range in to_add {
161            self.free(range).unwrap();
162        }
163    }
164
165    /// Internal helper function. Assumes range exists.
166    fn remove_range(&mut self, range: Range<u64>) {
167        let btree_map::Entry::Occupied(mut size_bucket) =
168            self.ranges.entry(range.end - range.start)
169        else {
170            unreachable!()
171        };
172        size_bucket.get_mut().offsets.remove(&range.start);
173        if size_bucket.get().offsets.is_empty() && !size_bucket.get().overflow {
174            size_bucket.remove_entry();
175        }
176        assert_eq!(Some(range.start), self.by_end.remove(&range.end));
177    }
178
179    /// Returns the set of buckets that have overflow markers.
180    pub fn overflow_markers(&self) -> Vec<u64> {
181        self.ranges
182            .iter()
183            .filter_map(|(&size, size_bucket)| size_bucket.overflow.then_some(size))
184            .collect()
185    }
186
187    // Overflow markers persist until removed.
188    // Before we refresh data from disk, we need to remove these markers as there is no guarantee
189    // that we will overflow the same bucket every time we refresh.
190    pub fn reset_overflow_markers(&mut self) {
191        let mut to_remove = Vec::new();
192        for (&size, size_bucket) in self.ranges.iter_mut() {
193            size_bucket.overflow = false;
194            if size_bucket.offsets.is_empty() {
195                to_remove.push(size);
196            }
197        }
198        for size in to_remove {
199            self.ranges.remove(&size);
200        }
201    }
202
    /// Internal helper function. Inserts range if room to do so, appends overflow marker if needed.
    ///
    /// Returns `Inconsistent` if the range (or its end offset) is already tracked, which
    /// indicates a double free or corrupted index state.
    fn insert_range(&mut self, range: Range<u64>) -> Result<(), Error> {
        let len = range.end - range.start;
        let entry = self.ranges.entry(len).or_default();
        if !entry.offsets.insert(range.start) {
            // This might happen if there is some high-level logic error that causes a range to be
            // freed twice. It may indicate a disk corruption or a software bug and shouldn't happen
            // under normal circumstances. While the low-level data structure held here will not be
            // compromised by this error, it is probably not safe to continue if a condition such
            // as this is detected, thus we return "Inconsistent".
            return Err(FxfsError::Inconsistent).context("Range already in 'ranges'.");
        }
        let other = self.by_end.insert(range.end, range.start);
        if let Some(_other) = other {
            // self.by_end and self.ranges should always remain consistent.
            // If this occurs it is almost certainly a software bug and continued use of this
            // data structure will likely lead to undefined behaviour.
            return Err(FxfsError::Inconsistent)
                .context("Range already in 'by_end'. Potential logic bug.");
        };
        // Enforce the per-bucket RAM cap: evict the highest start offset (allocation pops
        // the lowest first, so low offsets are the valuable ones to keep) and set the
        // overflow marker so we remember that disk may hold more extents of this length.
        if entry.offsets.len() > max_entries(len) {
            let last = entry.offsets.pop_last().unwrap();
            if self.by_end.remove(&(last + len)) != Some(last) {
                return Err(FxfsError::Inconsistent)
                    .context("Expected range missing or invalid 'by_end'");
            };
            entry.overflow = true;
        }
        Ok(())
    }
233
    /// Adds an arbitrary range of bytes to the pool of available ranges.
    ///
    /// Note that we keep these ranges in a map keyed by their length. To bound the size of this
    /// map we only track ranges up to N blocks long (up to 2MB). Longer ranges
    /// are broken up into ranges of this size.
    ///
    /// Returns `Inconsistent` if `range` overlaps an already-free range (double free).
    pub fn free(&mut self, mut range: Range<u64>) -> Result<(), Error> {
        // If there is a free range immediately before this one, merge with it.
        // (`by_end` is keyed by end offset, so the first entry at or after `range.start` is
        // the only tracked extent that can touch or overlap our start.)
        let mut iter = self.by_end.range(range.start..);
        let mut next_item = iter.next();
        if let Some((&end, &start)) = next_item {
            if end == range.start {
                // Exactly adjacent on the left: absorb it, then re-fetch the next candidate
                // (the iterator is invalidated by the removal).
                self.remove_range(start..end);
                range.start = start;
                iter = self.by_end.range(range.start + 1..);
                next_item = iter.next();
            } else if start < range.end {
                // There exists a range already that overlaps with this.
                // We have a double free or freeing of overlapping allocations.
                // While the data-structure here remains valid after this error, the fact that
                // an overlapping free was attempted indicates that the filesystem is not in a
                // safe state to continue, thus we return "Inconsistent".
                return Err(FxfsError::Inconsistent).context("overlapping free");
            }
        }
        // If there is a free range immediately after this one, merge with it.
        if let Some((&end, &start)) = next_item {
            if start == range.end {
                self.remove_range(start..end);
                range.end = end;
            }
        }

        // We don't allow ranges longer than maximum extent size, but if we need to split such
        // a range, we want the smaller fragment to come first (pushing small fragments together at
        // the start of the device), so maximum-sized pieces are carved off the tail.
        while (range.end - range.start) >= DEFAULT_MAX_EXTENT_SIZE {
            self.insert_range(range.end - DEFAULT_MAX_EXTENT_SIZE..range.end)
                .context("adding max_extent_size fragment")?;
            range.end -= DEFAULT_MAX_EXTENT_SIZE;
        }
        // Whatever remains (nothing if `range` was empty or an exact multiple of the max size).
        if range.start < range.end {
            self.insert_range(range).context("adding final range")?;
        }
        Ok(())
    }
279}
280
281#[cfg(test)]
282mod test {
283    use super::*;
284
    #[test]
    fn allocate() {
        let mut bestfit = BestFit::default();
        bestfit.free(0..0).unwrap(); // NOOP
        bestfit.free(0..100).unwrap();
        // Sequential allocations carve off the front of the single free extent.
        assert_eq!(bestfit.allocate(10), Ok(0..10));
        assert_eq!(bestfit.allocate(10), Ok(10..20));
        assert_eq!(bestfit.allocate(10), Ok(20..30));
        assert_eq!(bestfit.allocate(10), Ok(30..40));
        assert_eq!(bestfit.allocate(10), Ok(40..50));
        // Make some holes.
        bestfit.free(30..40).unwrap();
        bestfit.free(10..20).unwrap();
        // Holes get filled first (exact-fit 10-byte extents beat the big 50..100 extent),
        // lowest offset first.
        assert_eq!(bestfit.allocate(10), Ok(10..20));
        assert_eq!(bestfit.allocate(10), Ok(30..40));
        assert_eq!(bestfit.allocate(10), Ok(50..60));
        // Free a contiguous bunch of allocations at once.
        bestfit.free(0..50).unwrap();
        // Return less than requested.
        assert_eq!(bestfit.allocate(100), Ok(0..50));
        // Return all remaining space.
        assert_eq!(bestfit.allocate(100), Ok(60..100));
        // No space left. Returns NoSpace.
        assert_eq!(bestfit.allocate(100), Err(FxfsError::NoSpace));
        // Now we have some more back.
        bestfit.free(50..100).unwrap();
        assert_eq!(bestfit.allocate(100), Ok(50..100));
    }
314
    #[test]
    fn remove() {
        let mut bestfit = BestFit::default();
        bestfit.free(0..100).unwrap();
        // Carve 25..50 out of the middle, leaving 0..25 and 50..100 free.
        bestfit.remove(25..50);
        assert_eq!(bestfit.allocate(30), Ok(50..80));
        assert_eq!(bestfit.allocate(25), Ok(0..25));

        // Test removal across disjoint extents.
        let mut bestfit = BestFit::default();
        bestfit.free(0..20).unwrap();
        bestfit.free(30..50).unwrap();
        bestfit.remove(10..40);
        assert_eq!(bestfit.allocate(10), Ok(0..10));
        assert_eq!(bestfit.allocate(10), Ok(40..50));
        assert_eq!(bestfit.allocate(10), Err(FxfsError::NoSpace));

        // Test coalescing of adjacent available ranges if the request is large.
        let mut bestfit = BestFit::default();
        let end = DEFAULT_MAX_EXTENT_SIZE * (NUM_MAX_SIZED_EXTENTS_TO_KEEP + 1) + 10000;
        bestfit.free(0..end).unwrap();
        // We slice from the back so we have sizes: 10000,2M,2M,2M,2M,...
        // Remove most of the 2MB tail entries -- we just needed to trigger an overflow marker.
        bestfit.remove(DEFAULT_MAX_EXTENT_SIZE * 3 + 10000..end);
        // Remove the first two slices worth. (This exercises unaligned removal.)
        bestfit.remove(0..DEFAULT_MAX_EXTENT_SIZE * 2);
        // Allocate the last 2MB extent.
        assert_eq!(
            bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE),
            Ok(DEFAULT_MAX_EXTENT_SIZE * 2 + 10000..DEFAULT_MAX_EXTENT_SIZE * 3 + 10000)
        );
        // Allocate all the space.
        assert_eq!(
            bestfit.allocate(10000),
            Ok(DEFAULT_MAX_EXTENT_SIZE * 2..DEFAULT_MAX_EXTENT_SIZE * 2 + 10000)
        );
        // Now there is insufficient space, but we should get the overflow marker.
        assert_eq!(
            bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE),
            Err(FxfsError::NotFound) // Overflow.
        );
        // Reset the overflow marker. We should see NoSpace.
        bestfit.reset_overflow_markers();
        assert_eq!(bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE), Err(FxfsError::NoSpace));
    }
360
    #[test]
    fn coalescing_free() {
        let mut bestfit = BestFit::default();
        // Free some bytes at the start and end.
        bestfit.free(0..10).unwrap();
        bestfit.free(20..32).unwrap();
        // Now free the space in the middle, which should coalesce with ranges on both sides.
        bestfit.free(10..20).unwrap();
        // Remove the whole coalesced 0..32 range. If coalescing left the internal indices
        // inconsistent this would panic; afterwards no free space remains, so an allocation
        // must report NoSpace.
        bestfit.remove(0..32);
        assert_eq!(bestfit.allocate(32), Err(FxfsError::NoSpace));
    }
374
    #[test]
    fn max_range() {
        let mut bestfit = BestFit::default();
        // Free a region ten times the maximum tracked extent size; it gets split internally.
        bestfit.free(10..10 + 10 * DEFAULT_MAX_EXTENT_SIZE).unwrap();

        // We can't allocate bigger than DEFAULT_MAX_EXTENT_SIZE.
        assert_eq!(
            bestfit.allocate(2 * DEFAULT_MAX_EXTENT_SIZE),
            Ok(10..10 + DEFAULT_MAX_EXTENT_SIZE)
        );

        // Make sure that coalescing still works properly across the split boundaries.
        bestfit.free(10..10 + DEFAULT_MAX_EXTENT_SIZE).unwrap();
        assert_eq!(bestfit.allocate(10), Ok(10..20));
        assert_eq!(bestfit.allocate(10), Ok(20..30));
        assert_eq!(
            bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE - 20),
            Ok(30..10 + DEFAULT_MAX_EXTENT_SIZE)
        );
    }
395
    #[test]
    fn fragmenting_free() {
        // It shouldn't matter how we allocate and free here so long as there are no overlaps.
        let mut bestfit = BestFit::default();
        bestfit.free(0..100).unwrap();
        assert_eq!(bestfit.allocate(30), Ok(0..30));
        assert!(bestfit.free(20..30).is_ok());
        assert_eq!(bestfit.allocate(10), Ok(20..30));
        assert!(bestfit.free(0..30).is_ok());
        // A double free of the same range must be rejected as inconsistent.
        assert!(bestfit.free(0..30).is_err());

        // Merge left and right and middle.
        let mut bestfit = BestFit::default();
        bestfit.free(10..20).unwrap();
        bestfit.free(30..40).unwrap();
        bestfit.free(0..10).unwrap(); // before
        bestfit.free(40..50).unwrap(); // after
        bestfit.free(20..30).unwrap(); // middle

        // Check all combinations of overlaps.
        let mut bestfit = BestFit::default();
        bestfit.free(10..20).unwrap();
        assert!(bestfit.free(9..11).is_err()); // overlaps the start
        assert!(bestfit.free(19..21).is_err()); // overlaps the end
        assert!(bestfit.free(11..29).is_err()); // overlaps the middle and beyond
        assert!(bestfit.free(10..20).is_err()); // exact duplicate
        assert!(bestfit.free(9..10).is_ok()); // adjacent on the left is fine
        assert!(bestfit.free(20..21).is_ok()); // adjacent on the right is fine
    }
425
    #[test]
    fn force_free() {
        // Every sub-range of an already-free extent: no change to overall free space.
        let mut bestfit = BestFit::default();
        bestfit.free(0..100).unwrap();
        assert_eq!(bestfit.force_free(0..100).unwrap(), false);
        assert_eq!(bestfit.force_free(10..100).unwrap(), false);
        assert_eq!(bestfit.force_free(0..90).unwrap(), false);
        assert_eq!(bestfit.force_free(10..90).unwrap(), false);

        // Fills the 40..60 gap and coalesces everything into a single 0..100 extent.
        let mut bestfit = BestFit::default();
        bestfit.free(0..40).unwrap();
        bestfit.free(60..100).unwrap();
        assert_eq!(bestfit.force_free(10..90).unwrap(), true);
        assert_eq!(bestfit.allocate(100), Ok(0..100));

        // Range extends past the end of the last free extent (but not beyond 100).
        let mut bestfit = BestFit::default();
        bestfit.free(0..40).unwrap();
        bestfit.free(60..100).unwrap();
        assert_eq!(bestfit.force_free(10..100).unwrap(), true);
        assert_eq!(bestfit.allocate(100), Ok(0..100));

        // Range extends past both edges of the tracked extents.
        let mut bestfit = BestFit::default();
        bestfit.free(10..40).unwrap();
        bestfit.free(60..100).unwrap();
        assert_eq!(bestfit.force_free(0..110).unwrap(), true);
        assert_eq!(bestfit.allocate(110), Ok(0..110));

        // Range starts inside the first extent and extends past the last one.
        let mut bestfit = BestFit::default();
        bestfit.free(10..40).unwrap();
        bestfit.free(60..100).unwrap();
        assert_eq!(bestfit.force_free(10..110).unwrap(), true);
        assert_eq!(bestfit.allocate(100), Ok(10..110));

        // Range starts before the first extent.
        let mut bestfit = BestFit::default();
        bestfit.free(10..40).unwrap();
        bestfit.free(60..100).unwrap();
        assert_eq!(bestfit.force_free(0..100).unwrap(), true);
        assert_eq!(bestfit.allocate(100), Ok(0..100));

        // Force-freeing into an empty pool frees the entire range.
        let mut bestfit = BestFit::default();
        assert_eq!(bestfit.force_free(0..100).unwrap(), true);
        assert_eq!(bestfit.allocate(100), Ok(0..100));
    }
469
    #[test]
    fn test_overflow_changes() {
        // Test for b/345105394 conditions: a free whose main observable effect is a change
        // to the overflow markers. Note this doesn't exactly recreate the conditions from
        // the bug report (see the comment at the end).
        let mut bestfit = BestFit::default();
        let mut ofs: u64 = 0;
        // Fill the 256kb bucket exactly to capacity (no overflow marker yet).
        for i in 0..max_entries(256 << 10) as u64 {
            assert!(bestfit.force_free(i * 2 * (256 << 10)..(i * 2 + 1) * (256 << 10)).unwrap());
        }
        ofs += (256 << 10) * max_entries(256 << 10) as u64 * 3;

        // Overflow 768kb.
        for i in 0..max_entries(768 << 10) as u64 + 1 {
            assert!(bestfit
                .force_free(ofs + i * 2 * (768 << 10)..ofs + (i * 2 + 1) * (768 << 10))
                .unwrap());
        }
        ofs += (768 << 10) * (max_entries(768 << 10) + 1) as u64 * 3;

        // Overflow 512kb.
        for i in 0..max_entries(512 << 10) as u64 + 1 {
            assert!(bestfit
                .force_free(ofs + i * 2 * (512 << 10)..ofs + (i * 2 + 1) * (512 << 10))
                .unwrap());
        }
        // We want to put the next free adjacent to the tail of the last 512kb block, so
        // we drop the "+1" and add 512kb.
        ofs += (512 << 10) * max_entries(512 << 10) as u64 * 2 + (512 << 10);

        assert_eq!(bestfit.overflow_markers(), vec![524288, 786432]);

        // This 256kB free sits just after a 512kB free region, so on disk it forms a 768kb
        // free region -- but the overflowed 512kB bucket evicted that neighbour from RAM, so
        // here it lands in the (already full) 256kB bucket, overflowing it too.
        assert!(bestfit.force_free(ofs..ofs + (256 << 10)).unwrap());
        // In the actual bug, we would not add this until after the next flush, so
        // force_free would not return true and the only change would be the overflow marker.
        assert_eq!(bestfit.overflow_markers(), vec![262144, 524288, 786432]);
    }
508}