1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
45//! A strategy tracks free space and decides where allocations are to be made.
6//!
7//! Note that strategy *excludes*:
8//!
9//! * get/set byte limits
10//! * reservation tracking of future allocations.
11//! * deallocated but not yet usable regions (awaiting flush).
12//! * Any TRIM-specific logic
13//! * Volume Deletion
14//!
15//! These sorts of higher-level concepts should be implemented in `Allocator`.
16//!
17//! Strategies should be concerned only with selecting which free regions of disk to hand out.
1819use crate::object_store::FxfsError;
20use anyhow::{Context, Error};
21use std::collections::{btree_map, BTreeMap, BTreeSet};
22use std::fmt::Debug;
23use std::ops::Range;
2425/// Holds the offsets of extents of a given length.
26#[derive(Debug, Default)]
27struct SizeBucket {
28/// True if this bucket has overflowed, indicating that there may be more extents of this
29 /// size on disk if we look again.
30overflow: bool,
31 offsets: BTreeSet<u64>,
32}
3334/// Tests (in allocator.rs) may set max extent size to low values for other reasons.
35/// We want to make sure that we never go below a value that is big enough for superblocks
36/// so the value we use here is chosen to be larger than the value allocator.rs calculates and
37/// a convenient power of two.
38const DEFAULT_MAX_EXTENT_SIZE: u64 = 2 << 20;
3940/// The largest size bucket is a catch-all for extents this size and up so we
41/// explicitly keep more than we would otherwise.
42const NUM_MAX_SIZED_EXTENTS_TO_KEEP: u64 = 512;
4344/// Returns the maximum free ranges we keep in RAM for a given range length.
45/// We use the reciprocal function (2MiB/len) for this as we tend to see a lot more short
46/// extents. One exception is 'DEFAULT_MAX_EXTENT_SIZE'. This is a catch-all bucket so we
47/// intentionally bump this to cover the fact that a wide range of allocation sizes may hit this
48/// bucket.
49fn max_entries(len: u64) -> usize {
50if len < DEFAULT_MAX_EXTENT_SIZE {
51// With a 4kiB block size up to 2MiB (i.e. 512 possible lengths), this works out to be a
52 // total of 6748 entries.
53(4 << 20) / len as usize
54 } else {
55 NUM_MAX_SIZED_EXTENTS_TO_KEEP as usize
56 }
57}
5859/// An allocation strategy that returns the smallest extent that is large enough to hold the
60/// requested allocation, or the next best if no extent is big enough.
61///
62/// This strategy either leads to a perfect fit of a free extent to an allocation or a small
63/// amount of free space being left over, creating even smaller fragments of free space.
64///
65/// This tendency to create smaller fragments of free space starts to affect file fragmentation
66/// when filesystem use approaches capacity and there are nothing but small fragments of free
67/// space remaining.
68#[derive(Debug, Default)]
69pub struct BestFit {
70/// Holds free storage ranges for the filesystem ordered by length, then start.
71ranges: BTreeMap<u64, SizeBucket>,
72/// A secondary index used to speed up coalescing of 'free' calls. Keyed by end -> start
73by_end: BTreeMap<u64, u64>,
74}
7576impl BestFit {
77/// Tries to assign a set of contiguous `bytes` and returns the range, removing it
78 /// from the pool of available bytes and returning it.
79 ///
80 /// If insufficient contiguous space is available, the largest available range will
81 /// be returned. If no bytes are available, None will be returned.
82 ///
83 /// There are no special requirements on alignment of `bytes` but the caller is generally
84 /// encouraged to align to device block size.
85pub fn allocate(&mut self, bytes: u64) -> Result<Range<u64>, FxfsError> {
86let mut result = self.ranges.range_mut(bytes..).next();
87if result.is_none() {
88// Insufficient space. Return the biggest range we have.
89result = self.ranges.iter_mut().rev().next();
90 }
91if let Some((&size, size_bucket)) = result {
92if size_bucket.offsets.is_empty() && size_bucket.overflow {
93return Err(FxfsError::NotFound);
94 }
95debug_assert!(!size_bucket.offsets.is_empty());
96let offset = size_bucket.offsets.pop_first().unwrap();
97if size_bucket.offsets.is_empty() && !size_bucket.overflow {
98self.ranges.remove(&size);
99 }
100self.by_end.remove(&(offset + size));
101if size > bytes {
102self.free(offset + bytes..offset + size).expect("give extra back");
103Ok(offset..offset + bytes)
104 } else {
105Ok(offset..offset + size)
106 }
107 } else {
108Err(FxfsError::NoSpace)
109 }
110 }
111112/// This is effectively an optimized path for "remove" followed by "free" on the same range.
113 /// Returns true if this resulted in changes to overall free space.
114pub fn force_free(&mut self, range: Range<u64>) -> Result<bool, Error> {
115let mut to_add = Vec::new();
116let mut last = range.start;
117for (&end, &offset) in self.by_end.range(range.start + 1..) {
118if offset >= range.end {
119break;
120 }
121if offset > last {
122 to_add.push(last..offset);
123 }
124 last = end;
125assert!(end > range.start);
126 }
127if last < range.end {
128 to_add.push(last..range.end);
129 }
130Ok(if to_add.is_empty() {
131false
132} else {
133for range in to_add {
134self.free(range)?;
135 }
136true
137})
138 }
139140/// Removes all free extents in the given range.
141pub fn remove(&mut self, range: Range<u64>) {
142let mut to_add = Vec::new();
143let mut to_remove = Vec::new();
144for (&end, &offset) in self.by_end.range(range.start + 1..) {
145if offset >= range.end {
146break;
147 }
148assert!(end > range.start);
149 to_remove.push(offset..end);
150if offset < range.start {
151 to_add.push(offset..range.start);
152 }
153if end > range.end {
154 to_add.push(range.end..end);
155 }
156 }
157for range in to_remove {
158self.remove_range(range);
159 }
160for range in to_add {
161self.free(range).unwrap();
162 }
163 }
164165/// Internal helper function. Assumes range exists.
166fn remove_range(&mut self, range: Range<u64>) {
167let btree_map::Entry::Occupied(mut size_bucket) =
168self.ranges.entry(range.end - range.start)
169else {
170unreachable!()
171 };
172 size_bucket.get_mut().offsets.remove(&range.start);
173if size_bucket.get().offsets.is_empty() && !size_bucket.get().overflow {
174 size_bucket.remove_entry();
175 }
176assert_eq!(Some(range.start), self.by_end.remove(&range.end));
177 }
178179/// Returns the set of buckets that have overflow markers.
180pub fn overflow_markers(&self) -> Vec<u64> {
181self.ranges
182 .iter()
183 .filter_map(|(&size, size_bucket)| size_bucket.overflow.then_some(size))
184 .collect()
185 }
186187// Overflow markers persist until removed.
188 // Before we refresh data from disk, we need to remove these markers as there is no guarantee
189 // that we will overflow the same bucket every time we refresh.
190pub fn reset_overflow_markers(&mut self) {
191let mut to_remove = Vec::new();
192for (&size, size_bucket) in self.ranges.iter_mut() {
193 size_bucket.overflow = false;
194if size_bucket.offsets.is_empty() {
195 to_remove.push(size);
196 }
197 }
198for size in to_remove {
199self.ranges.remove(&size);
200 }
201 }
202203/// Internal helper function. Inserts range if room to do so, appends overflow marker if needed.
204fn insert_range(&mut self, range: Range<u64>) -> Result<(), Error> {
205let len = range.end - range.start;
206let entry = self.ranges.entry(len).or_default();
207if !entry.offsets.insert(range.start) {
208// This might happen if there is some high-level logic error that causes a range to be
209 // freed twice. It may indicate a disk corruption or a software bug and shouldn't happen
210 // under normal circumstances. While the low-level data structure held here will not be
211 // compromised by this error, it is probably not safe to continue if a condition such
212 // as this is detected, thus we return "Inconsistent".
213return Err(FxfsError::Inconsistent).context("Range already in 'ranges'.");
214 }
215let other = self.by_end.insert(range.end, range.start);
216if let Some(_other) = other {
217// self.by_end and self.ranges should always remain consistant.
218 // If this occurs it is almost certainly a software bug and continued use of this
219 // data structure will likely lead to undefined behaviour.
220return Err(FxfsError::Inconsistent)
221 .context("Range already in 'by_end'. Potential logic bug.");
222 };
223if entry.offsets.len() > max_entries(len) {
224let last = entry.offsets.pop_last().unwrap();
225if self.by_end.remove(&(last + len)) != Some(last) {
226return Err(FxfsError::Inconsistent)
227 .context("Expected range missing or invalid 'by_end'");
228 };
229 entry.overflow = true;
230 }
231Ok(())
232 }
233234/// Adds an arbitrary range of bytes to the pool of available ranges.
235 ///
236 /// Note that we keep these ranges in a map keyed by their length. To bound the size of this
237 /// map we only track ranges up to N blocks long (up to 2MB). Longer ranges
238 /// are broken up into ranges of this size.
239pub fn free(&mut self, mut range: Range<u64>) -> Result<(), Error> {
240// If there is a free range immediately before this one, merge with it.
241let mut iter = self.by_end.range(range.start..);
242let mut next_item = iter.next();
243if let Some((&end, &start)) = next_item {
244if end == range.start {
245self.remove_range(start..end);
246 range.start = start;
247 iter = self.by_end.range(range.start + 1..);
248 next_item = iter.next();
249 } else if start < range.end {
250// There exists a range already that overlaps with this.
251 // We have a double free or freeing of overlapping allocations.
252 // While the data-structure here remains valid after this error, the fact that
253 // an overlapping free was attempted indicates that the filesystem is not in a
254 // safe state to continue, thus we return "Inconsistent".
255return Err(FxfsError::Inconsistent).context("overlapping free");
256 }
257 }
258// If there is a free range immediately after this one, merge with it.
259if let Some((&end, &start)) = next_item {
260if start == range.end {
261self.remove_range(start..end);
262 range.end = end;
263 }
264 }
265266// We don't allow ranges longer than maximum extent size, but if we need to split such
267 // a range, we want the smaller fragment to come first (pushing small fragments together at
268 // the start of the device).
269while (range.end - range.start) >= DEFAULT_MAX_EXTENT_SIZE {
270self.insert_range(range.end - DEFAULT_MAX_EXTENT_SIZE..range.end)
271 .context("adding max_extent_size fragment")?;
272 range.end -= DEFAULT_MAX_EXTENT_SIZE;
273 }
274if range.start < range.end {
275self.insert_range(range).context("adding final range")?;
276 }
277Ok(())
278 }
279}
280281#[cfg(test)]
282mod test {
283use super::*;
284285#[test]
286fn allocate() {
287let mut bestfit = BestFit::default();
288 bestfit.free(0..0).unwrap(); // NOOP
289bestfit.free(0..100).unwrap();
290assert_eq!(bestfit.allocate(10), Ok(0..10));
291assert_eq!(bestfit.allocate(10), Ok(10..20));
292assert_eq!(bestfit.allocate(10), Ok(20..30));
293assert_eq!(bestfit.allocate(10), Ok(30..40));
294assert_eq!(bestfit.allocate(10), Ok(40..50));
295// Make some holes.
296bestfit.free(30..40).unwrap();
297 bestfit.free(10..20).unwrap();
298// Holes get filled first.
299assert_eq!(bestfit.allocate(10), Ok(10..20));
300assert_eq!(bestfit.allocate(10), Ok(30..40));
301assert_eq!(bestfit.allocate(10), Ok(50..60));
302// Free a contiguous bunch of allocations at once.
303bestfit.free(0..50).unwrap();
304// Return less than requested.
305assert_eq!(bestfit.allocate(100), Ok(0..50));
306// Return all remaining space.
307assert_eq!(bestfit.allocate(100), Ok(60..100));
308// No space left. Return None.
309assert_eq!(bestfit.allocate(100), Err(FxfsError::NoSpace));
310// Now we have some more back.
311bestfit.free(50..100).unwrap();
312assert_eq!(bestfit.allocate(100), Ok(50..100));
313 }
314315#[test]
316fn remove() {
317let mut bestfit = BestFit::default();
318 bestfit.free(0..100).unwrap();
319 bestfit.remove(25..50);
320assert_eq!(bestfit.allocate(30), Ok(50..80));
321assert_eq!(bestfit.allocate(25), Ok(0..25));
322323// Test removal across disjoint extents.
324let mut bestfit = BestFit::default();
325 bestfit.free(0..20).unwrap();
326 bestfit.free(30..50).unwrap();
327 bestfit.remove(10..40);
328assert_eq!(bestfit.allocate(10), Ok(0..10));
329assert_eq!(bestfit.allocate(10), Ok(40..50));
330assert_eq!(bestfit.allocate(10), Err(FxfsError::NoSpace));
331332// Test coalescing of adjacent available ranges if the request is large.
333let mut bestfit = BestFit::default();
334let end = DEFAULT_MAX_EXTENT_SIZE * (NUM_MAX_SIZED_EXTENTS_TO_KEEP + 1) + 10000;
335 bestfit.free(0..end).unwrap();
336// We slice from the back so we have sizes: 10000,2M,2M,2M,2M,...
337 // Free up most of the 2MB tail entries -- we just needed to trigger an overflow marker.
338bestfit.remove(DEFAULT_MAX_EXTENT_SIZE * 3 + 10000..end);
339// Free up the firs two slices worth. (This exercises unaligned removal.)
340bestfit.remove(0..DEFAULT_MAX_EXTENT_SIZE * 2);
341// Allocate the last 2MB extent.
342assert_eq!(
343 bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE),
344Ok(DEFAULT_MAX_EXTENT_SIZE * 2 + 10000..DEFAULT_MAX_EXTENT_SIZE * 3 + 10000)
345 );
346// Allocate all the space.
347assert_eq!(
348 bestfit.allocate(10000),
349Ok(DEFAULT_MAX_EXTENT_SIZE * 2..DEFAULT_MAX_EXTENT_SIZE * 2 + 10000)
350 );
351// Now there is insufficient space, but we should get the overflow marker.
352assert_eq!(
353 bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE),
354Err(FxfsError::NotFound) // Overflow.
355);
356// Reset the overflow marker. We should see NoSpace.
357bestfit.reset_overflow_markers();
358assert_eq!(bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE), Err(FxfsError::NoSpace));
359 }
360361#[test]
362fn coalescing_free() {
363let mut bestfit = BestFit::default();
364// Free some bytes at the start and end.
365bestfit.free(0..10).unwrap();
366 bestfit.free(20..32).unwrap();
367// Now free the space in the middle, which should coalesce with ranges on both sides.
368bestfit.free(10..20).unwrap();
369// Confirm that we can allocate one block of 32 bytes. This will fail if coalescing
370 // didn't occur.
371bestfit.remove(0..32);
372assert_eq!(bestfit.allocate(32), Err(FxfsError::NoSpace));
373 }
374375#[test]
376fn max_range() {
377let mut bestfit = BestFit::default();
378 bestfit.free(10..10 + 10 * DEFAULT_MAX_EXTENT_SIZE).unwrap();
379380// We can't allocate bigger than DEFAULT_MAX_EXTENT_SIZE.
381assert_eq!(
382 bestfit.allocate(2 * DEFAULT_MAX_EXTENT_SIZE),
383Ok(10..10 + DEFAULT_MAX_EXTENT_SIZE)
384 );
385386// Make sure that coalescing still works properly
387bestfit.free(10..10 + DEFAULT_MAX_EXTENT_SIZE).unwrap();
388assert_eq!(bestfit.allocate(10), Ok(10..20));
389assert_eq!(bestfit.allocate(10), Ok(20..30));
390assert_eq!(
391 bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE - 20),
392Ok(30..10 + DEFAULT_MAX_EXTENT_SIZE)
393 );
394 }
395396#[test]
397fn fragmenting_free() {
398// It shouldn't matter how we allocate and free here so long as there are no overlaps.
399let mut bestfit = BestFit::default();
400 bestfit.free(0..100).unwrap();
401assert_eq!(bestfit.allocate(30), Ok(0..30));
402assert!(bestfit.free(20..30).is_ok());
403assert_eq!(bestfit.allocate(10), Ok(20..30));
404assert!(bestfit.free(0..30).is_ok());
405assert!(bestfit.free(0..30).is_err());
406407// Merge left and right and middle.
408let mut bestfit = BestFit::default();
409 bestfit.free(10..20).unwrap();
410 bestfit.free(30..40).unwrap();
411 bestfit.free(0..10).unwrap(); // before
412bestfit.free(40..50).unwrap(); // after
413bestfit.free(20..30).unwrap(); // middle
414415 // Check all combinations of overlaps.
416let mut bestfit = BestFit::default();
417 bestfit.free(10..20).unwrap();
418assert!(bestfit.free(9..11).is_err());
419assert!(bestfit.free(19..21).is_err());
420assert!(bestfit.free(11..29).is_err());
421assert!(bestfit.free(10..20).is_err());
422assert!(bestfit.free(9..10).is_ok());
423assert!(bestfit.free(20..21).is_ok());
424 }
425426#[test]
427fn force_free() {
428let mut bestfit = BestFit::default();
429 bestfit.free(0..100).unwrap();
430assert_eq!(bestfit.force_free(0..100).unwrap(), false);
431assert_eq!(bestfit.force_free(10..100).unwrap(), false);
432assert_eq!(bestfit.force_free(0..90).unwrap(), false);
433assert_eq!(bestfit.force_free(10..90).unwrap(), false);
434435let mut bestfit = BestFit::default();
436 bestfit.free(0..40).unwrap();
437 bestfit.free(60..100).unwrap();
438assert_eq!(bestfit.force_free(10..90).unwrap(), true);
439assert_eq!(bestfit.allocate(100), Ok(0..100));
440441let mut bestfit = BestFit::default();
442 bestfit.free(0..40).unwrap();
443 bestfit.free(60..100).unwrap();
444assert_eq!(bestfit.force_free(10..100).unwrap(), true);
445assert_eq!(bestfit.allocate(100), Ok(0..100));
446447let mut bestfit = BestFit::default();
448 bestfit.free(10..40).unwrap();
449 bestfit.free(60..100).unwrap();
450assert_eq!(bestfit.force_free(0..110).unwrap(), true);
451assert_eq!(bestfit.allocate(110), Ok(0..110));
452453let mut bestfit = BestFit::default();
454 bestfit.free(10..40).unwrap();
455 bestfit.free(60..100).unwrap();
456assert_eq!(bestfit.force_free(10..110).unwrap(), true);
457assert_eq!(bestfit.allocate(100), Ok(10..110));
458459let mut bestfit = BestFit::default();
460 bestfit.free(10..40).unwrap();
461 bestfit.free(60..100).unwrap();
462assert_eq!(bestfit.force_free(0..100).unwrap(), true);
463assert_eq!(bestfit.allocate(100), Ok(0..100));
464465let mut bestfit = BestFit::default();
466assert_eq!(bestfit.force_free(0..100).unwrap(), true);
467assert_eq!(bestfit.allocate(100), Ok(0..100));
468 }
469470#[test]
471fn test_overflow_changes() {
472// Test for b/345105394 conditions.
473 // This doesn't exactly recreate the differences though because
474let mut bestfit = BestFit::default();
475let mut ofs: u64 = 0;
476// Fill 256kb.
477for i in 0..max_entries(256 << 10) as u64 {
478assert!(bestfit.force_free(i * 2 * (256 << 10)..(i * 2 + 1) * (256 << 10)).unwrap());
479 }
480 ofs += (256 << 10) * max_entries(256 << 10) as u64 * 3;
481482// Overflow 768kb
483for i in 0..max_entries(768 << 10) as u64 + 1 {
484assert!(bestfit
485 .force_free(ofs + i * 2 * (768 << 10)..ofs + (i * 2 + 1) * (768 << 10))
486 .unwrap());
487 }
488 ofs += (768 << 10) * (max_entries(768 << 10) + 1) as u64 * 3;
489490// Overflow 512kb.
491for i in 0..max_entries(512 << 10) as u64 + 1 {
492assert!(bestfit
493 .force_free(ofs + i * 2 * (512 << 10)..ofs + (i * 2 + 1) * (512 << 10))
494 .unwrap());
495 }
496// We want to put the next free adjacent to the tail of the last 512kb block, so
497 // we drop the "+1" and add 512kb.
498ofs += (512 << 10) * max_entries(512 << 10) as u64 * 2 + (512 << 10);
499500assert_eq!(bestfit.overflow_markers(), vec![524288, 786432]);
501502// This is a 256kB free just after a 512kB free, so it will form a 768kb free.
503assert!(bestfit.force_free(ofs..ofs + (256 << 10)).unwrap());
504// In the actual bug, we would not add this until after the next flush, so
505 // force_free would not return true and the only change would be the overflow marker.
506assert_eq!(bestfit.overflow_markers(), vec![262144, 524288, 786432]);
507 }
508}