// starnix_core/vfs/file_system.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::security;
6use crate::task::{CurrentTask, Kernel};
7use crate::vfs::fs_args::MountParams;
8use crate::vfs::fs_node_cache::FsNodeCache;
9use crate::vfs::{
10    DirEntry, DirEntryHandle, FsNode, FsNodeHandle, FsNodeInfo, FsNodeOps, FsStr, FsString,
11};
12use flyweights::FlyByteStr;
13use linked_hash_map::LinkedHashMap;
14use ref_cast::RefCast;
15use smallvec::SmallVec;
16use starnix_crypt::CryptService;
17use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, Mutex};
18use starnix_uapi::arc_key::ArcKey;
19use starnix_uapi::as_any::AsAny;
20use starnix_uapi::auth::FsCred;
21use starnix_uapi::device_type::DeviceType;
22use starnix_uapi::errors::Errno;
23use starnix_uapi::file_mode::mode;
24use starnix_uapi::mount_flags::MountFlags;
25use starnix_uapi::{error, ino_t, statfs};
26use std::collections::HashSet;
27use std::ops::Range;
28use std::sync::{Arc, OnceLock, Weak};
29
/// Default capacity of the LRU `DirEntry` cache.
///
/// This is the capacity that `CacheConfig::default()` selects.
pub const DEFAULT_LRU_CAPACITY: usize = 32;
31
/// A file system that can be mounted in a namespace.
///
/// Instances are created with `FileSystem::new` and shared through `FileSystemHandle`.
pub struct FileSystem {
    /// Weak reference to the kernel this file system was created for.
    pub kernel: Weak<Kernel>,

    /// The root directory entry. Empty until `set_root` runs (through `create_root` or
    /// `create_root_with_info`); it can only be set once.
    root: OnceLock<DirEntryHandle>,

    /// The filesystem-implementation-specific operations.
    ops: Box<dyn FileSystemOps>,

    /// The options specified when mounting the filesystem. Saved here for display in
    /// /proc/[pid]/mountinfo.
    pub options: FileSystemOptions,

    /// The device ID of this filesystem. Returned in the st_dev field when stating an inode in
    /// this filesystem.
    pub dev_id: DeviceType,

    /// A file-system global mutex to serialize rename operations.
    ///
    /// This mutex is useful because the invariants enforced during a rename
    /// operation involve many DirEntry objects. In the future, we might be
    /// able to remove this mutex, but we will need to think carefully about
    /// how rename operations can interleave.
    ///
    /// See DirEntry::rename.
    pub rename_mutex: Mutex<()>,

    /// The FsNode cache for this file system.
    ///
    /// When two directory entries are hard links to the same underlying inode,
    /// this cache lets us re-use the same FsNode object for both directory
    /// entries.
    ///
    /// Rather than calling FsNode::new directly, file systems should call
    /// FileSystem::get_or_create_node to see if the FsNode already exists in
    /// the cache.
    node_cache: Arc<FsNodeCache>,

    /// DirEntryHandle cache for the filesystem. Holds strong references to DirEntry objects. For
    /// filesystems with permanent entries, this will hold a strong reference to every node to make
    /// sure it doesn't get freed without being explicitly unlinked. Otherwise, entries are
    /// maintained in an LRU cache.
    dcache: DirEntryCache,

    /// Holds security state for this file system, which is created and used by the Linux Security
    /// Modules subsystem hooks.
    pub security_state: security::FileSystemState,
}
77
78impl std::fmt::Debug for FileSystem {
79    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
80        write!(f, "FileSystem")
81    }
82}
83
/// The options a `FileSystem` was mounted with.
#[derive(Clone, Debug, Default)]
pub struct FileSystemOptions {
    /// The source string passed as the first argument to mount(), e.g. a block device.
    pub source: FlyByteStr,
    /// Flags kept per-superblock, i.e. included in MountFlags::STORED_ON_FILESYSTEM.
    pub flags: MountFlags,
    /// Filesystem options passed as the last argument to mount().
    pub params: MountParams,
}
93
94impl FileSystemOptions {
95    pub fn source_for_display(&self) -> &FsStr {
96        if self.source.is_empty() {
97            return "none".into();
98        }
99        self.source.as_ref()
100    }
101}
102
/// State of the LRU flavour of the `DirEntry` cache.
struct LruCache {
    /// Maximum number of entries left in `entries` after `FileSystem::purge_old_entries` runs.
    capacity: usize,
    /// The cached entries, keyed by `DirEntry` pointer identity (`ArcKey`). Entries are purged
    /// from the front; `FileSystem::did_access_dir_entry` refreshes an entry's position.
    entries: Mutex<LinkedHashMap<ArcKey<DirEntry>, ()>>,
}
107
/// Storage backing the `DirEntry` cache of a `FileSystem`; one variant per `CacheMode`.
enum DirEntryCache {
    /// Holds a strong reference to every created entry until `will_destroy_dir_entry` removes
    /// it; `purge_old_entries` never touches this set.
    Permanent(Mutex<HashSet<ArcKey<DirEntry>>>),
    /// Holds strong references that `purge_old_entries` trims back to the configured capacity.
    Lru(LruCache),
    /// The file system keeps no references to its entries.
    Uncached,
}
113
114/// Configuration for CacheMode::Cached.
115pub struct CacheConfig {
116    pub capacity: usize,
117}
118
119impl Default for CacheConfig {
120    fn default() -> Self {
121        Self { capacity: DEFAULT_LRU_CAPACITY }
122    }
123}
124
/// Selects how a `FileSystem` retains its `DirEntry` objects. Passed to `FileSystem::new`.
pub enum CacheMode {
    /// Entries are permanent, instead of a cache of the backing storage. An example is tmpfs: the
    /// DirEntry tree *is* the backing storage, as opposed to ext4, which uses the DirEntry tree as
    /// a cache and removes unused nodes from it.
    Permanent,
    /// Entries are cached in an LRU cache sized by the given `CacheConfig`.
    Cached(CacheConfig),
    /// Entries are uncached. This can be appropriate in cases where it is difficult for the
    /// filesystem to keep the cache coherent: e.g. the /proc/<pid>/task directory.
    Uncached,
}
136
137impl FileSystem {
138    /// Create a new filesystem.
139    pub fn new<L>(
140        locked: &mut Locked<L>,
141        kernel: &Kernel,
142        cache_mode: CacheMode,
143        ops: impl FileSystemOps,
144        mut options: FileSystemOptions,
145    ) -> Result<FileSystemHandle, Errno>
146    where
147        L: LockEqualOrBefore<FileOpsCore>,
148    {
149        let uses_external_node_ids = ops.uses_external_node_ids();
150        let node_cache = Arc::new(FsNodeCache::new(uses_external_node_ids));
151        assert_eq!(ops.uses_external_node_ids(), node_cache.uses_external_node_ids());
152
153        let mount_options = security::sb_eat_lsm_opts(&kernel, &mut options.params)?;
154        let security_state = security::file_system_init_security(&mount_options, &ops)?;
155
156        let file_system = Arc::new(FileSystem {
157            kernel: kernel.weak_self.clone(),
158            root: OnceLock::new(),
159            ops: Box::new(ops),
160            options,
161            dev_id: kernel.device_registry.next_anonymous_dev_id(locked),
162            rename_mutex: Mutex::new(()),
163            node_cache,
164            dcache: match cache_mode {
165                CacheMode::Permanent => DirEntryCache::Permanent(Mutex::new(HashSet::new())),
166                CacheMode::Cached(CacheConfig { capacity }) => DirEntryCache::Lru(LruCache {
167                    capacity,
168                    entries: Mutex::new(LinkedHashMap::new()),
169                }),
170                CacheMode::Uncached => DirEntryCache::Uncached,
171            },
172            security_state,
173        });
174
175        // TODO: https://fxbug.dev/366405587 - Workaround to allow SELinux to note that this
176        // `FileSystem` needs labeling, once a policy has been loaded.
177        security::file_system_post_init_security(kernel, &file_system);
178
179        Ok(file_system)
180    }
181
182    fn set_root(self: &FileSystemHandle, root: FsNodeHandle) {
183        // No need to cache the root directory, it is owned by the filesystem.
184        let root_dir = DirEntry::new_uncached(root, None, FsString::default());
185        assert!(
186            self.root.set(root_dir).is_ok(),
187            "FileSystem::set_root can't be called more than once"
188        );
189    }
190
191    pub fn has_permanent_entries(&self) -> bool {
192        matches!(self.dcache, DirEntryCache::Permanent(_))
193    }
194
195    /// The root directory entry of this file system.
196    ///
197    /// Panics if this file system does not have a root directory.
198    pub fn root(&self) -> &DirEntryHandle {
199        self.root.get().unwrap_or_else(|| panic!("FileSystem {} has no root", self.name()))
200    }
201
202    /// The root directory entry of this `FileSystem`, if it has one.
203    pub fn maybe_root(&self) -> Option<&DirEntryHandle> {
204        self.root.get()
205    }
206
207    pub fn get_or_create_node<F>(
208        &self,
209        node_key: ino_t,
210        create_fn: F,
211    ) -> Result<FsNodeHandle, Errno>
212    where
213        F: FnOnce() -> Result<FsNodeHandle, Errno>,
214    {
215        self.get_and_validate_or_create_node(node_key, |_| true, create_fn)
216    }
217
218    /// Get a node that is validated with the callback, or create an FsNode for
219    /// this file system.
220    ///
221    /// If node_id is Some, then this function checks the node cache to
222    /// determine whether this node is already open. If so, the function
223    /// returns the existing FsNode if it passes the validation check. If no
224    /// node exists, or a node does but fails the validation check, the function
225    /// calls the given create_fn function to create the FsNode.
226    ///
227    /// If node_id is None, then this function assigns a new identifier number
228    /// and calls the given create_fn function to create the FsNode with the
229    /// assigned number.
230    ///
231    /// Returns Err only if create_fn returns Err.
232    pub fn get_and_validate_or_create_node<V, C>(
233        &self,
234        node_key: ino_t,
235        validate_fn: V,
236        create_fn: C,
237    ) -> Result<FsNodeHandle, Errno>
238    where
239        V: FnOnce(&FsNodeHandle) -> bool,
240        C: FnOnce() -> Result<FsNodeHandle, Errno>,
241    {
242        self.node_cache.get_and_validate_or_create_node(node_key, validate_fn, create_fn)
243    }
244
245    /// File systems that produce their own IDs for nodes should invoke this
246    /// function. The ones who leave to this object to assign the IDs should
247    /// call |create_node_and_allocate_node_id|.
248    pub fn create_node(
249        self: &Arc<Self>,
250        ino: ino_t,
251        ops: impl Into<Box<dyn FsNodeOps>>,
252        info: FsNodeInfo,
253    ) -> FsNodeHandle {
254        let node = FsNode::new_uncached(ino, ops, self, info);
255        self.node_cache.insert_node(&node);
256        node
257    }
258
259    pub fn create_node_and_allocate_node_id(
260        self: &Arc<Self>,
261        ops: impl Into<Box<dyn FsNodeOps>>,
262        info: FsNodeInfo,
263    ) -> FsNodeHandle {
264        let ino = self.allocate_ino();
265        self.create_node(ino, ops, info)
266    }
267
268    /// Create a node for a directory that has no parent.
269    pub fn create_detached_node(
270        self: &Arc<Self>,
271        ino: ino_t,
272        ops: impl Into<Box<dyn FsNodeOps>>,
273        info: FsNodeInfo,
274    ) -> FsNodeHandle {
275        assert!(info.mode.is_dir());
276        let node = FsNode::new_uncached(ino, ops, self, info);
277        self.node_cache.insert_node(&node);
278        node
279    }
280
281    /// Create a root node for the filesystem.
282    ///
283    /// This is a convenience function that creates a root node with the default
284    /// directory mode and root credentials.
285    pub fn create_root(self: &Arc<Self>, ino: ino_t, ops: impl Into<Box<dyn FsNodeOps>>) {
286        let info = FsNodeInfo::new(mode!(IFDIR, 0o777), FsCred::root());
287        self.create_root_with_info(ino, ops, info);
288    }
289
290    pub fn create_root_with_info(
291        self: &Arc<Self>,
292        ino: ino_t,
293        ops: impl Into<Box<dyn FsNodeOps>>,
294        info: FsNodeInfo,
295    ) {
296        let node = self.create_detached_node(ino, ops, info);
297        self.set_root(node);
298    }
299
300    /// Remove the given FsNode from the node cache.
301    ///
302    /// Called from the Release trait of FsNode.
303    pub fn remove_node(&self, node: &FsNode) {
304        self.node_cache.remove_node(node);
305    }
306
307    pub fn allocate_ino(&self) -> ino_t {
308        self.node_cache
309            .allocate_ino()
310            .expect("allocate_ino called on a filesystem that uses external node IDs")
311    }
312
313    /// Allocate a contiguous block of node ids.
314    pub fn allocate_ino_range(&self, size: usize) -> Range<ino_t> {
315        self.node_cache
316            .allocate_ino_range(size)
317            .expect("allocate_ino_range called on a filesystem that uses external node IDs")
318    }
319
320    /// Move |renamed| that is at |old_name| in |old_parent| to |new_name| in |new_parent|
321    /// replacing |replaced|.
322    /// If |replaced| exists and is a directory, this function must check that |renamed| is n
323    /// directory and that |replaced| is empty.
324    pub fn rename<L>(
325        &self,
326        locked: &mut Locked<L>,
327        current_task: &CurrentTask,
328        old_parent: &FsNodeHandle,
329        old_name: &FsStr,
330        new_parent: &FsNodeHandle,
331        new_name: &FsStr,
332        renamed: &FsNodeHandle,
333        replaced: Option<&FsNodeHandle>,
334    ) -> Result<(), Errno>
335    where
336        L: LockEqualOrBefore<FileOpsCore>,
337    {
338        let locked = locked.cast_locked::<FileOpsCore>();
339        self.ops.rename(
340            locked,
341            self,
342            current_task,
343            old_parent,
344            old_name,
345            new_parent,
346            new_name,
347            renamed,
348            replaced,
349        )
350    }
351
352    /// Exchanges `node1` and `node2`. Parent directory node and the corresponding names
353    /// for the two exchanged nodes are passed as `parent1`, `name1`, `parent2`, `name2`.
354    pub fn exchange(
355        &self,
356        current_task: &CurrentTask,
357        node1: &FsNodeHandle,
358        parent1: &FsNodeHandle,
359        name1: &FsStr,
360        node2: &FsNodeHandle,
361        parent2: &FsNodeHandle,
362        name2: &FsStr,
363    ) -> Result<(), Errno> {
364        self.ops.exchange(self, current_task, node1, parent1, name1, node2, parent2, name2)
365    }
366
367    /// Forces a FileSystem unmount.
368    // TODO(https://fxbug.dev/394694891): kernel shutdown should ideally unmount FileSystems via
369    // their drop impl, which should be triggered by Mount.unmount().
370    pub fn force_unmount_ops(&self) {
371        self.ops.unmount();
372    }
373
374    /// Returns the `statfs` for this filesystem.
375    ///
376    /// Each `FileSystemOps` impl is expected to override this to return the specific statfs for
377    /// the filesystem.
378    ///
379    /// Returns `ENOSYS` if the `FileSystemOps` don't implement `stat`.
380    pub fn statfs<L>(
381        &self,
382        locked: &mut Locked<L>,
383        current_task: &CurrentTask,
384    ) -> Result<statfs, Errno>
385    where
386        L: LockEqualOrBefore<FileOpsCore>,
387    {
388        security::sb_statfs(current_task, &self)?;
389        let locked = locked.cast_locked::<FileOpsCore>();
390        let mut stat = self.ops.statfs(locked, self, current_task)?;
391        if stat.f_frsize == 0 {
392            stat.f_frsize = stat.f_bsize as i64;
393        }
394        Ok(stat)
395    }
396
397    pub fn did_create_dir_entry(&self, entry: &DirEntryHandle) {
398        match &self.dcache {
399            DirEntryCache::Permanent(p) => {
400                p.lock().insert(ArcKey(entry.clone()));
401            }
402            DirEntryCache::Lru(LruCache { entries, .. }) => {
403                entries.lock().insert(ArcKey(entry.clone()), ());
404            }
405            DirEntryCache::Uncached => {}
406        }
407    }
408
409    pub fn will_destroy_dir_entry(&self, entry: &DirEntryHandle) {
410        match &self.dcache {
411            DirEntryCache::Permanent(p) => {
412                p.lock().remove(ArcKey::ref_cast(entry));
413            }
414            DirEntryCache::Lru(LruCache { entries, .. }) => {
415                entries.lock().remove(ArcKey::ref_cast(entry));
416            }
417            DirEntryCache::Uncached => {}
418        };
419    }
420
421    /// Informs the cache that the entry was used.
422    pub fn did_access_dir_entry(&self, entry: &DirEntryHandle) {
423        if let DirEntryCache::Lru(LruCache { entries, .. }) = &self.dcache {
424            entries.lock().get_refresh(ArcKey::ref_cast(entry));
425        }
426    }
427
428    /// Purges old entries from the cache. This is done as a separate step to avoid potential
429    /// deadlocks that could occur if done at admission time (where locks might be held that are
430    /// required when dropping old entries). This should be called after any new entries are
431    /// admitted with no locks held that might be required for dropping entries.
432    pub fn purge_old_entries(&self) {
433        if let DirEntryCache::Lru(l) = &self.dcache {
434            let mut purged = SmallVec::<[DirEntryHandle; 4]>::new();
435            {
436                let mut entries = l.entries.lock();
437                while entries.len() > l.capacity {
438                    purged.push(entries.pop_front().unwrap().0.0);
439                }
440            }
441            // Entries will get dropped here whilst we're not holding a lock.
442            std::mem::drop(purged);
443        }
444    }
445
446    /// Returns the `FileSystem`'s `FileSystemOps` as a `&T`, or `None` if the downcast fails.
447    pub fn downcast_ops<T: 'static>(&self) -> Option<&T> {
448        self.ops.as_ref().as_any().downcast_ref()
449    }
450
451    pub fn name(&self) -> &'static FsStr {
452        self.ops.name()
453    }
454
455    pub fn manages_timestamps(&self) -> bool {
456        self.ops.manages_timestamps()
457    }
458
459    /// Returns the crypt service associated with this filesystem, if any. The crypt service
460    /// implements the fuchsia.fxfs.Crypt protocol and maintains an internal structure that maps
461    /// each encryption key id to the actual key.
462    pub fn crypt_service(&self) -> Option<Arc<CryptService>> {
463        self.ops.crypt_service()
464    }
465}
466
/// The filesystem-implementation-specific data for FileSystem.
///
/// Implementations must provide `statfs` and `name`; every other method has a default.
pub trait FileSystemOps: AsAny + Send + Sync + 'static {
    /// Return information about this filesystem.
    ///
    /// A typical implementation looks like this:
    /// ```ignore
    /// Ok(statfs::default(FILE_SYSTEM_MAGIC))
    /// ```
    /// or, if the filesystem wants to customize fields:
    /// ```ignore
    /// Ok(statfs {
    ///     f_blocks: self.blocks,
    ///     ..statfs::default(FILE_SYSTEM_MAGIC)
    /// })
    /// ```
    fn statfs(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
    ) -> Result<statfs, Errno>;

    /// Returns the name of this filesystem. Exposed to callers through `FileSystem::name`.
    fn name(&self) -> &'static FsStr;

    /// Whether this file system uses external node IDs.
    ///
    /// If this is true, then the file system is responsible for assigning node IDs to its nodes.
    /// Otherwise, the VFS will assign node IDs to the nodes.
    ///
    /// The default implementation returns `false`.
    fn uses_external_node_ids(&self) -> bool {
        false
    }

    /// Rename the given node.
    ///
    /// The node to be renamed is passed as "renamed". It currently has
    /// old_name in old_parent. After the rename operation, it should have
    /// new_name in new_parent.
    ///
    /// If new_parent already has a child named new_name, that node is passed as
    /// "replaced". In that case, both "renamed" and "replaced" will be
    /// directories and the rename operation should succeed only if "replaced"
    /// is empty. The VFS will check that there are no children of "replaced" in
    /// the DirEntry cache, but the implementation of this function is
    /// responsible for checking that there are no children of replaced that are
    /// known only to the file system implementation (e.g., present on-disk but
    /// not in the DirEntry cache).
    ///
    /// NOTE(review): the documentation of `FileSystem::rename` only requires the directory
    /// checks "if replaced exists and is a directory"; confirm which statement is the contract.
    ///
    /// The default implementation fails with `EROFS`.
    fn rename(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
        _old_parent: &FsNodeHandle,
        _old_name: &FsStr,
        _new_parent: &FsNodeHandle,
        _new_name: &FsStr,
        _renamed: &FsNodeHandle,
        _replaced: Option<&FsNodeHandle>,
    ) -> Result<(), Errno> {
        error!(EROFS)
    }

    /// Exchange the two given nodes.
    ///
    /// `_parent1`/`_name1` and `_parent2`/`_name2` identify the parent directory node and the
    /// name of `_node1` and `_node2` respectively (see `FileSystem::exchange`).
    ///
    /// The default implementation fails with `EINVAL`.
    fn exchange(
        &self,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
        _node1: &FsNodeHandle,
        _parent1: &FsNodeHandle,
        _name1: &FsStr,
        _node2: &FsNodeHandle,
        _parent2: &FsNodeHandle,
        _name2: &FsStr,
    ) -> Result<(), Errno> {
        error!(EINVAL)
    }

    /// Called when the filesystem is unmounted.
    ///
    /// Invoked both by `FileSystem::force_unmount_ops` and when the `FileSystem` is dropped.
    /// The default implementation does nothing.
    fn unmount(&self) {}

    /// Indicates if the filesystem can manage the timestamps (i.e. ctime and mtime).
    ///
    /// Starnix updates the timestamps in FsNode's `info` directly. However, if the filesystem can
    /// manage the timestamps, then Starnix does not need to do so. `info` will be refreshed with
    /// the timestamps from the filesystem by calling `fetch_and_refresh_info(..)` on the FsNode.
    ///
    /// The default implementation returns `false`.
    fn manages_timestamps(&self) -> bool {
        false
    }

    /// Returns the crypt service associated with this filesystem, if any.
    ///
    /// The default implementation returns `None`.
    fn crypt_service(&self) -> Option<Arc<CryptService>> {
        None
    }
}
559
560impl Drop for FileSystem {
561    fn drop(&mut self) {
562        self.ops.unmount();
563    }
564}
565
/// Shared, reference-counted handle to a `FileSystem`, as returned by `FileSystem::new`.
pub type FileSystemHandle = Arc<FileSystem>;