starnix_core/vfs/
file_system.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::security;
6use crate::task::{CurrentTask, Kernel};
7use crate::vfs::fs_args::MountParams;
8use crate::vfs::fs_node_cache::FsNodeCache;
9use crate::vfs::{
10    DirEntry, DirEntryHandle, FsNode, FsNodeHandle, FsNodeInfo, FsNodeOps, FsStr, FsString,
11};
12use flyweights::FlyByteStr;
13use linked_hash_map::LinkedHashMap;
14use ref_cast::RefCast;
15use smallvec::SmallVec;
16use starnix_crypt::CryptService;
17use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, Mutex};
18use starnix_uapi::arc_key::ArcKey;
19use starnix_uapi::as_any::AsAny;
20use starnix_uapi::auth::FsCred;
21use starnix_uapi::device_type::DeviceType;
22use starnix_uapi::errors::Errno;
23use starnix_uapi::file_mode::mode;
24use starnix_uapi::mount_flags::MountFlags;
25use starnix_uapi::{error, ino_t, statfs};
26use std::collections::HashSet;
27use std::ops::Range;
28use std::sync::{Arc, OnceLock, Weak};
29
30/// A file system that can be mounted in a namespace.
31pub struct FileSystem {
32    pub kernel: Weak<Kernel>,
33    root: OnceLock<DirEntryHandle>,
34    ops: Box<dyn FileSystemOps>,
35
36    /// The options specified when mounting the filesystem. Saved here for display in
37    /// /proc/[pid]/mountinfo.
38    pub options: FileSystemOptions,
39
40    /// The device ID of this filesystem. Returned in the st_dev field when stating an inode in
41    /// this filesystem.
42    pub dev_id: DeviceType,
43
44    /// A file-system global mutex to serialize rename operations.
45    ///
46    /// This mutex is useful because the invariants enforced during a rename
47    /// operation involve many DirEntry objects. In the future, we might be
48    /// able to remove this mutex, but we will need to think carefully about
49    /// how rename operations can interleave.
50    ///
51    /// See DirEntry::rename.
52    pub rename_mutex: Mutex<()>,
53
54    /// The FsNode cache for this file system.
55    ///
56    /// When two directory entries are hard links to the same underlying inode,
57    /// this cache lets us re-use the same FsNode object for both directory
58    /// entries.
59    ///
60    /// Rather than calling FsNode::new directly, file systems should call
61    /// FileSystem::get_or_create_node to see if the FsNode already exists in
62    /// the cache.
63    node_cache: Arc<FsNodeCache>,
64
65    /// DirEntryHandle cache for the filesystem. Holds strong references to DirEntry objects. For
66    /// filesystems with permanent entries, this will hold a strong reference to every node to make
67    /// sure it doesn't get freed without being explicitly unlinked. Otherwise, entries are
68    /// maintained in an LRU cache.
69    dcache: DirEntryCache,
70
71    /// Holds security state for this file system, which is created and used by the Linux Security
72    /// Modules subsystem hooks.
73    pub security_state: security::FileSystemState,
74}
75
76impl std::fmt::Debug for FileSystem {
77    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
78        write!(f, "FileSystem")
79    }
80}
81
82#[derive(Clone, Debug, Default)]
83pub struct FileSystemOptions {
84    /// The source string passed as the first argument to mount(), e.g. a block device.
85    pub source: FlyByteStr,
86    /// Flags kept per-superblock, i.e. included in MountFlags::STORED_ON_FILESYSTEM.
87    pub flags: MountFlags,
88    /// Filesystem options passed as the last argument to mount().
89    pub params: MountParams,
90}
91
92impl FileSystemOptions {
93    pub fn source_for_display(&self) -> &FsStr {
94        if self.source.is_empty() {
95            return "none".into();
96        }
97        self.source.as_ref()
98    }
99}
100
101struct LruCache {
102    capacity: usize,
103    entries: Mutex<LinkedHashMap<ArcKey<DirEntry>, ()>>,
104}
105
106enum DirEntryCache {
107    Permanent(Mutex<HashSet<ArcKey<DirEntry>>>),
108    Lru(LruCache),
109    Uncached,
110}
111
/// Settings that accompany `CacheMode::Cached`.
pub struct CacheConfig {
    /// Capacity of the LRU cache of directory entries.
    pub capacity: usize,
}
116
117pub enum CacheMode {
118    /// Entries are pemanent, instead of a cache of the backing storage. An example is tmpfs: the
119    /// DirEntry tree *is* the backing storage, as opposed to ext4, which uses the DirEntry tree as
120    /// a cache and removes unused nodes from it.
121    Permanent,
122    /// Entries are cached.
123    Cached(CacheConfig),
124    /// Entries are uncached. This can be appropriate in cases where it is difficult for the
125    /// filesystem to keep the cache coherent: e.g. the /proc/<pid>/task directory.
126    Uncached,
127}
128
129impl FileSystem {
130    /// Create a new filesystem.
131    pub fn new<L>(
132        locked: &mut Locked<L>,
133        kernel: &Kernel,
134        cache_mode: CacheMode,
135        ops: impl FileSystemOps,
136        mut options: FileSystemOptions,
137    ) -> Result<FileSystemHandle, Errno>
138    where
139        L: LockEqualOrBefore<FileOpsCore>,
140    {
141        let uses_external_node_ids = ops.uses_external_node_ids();
142        let node_cache = Arc::new(FsNodeCache::new(uses_external_node_ids));
143        assert_eq!(ops.uses_external_node_ids(), node_cache.uses_external_node_ids());
144
145        let mount_options = security::sb_eat_lsm_opts(&kernel, &mut options.params)?;
146        let security_state = security::file_system_init_security(&mount_options, &ops)?;
147
148        let file_system = Arc::new(FileSystem {
149            kernel: kernel.weak_self.clone(),
150            root: OnceLock::new(),
151            ops: Box::new(ops),
152            options,
153            dev_id: kernel.device_registry.next_anonymous_dev_id(locked),
154            rename_mutex: Mutex::new(()),
155            node_cache,
156            dcache: match cache_mode {
157                CacheMode::Permanent => DirEntryCache::Permanent(Mutex::new(HashSet::new())),
158                CacheMode::Cached(CacheConfig { capacity }) => DirEntryCache::Lru(LruCache {
159                    capacity,
160                    entries: Mutex::new(LinkedHashMap::new()),
161                }),
162                CacheMode::Uncached => DirEntryCache::Uncached,
163            },
164            security_state,
165        });
166
167        // TODO: https://fxbug.dev/366405587 - Workaround to allow SELinux to note that this
168        // `FileSystem` needs labeling, once a policy has been loaded.
169        security::file_system_post_init_security(kernel, &file_system);
170
171        Ok(file_system)
172    }
173
174    fn set_root(self: &FileSystemHandle, root: FsNodeHandle) {
175        // No need to cache the root directory, it is owned by the filesystem.
176        let root_dir = DirEntry::new_uncached(root, None, FsString::default());
177        assert!(
178            self.root.set(root_dir).is_ok(),
179            "FileSystem::set_root can't be called more than once"
180        );
181    }
182
183    pub fn has_permanent_entries(&self) -> bool {
184        matches!(self.dcache, DirEntryCache::Permanent(_))
185    }
186
187    /// The root directory entry of this file system.
188    ///
189    /// Panics if this file system does not have a root directory.
190    pub fn root(&self) -> &DirEntryHandle {
191        self.root.get().unwrap_or_else(|| panic!("FileSystem {} has no root", self.name()))
192    }
193
194    /// The root directory entry of this `FileSystem`, if it has one.
195    pub fn maybe_root(&self) -> Option<&DirEntryHandle> {
196        self.root.get()
197    }
198
199    pub fn get_or_create_node<F>(
200        &self,
201        node_key: ino_t,
202        create_fn: F,
203    ) -> Result<FsNodeHandle, Errno>
204    where
205        F: FnOnce() -> Result<FsNodeHandle, Errno>,
206    {
207        self.get_and_validate_or_create_node(node_key, |_| true, create_fn)
208    }
209
210    /// Get a node that is validated with the callback, or create an FsNode for
211    /// this file system.
212    ///
213    /// If node_id is Some, then this function checks the node cache to
214    /// determine whether this node is already open. If so, the function
215    /// returns the existing FsNode if it passes the validation check. If no
216    /// node exists, or a node does but fails the validation check, the function
217    /// calls the given create_fn function to create the FsNode.
218    ///
219    /// If node_id is None, then this function assigns a new identifier number
220    /// and calls the given create_fn function to create the FsNode with the
221    /// assigned number.
222    ///
223    /// Returns Err only if create_fn returns Err.
224    pub fn get_and_validate_or_create_node<V, C>(
225        &self,
226        node_key: ino_t,
227        validate_fn: V,
228        create_fn: C,
229    ) -> Result<FsNodeHandle, Errno>
230    where
231        V: Fn(&FsNodeHandle) -> bool,
232        C: FnOnce() -> Result<FsNodeHandle, Errno>,
233    {
234        self.node_cache.get_and_validate_or_create_node(node_key, validate_fn, create_fn)
235    }
236
237    /// File systems that produce their own IDs for nodes should invoke this
238    /// function. The ones who leave to this object to assign the IDs should
239    /// call |create_node_and_allocate_node_id|.
240    pub fn create_node(
241        self: &Arc<Self>,
242        ino: ino_t,
243        ops: impl Into<Box<dyn FsNodeOps>>,
244        info: FsNodeInfo,
245    ) -> FsNodeHandle {
246        let node = FsNode::new_uncached(ino, ops, self, info);
247        self.node_cache.insert_node(&node);
248        node
249    }
250
251    pub fn create_node_and_allocate_node_id(
252        self: &Arc<Self>,
253        ops: impl Into<Box<dyn FsNodeOps>>,
254        info: FsNodeInfo,
255    ) -> FsNodeHandle {
256        let ino = self.allocate_ino();
257        self.create_node(ino, ops, info)
258    }
259
260    /// Create a node for a directory that has no parent.
261    pub fn create_detached_node(
262        self: &Arc<Self>,
263        ino: ino_t,
264        ops: impl Into<Box<dyn FsNodeOps>>,
265        info: FsNodeInfo,
266    ) -> FsNodeHandle {
267        assert!(info.mode.is_dir());
268        let node = FsNode::new_uncached(ino, ops, self, info);
269        self.node_cache.insert_node(&node);
270        node
271    }
272
273    /// Create a root node for the filesystem.
274    ///
275    /// This is a convenience function that creates a root node with the default
276    /// directory mode and root credentials.
277    pub fn create_root(self: &Arc<Self>, ino: ino_t, ops: impl Into<Box<dyn FsNodeOps>>) {
278        let info = FsNodeInfo::new(mode!(IFDIR, 0o777), FsCred::root());
279        self.create_root_with_info(ino, ops, info);
280    }
281
282    pub fn create_root_with_info(
283        self: &Arc<Self>,
284        ino: ino_t,
285        ops: impl Into<Box<dyn FsNodeOps>>,
286        info: FsNodeInfo,
287    ) {
288        let node = self.create_detached_node(ino, ops, info);
289        self.set_root(node);
290    }
291
292    /// Remove the given FsNode from the node cache.
293    ///
294    /// Called from the Release trait of FsNode.
295    pub fn remove_node(&self, node: &FsNode) {
296        self.node_cache.remove_node(node);
297    }
298
299    pub fn allocate_ino(&self) -> ino_t {
300        self.node_cache
301            .allocate_ino()
302            .expect("allocate_ino called on a filesystem that uses external node IDs")
303    }
304
305    /// Allocate a contiguous block of node ids.
306    pub fn allocate_ino_range(&self, size: usize) -> Range<ino_t> {
307        self.node_cache
308            .allocate_ino_range(size)
309            .expect("allocate_ino_range called on a filesystem that uses external node IDs")
310    }
311
312    /// Move |renamed| that is at |old_name| in |old_parent| to |new_name| in |new_parent|
313    /// replacing |replaced|.
314    /// If |replaced| exists and is a directory, this function must check that |renamed| is n
315    /// directory and that |replaced| is empty.
316    pub fn rename<L>(
317        &self,
318        locked: &mut Locked<L>,
319        current_task: &CurrentTask,
320        old_parent: &FsNodeHandle,
321        old_name: &FsStr,
322        new_parent: &FsNodeHandle,
323        new_name: &FsStr,
324        renamed: &FsNodeHandle,
325        replaced: Option<&FsNodeHandle>,
326    ) -> Result<(), Errno>
327    where
328        L: LockEqualOrBefore<FileOpsCore>,
329    {
330        let locked = locked.cast_locked::<FileOpsCore>();
331        self.ops.rename(
332            locked,
333            self,
334            current_task,
335            old_parent,
336            old_name,
337            new_parent,
338            new_name,
339            renamed,
340            replaced,
341        )
342    }
343
344    /// Exchanges `node1` and `node2`. Parent directory node and the corresponding names
345    /// for the two exchanged nodes are passed as `parent1`, `name1`, `parent2`, `name2`.
346    pub fn exchange(
347        &self,
348        current_task: &CurrentTask,
349        node1: &FsNodeHandle,
350        parent1: &FsNodeHandle,
351        name1: &FsStr,
352        node2: &FsNodeHandle,
353        parent2: &FsNodeHandle,
354        name2: &FsStr,
355    ) -> Result<(), Errno> {
356        self.ops.exchange(self, current_task, node1, parent1, name1, node2, parent2, name2)
357    }
358
359    /// Forces a FileSystem unmount.
360    // TODO(https://fxbug.dev/394694891): kernel shutdown should ideally unmount FileSystems via
361    // their drop impl, which should be triggered by Mount.unmount().
362    pub fn force_unmount_ops(&self) {
363        self.ops.unmount();
364    }
365
366    /// Returns the `statfs` for this filesystem.
367    ///
368    /// Each `FileSystemOps` impl is expected to override this to return the specific statfs for
369    /// the filesystem.
370    ///
371    /// Returns `ENOSYS` if the `FileSystemOps` don't implement `stat`.
372    pub fn statfs<L>(
373        &self,
374        locked: &mut Locked<L>,
375        current_task: &CurrentTask,
376    ) -> Result<statfs, Errno>
377    where
378        L: LockEqualOrBefore<FileOpsCore>,
379    {
380        security::sb_statfs(current_task, &self)?;
381        let locked = locked.cast_locked::<FileOpsCore>();
382        let mut stat = self.ops.statfs(locked, self, current_task)?;
383        if stat.f_frsize == 0 {
384            stat.f_frsize = stat.f_bsize as i64;
385        }
386        Ok(stat)
387    }
388
389    pub fn did_create_dir_entry(&self, entry: &DirEntryHandle) {
390        match &self.dcache {
391            DirEntryCache::Permanent(p) => {
392                p.lock().insert(ArcKey(entry.clone()));
393            }
394            DirEntryCache::Lru(LruCache { entries, .. }) => {
395                entries.lock().insert(ArcKey(entry.clone()), ());
396            }
397            DirEntryCache::Uncached => {}
398        }
399    }
400
401    pub fn will_destroy_dir_entry(&self, entry: &DirEntryHandle) {
402        match &self.dcache {
403            DirEntryCache::Permanent(p) => {
404                p.lock().remove(ArcKey::ref_cast(entry));
405            }
406            DirEntryCache::Lru(LruCache { entries, .. }) => {
407                entries.lock().remove(ArcKey::ref_cast(entry));
408            }
409            DirEntryCache::Uncached => {}
410        };
411    }
412
413    /// Informs the cache that the entry was used.
414    pub fn did_access_dir_entry(&self, entry: &DirEntryHandle) {
415        if let DirEntryCache::Lru(LruCache { entries, .. }) = &self.dcache {
416            entries.lock().get_refresh(ArcKey::ref_cast(entry));
417        }
418    }
419
420    /// Purges old entries from the cache. This is done as a separate step to avoid potential
421    /// deadlocks that could occur if done at admission time (where locks might be held that are
422    /// required when dropping old entries). This should be called after any new entries are
423    /// admitted with no locks held that might be required for dropping entries.
424    pub fn purge_old_entries(&self) {
425        if let DirEntryCache::Lru(l) = &self.dcache {
426            let mut purged = SmallVec::<[DirEntryHandle; 4]>::new();
427            {
428                let mut entries = l.entries.lock();
429                while entries.len() > l.capacity {
430                    purged.push(entries.pop_front().unwrap().0.0);
431                }
432            }
433            // Entries will get dropped here whilst we're not holding a lock.
434            std::mem::drop(purged);
435        }
436    }
437
438    /// Returns the `FileSystem`'s `FileSystemOps` as a `&T`, or `None` if the downcast fails.
439    pub fn downcast_ops<T: 'static>(&self) -> Option<&T> {
440        self.ops.as_ref().as_any().downcast_ref()
441    }
442
443    pub fn name(&self) -> &'static FsStr {
444        self.ops.name()
445    }
446
447    pub fn manages_timestamps(&self) -> bool {
448        self.ops.manages_timestamps()
449    }
450
451    /// Returns the crypt service associated with this filesystem, if any. The crypt service
452    /// implements the fuchsia.fxfs.Crypt protocol and maintains an internal structure that maps
453    /// each encryption key id to the actual key.
454    pub fn crypt_service(&self) -> Option<Arc<CryptService>> {
455        self.ops.crypt_service()
456    }
457}
458
459/// The filesystem-implementation-specific data for FileSystem.
460pub trait FileSystemOps: AsAny + Send + Sync + 'static {
461    /// Return information about this filesystem.
462    ///
463    /// A typical implementation looks like this:
464    /// ```
465    /// Ok(statfs::default(FILE_SYSTEM_MAGIC))
466    /// ```
467    /// or, if the filesystem wants to customize fields:
468    /// ```
469    /// Ok(statfs {
470    ///     f_blocks: self.blocks,
471    ///     ..statfs::default(FILE_SYSTEM_MAGIC)
472    /// })
473    /// ```
474    fn statfs(
475        &self,
476        _locked: &mut Locked<FileOpsCore>,
477        _fs: &FileSystem,
478        _current_task: &CurrentTask,
479    ) -> Result<statfs, Errno>;
480
481    fn name(&self) -> &'static FsStr;
482
483    /// Whether this file system uses external node IDs.
484    ///
485    /// If this is true, then the file system is responsible for assigning node IDs to its nodes.
486    /// Otherwise, the VFS will assign node IDs to the nodes.
487    fn uses_external_node_ids(&self) -> bool {
488        false
489    }
490
491    /// Rename the given node.
492    ///
493    /// The node to be renamed is passed as "renamed". It currently has
494    /// old_name in old_parent. After the rename operation, it should have
495    /// new_name in new_parent.
496    ///
497    /// If new_parent already has a child named new_name, that node is passed as
498    /// "replaced". In that case, both "renamed" and "replaced" will be
499    /// directories and the rename operation should succeed only if "replaced"
500    /// is empty. The VFS will check that there are no children of "replaced" in
501    /// the DirEntry cache, but the implementation of this function is
502    /// responsible for checking that there are no children of replaced that are
503    /// known only to the file system implementation (e.g., present on-disk but
504    /// not in the DirEntry cache).
505    fn rename(
506        &self,
507        _locked: &mut Locked<FileOpsCore>,
508        _fs: &FileSystem,
509        _current_task: &CurrentTask,
510        _old_parent: &FsNodeHandle,
511        _old_name: &FsStr,
512        _new_parent: &FsNodeHandle,
513        _new_name: &FsStr,
514        _renamed: &FsNodeHandle,
515        _replaced: Option<&FsNodeHandle>,
516    ) -> Result<(), Errno> {
517        error!(EROFS)
518    }
519
520    fn exchange(
521        &self,
522        _fs: &FileSystem,
523        _current_task: &CurrentTask,
524        _node1: &FsNodeHandle,
525        _parent1: &FsNodeHandle,
526        _name1: &FsStr,
527        _node2: &FsNodeHandle,
528        _parent2: &FsNodeHandle,
529        _name2: &FsStr,
530    ) -> Result<(), Errno> {
531        error!(EINVAL)
532    }
533
534    /// Called when the filesystem is unmounted.
535    fn unmount(&self) {}
536
537    /// Indicates if the filesystem can manage the timestamps (i.e. ctime and mtime).
538    ///
539    /// Starnix updates the timestamps in FsNode's `info` directly. However, if the filesystem can
540    /// manage the timestamps, then Starnix does not need to do so. `info` will be refreshed with
541    /// the timestamps from the filesystem by calling `fetch_and_refresh_info(..)` on the FsNode.
542    fn manages_timestamps(&self) -> bool {
543        false
544    }
545
546    /// Returns the crypt service associated with this filesystem, if any.
547    fn crypt_service(&self) -> Option<Arc<CryptService>> {
548        None
549    }
550}
551
552impl Drop for FileSystem {
553    fn drop(&mut self) {
554        self.ops.unmount();
555    }
556}
557
558pub type FileSystemHandle = Arc<FileSystem>;