Skip to main content

starnix_core/vfs/
file_system.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::security;
6use crate::task::{CurrentTask, Kernel};
7use crate::vfs::fs_args::MountParams;
8use crate::vfs::fs_node_cache::FsNodeCache;
9use crate::vfs::{
10    DirEntry, DirEntryHandle, FsNode, FsNodeFlags, FsNodeHandle, FsNodeInfo, FsNodeOps, FsStr,
11    FsString,
12};
13use flyweights::FlyByteStr;
14use linked_hash_map::LinkedHashMap;
15use ref_cast::RefCast;
16use smallvec::SmallVec;
17use starnix_crypt::CryptService;
18use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, Mutex};
19use starnix_uapi::arc_key::ArcKey;
20use starnix_uapi::as_any::AsAny;
21use starnix_uapi::auth::FsCred;
22use starnix_uapi::device_id::DeviceId;
23use starnix_uapi::errors::Errno;
24use starnix_uapi::file_mode::mode;
25use starnix_uapi::mount_flags::FileSystemFlags;
26use starnix_uapi::{error, ino_t, statfs};
27use std::collections::HashSet;
28use std::ops::Range;
29use std::sync::{Arc, OnceLock, Weak};
30
/// A file system that can be mounted in a namespace.
pub struct FileSystem {
    /// Weak reference to the kernel this file system was created for.
    pub kernel: Weak<Kernel>,
    /// The root directory entry. Empty until `set_root` runs; it can only be set once.
    root: OnceLock<DirEntryHandle>,
    /// The filesystem-implementation-specific operations.
    ops: Box<dyn FileSystemOps>,

    /// The options specified when mounting the filesystem. Saved here for display in
    /// /proc/[pid]/mountinfo.
    pub options: FileSystemOptions,

    /// The device ID of this filesystem. Returned in the st_dev field when stating an inode in
    /// this filesystem.
    pub dev_id: DeviceId,

    /// A file-system global mutex to serialize rename operations.
    ///
    /// This mutex is useful because the invariants enforced during a rename
    /// operation involve many DirEntry objects. In the future, we might be
    /// able to remove this mutex, but we will need to think carefully about
    /// how rename operations can interleave.
    ///
    /// See DirEntry::rename.
    pub rename_mutex: Mutex<()>,

    /// The FsNode cache for this file system.
    ///
    /// When two directory entries are hard links to the same underlying inode,
    /// this cache lets us re-use the same FsNode object for both directory
    /// entries.
    ///
    /// Rather than calling FsNode::new directly, file systems should call
    /// FileSystem::get_or_create_node to see if the FsNode already exists in
    /// the cache.
    node_cache: Arc<FsNodeCache>,

    /// DirEntryHandle cache for the filesystem. Holds strong references to DirEntry objects. For
    /// filesystems with permanent entries, this will hold a strong reference to every node to make
    /// sure it doesn't get freed without being explicitly unlinked. Otherwise, entries are
    /// maintained in an LRU cache.
    dcache: DirEntryCache,

    /// Holds security state for this file system, which is created and used by the Linux Security
    /// Modules subsystem hooks.
    pub security_state: security::FileSystemState,
}
76
77impl std::fmt::Debug for FileSystem {
78    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79        write!(f, "FileSystem")
80    }
81}
82
/// The options a `FileSystem` is created with: the mount source, the per-superblock flags and
/// the filesystem-specific mount parameters.
#[derive(Clone, Debug, Default)]
pub struct FileSystemOptions {
    /// The source string passed as the first argument to mount(), e.g. a block device.
    pub source: FlyByteStr,
    /// Flags kept per-superblock.
    pub flags: FileSystemFlags,
    /// Filesystem options passed as the last argument to mount().
    pub params: MountParams,
}
92
93impl FileSystemOptions {
94    pub fn source_for_display(&self) -> &FsStr {
95        if self.source.is_empty() {
96            return "none".into();
97        }
98        self.source.as_ref()
99    }
100}
101
/// State of the LRU flavour of the `DirEntry` cache.
struct LruCache {
    /// Number of entries allowed to remain after `FileSystem::purge_old_entries` has run.
    capacity: usize,
    /// Insertion-ordered map used as a set of cached entries. Entries are evicted from the front
    /// and refreshed when `FileSystem::did_access_dir_entry` is called.
    entries: Mutex<LinkedHashMap<ArcKey<DirEntry>, ()>>,
}
106
/// The `DirEntry` cache of a `FileSystem`, built from the `CacheMode` passed to
/// `FileSystem::new`.
enum DirEntryCache {
    /// Every created entry is kept until `FileSystem::will_destroy_dir_entry` removes it.
    Permanent(Mutex<HashSet<ArcKey<DirEntry>>>),
    /// Entries are kept in a bounded LRU cache.
    Lru(LruCache),
    /// No entries are retained by the filesystem.
    Uncached,
}
112
/// Configuration for CacheMode::Cached.
pub struct CacheConfig {
    /// Capacity of the LRU `DirEntry` cache; entries beyond this count are dropped by
    /// `FileSystem::purge_old_entries`.
    pub capacity: usize,
}
117
/// Selects how a `FileSystem` retains its `DirEntry` objects.
pub enum CacheMode {
    /// Entries are permanent, instead of a cache of the backing storage. An example is tmpfs: the
    /// DirEntry tree *is* the backing storage, as opposed to ext4, which uses the DirEntry tree as
    /// a cache and removes unused nodes from it.
    Permanent,
    /// Entries are cached.
    Cached(CacheConfig),
    /// Entries are uncached. This can be appropriate in cases where it is difficult for the
    /// filesystem to keep the cache coherent: e.g. the /proc/<pid>/task directory.
    Uncached,
}
129
130impl FileSystem {
131    /// Create a new filesystem.
132    pub fn new<L>(
133        locked: &mut Locked<L>,
134        kernel: &Kernel,
135        cache_mode: CacheMode,
136        ops: impl FileSystemOps,
137        mut options: FileSystemOptions,
138    ) -> Result<FileSystemHandle, Errno>
139    where
140        L: LockEqualOrBefore<FileOpsCore>,
141    {
142        let uses_external_node_ids = ops.uses_external_node_ids();
143        let node_cache = Arc::new(FsNodeCache::new(uses_external_node_ids));
144        assert_eq!(ops.uses_external_node_ids(), node_cache.uses_external_node_ids());
145
146        let mount_options = security::sb_eat_lsm_opts(&kernel, &mut options.params)?;
147        let security_state = security::file_system_init_security(&mount_options, &ops)?;
148
149        // TODO: https://fxbug.dev/322875215 - Remove this workaround once non-bind MS_REMOUNT is
150        // implemented.
151        if !ops.is_readonly() {
152            // Preserve the old behaviour, that only the per-mount MS_RDONLY flag took effect, by
153            // removing it from the `MountFlags` stored with the `FileSystem`.
154            options.flags &= !FileSystemFlags::RDONLY;
155        }
156
157        let file_system = Arc::new(FileSystem {
158            kernel: kernel.weak_self.clone(),
159            root: OnceLock::new(),
160            ops: Box::new(ops),
161            options,
162            dev_id: kernel.device_registry.next_anonymous_dev_id(locked),
163            rename_mutex: Mutex::new(()),
164            node_cache,
165            dcache: match cache_mode {
166                CacheMode::Permanent => DirEntryCache::Permanent(Mutex::new(HashSet::new())),
167                CacheMode::Cached(CacheConfig { capacity }) => DirEntryCache::Lru(LruCache {
168                    capacity,
169                    entries: Mutex::new(LinkedHashMap::new()),
170                }),
171                CacheMode::Uncached => DirEntryCache::Uncached,
172            },
173            security_state,
174        });
175
176        // TODO: https://fxbug.dev/366405587 - Workaround to allow SELinux to note that this
177        // `FileSystem` needs labeling, once a policy has been loaded.
178        security::file_system_post_init_security(kernel, &file_system);
179
180        Ok(file_system)
181    }
182
183    fn set_root(self: &FileSystemHandle, root: FsNodeHandle) {
184        // No need to cache the root directory, it is owned by the filesystem.
185        let root_dir = DirEntry::new_uncached(root, None, FsString::default());
186        assert!(
187            self.root.set(root_dir).is_ok(),
188            "FileSystem::set_root can't be called more than once"
189        );
190    }
191
192    pub fn has_permanent_entries(&self) -> bool {
193        matches!(self.dcache, DirEntryCache::Permanent(_))
194    }
195
196    /// The root directory entry of this file system.
197    ///
198    /// Panics if this file system does not have a root directory.
199    pub fn root(&self) -> &DirEntryHandle {
200        self.root.get().unwrap_or_else(|| panic!("FileSystem {} has no root", self.name()))
201    }
202
203    /// The root directory entry of this `FileSystem`, if it has one.
204    pub fn maybe_root(&self) -> Option<&DirEntryHandle> {
205        self.root.get()
206    }
207
208    pub fn get_or_create_node<F>(
209        &self,
210        node_key: ino_t,
211        create_fn: F,
212    ) -> Result<FsNodeHandle, Errno>
213    where
214        F: FnOnce() -> Result<FsNodeHandle, Errno>,
215    {
216        self.get_and_validate_or_create_node(node_key, |_| true, create_fn)
217    }
218
    /// Get a node that is validated with the callback, or create an FsNode for
    /// this file system.
    ///
    /// The lookup is delegated to the node cache using `node_key`. Per the contract of this
    /// function, an existing node is returned if it passes `validate_fn`; if no node exists, or
    /// a node does but fails the validation check, `create_fn` is called to create the FsNode.
    ///
    /// NOTE(review): this comment previously described an optional `node_id` for which a new
    /// identifier would be assigned when absent. The signature takes a concrete `node_key`, so
    /// no identifier is allocated here; see `allocate_ino` for allocation.
    ///
    /// Returns Err only if create_fn returns Err (as documented by the original author; confirm
    /// against `FsNodeCache`).
    pub fn get_and_validate_or_create_node<V, C>(
        &self,
        node_key: ino_t,
        validate_fn: V,
        create_fn: C,
    ) -> Result<FsNodeHandle, Errno>
    where
        V: Fn(&FsNodeHandle) -> bool,
        C: FnOnce() -> Result<FsNodeHandle, Errno>,
    {
        self.node_cache.get_and_validate_or_create_node(node_key, validate_fn, create_fn)
    }
245
246    /// File systems that produce their own IDs for nodes should invoke this
247    /// function. The ones who leave to this object to assign the IDs should
248    /// call |create_node_and_allocate_node_id|.
249    pub fn create_node_with_flags(
250        self: &Arc<Self>,
251        ino: Option<ino_t>,
252        ops: impl Into<Box<dyn FsNodeOps>>,
253        info: FsNodeInfo,
254        flags: FsNodeFlags,
255    ) -> FsNodeHandle {
256        let ino = ino.unwrap_or_else(|| self.allocate_ino());
257        let node = FsNode::new_uncached(ino, ops, self, info, flags);
258        self.node_cache.insert_node(&node);
259        node
260    }
261
262    pub fn create_node(
263        self: &Arc<Self>,
264        ino: ino_t,
265        ops: impl Into<Box<dyn FsNodeOps>>,
266        info: FsNodeInfo,
267    ) -> FsNodeHandle {
268        self.create_node_with_flags(Some(ino), ops, info, FsNodeFlags::empty())
269    }
270
271    pub fn create_node_and_allocate_node_id(
272        self: &Arc<Self>,
273        ops: impl Into<Box<dyn FsNodeOps>>,
274        info: FsNodeInfo,
275    ) -> FsNodeHandle {
276        self.create_node_with_flags(None, ops, info, FsNodeFlags::empty())
277    }
278
279    /// Create a node for a directory that has no parent.
280    pub fn create_detached_node(
281        self: &Arc<Self>,
282        ino: ino_t,
283        ops: impl Into<Box<dyn FsNodeOps>>,
284        info: FsNodeInfo,
285    ) -> FsNodeHandle {
286        assert!(info.mode.is_dir());
287        let node = FsNode::new_uncached(ino, ops, self, info, FsNodeFlags::empty());
288        self.node_cache.insert_node(&node);
289        node
290    }
291
292    /// Create a root node for the filesystem.
293    ///
294    /// This is a convenience function that creates a root node with the default
295    /// directory mode and root credentials.
296    pub fn create_root(self: &Arc<Self>, ino: ino_t, ops: impl Into<Box<dyn FsNodeOps>>) {
297        let info = FsNodeInfo::new(mode!(IFDIR, 0o777), FsCred::root());
298        self.create_root_with_info(ino, ops, info);
299    }
300
301    pub fn create_root_with_info(
302        self: &Arc<Self>,
303        ino: ino_t,
304        ops: impl Into<Box<dyn FsNodeOps>>,
305        info: FsNodeInfo,
306    ) {
307        let node = self.create_detached_node(ino, ops, info);
308        self.set_root(node);
309    }
310
311    /// Remove the given FsNode from the node cache.
312    ///
313    /// Called from the Release trait of FsNode.
314    pub fn remove_node(&self, node: &FsNode) {
315        self.node_cache.remove_node(node);
316    }
317
318    pub fn allocate_ino(&self) -> ino_t {
319        self.node_cache
320            .allocate_ino()
321            .expect("allocate_ino called on a filesystem that uses external node IDs")
322    }
323
324    /// Allocate a contiguous block of node ids.
325    pub fn allocate_ino_range(&self, size: usize) -> Range<ino_t> {
326        self.node_cache
327            .allocate_ino_range(size)
328            .expect("allocate_ino_range called on a filesystem that uses external node IDs")
329    }
330
331    /// Move |renamed| that is at |old_name| in |old_parent| to |new_name| in |new_parent|
332    /// replacing |replaced|.
333    /// If |replaced| exists and is a directory, this function must check that |renamed| is n
334    /// directory and that |replaced| is empty.
335    pub fn rename<L>(
336        &self,
337        locked: &mut Locked<L>,
338        current_task: &CurrentTask,
339        old_parent: &FsNodeHandle,
340        old_name: &FsStr,
341        new_parent: &FsNodeHandle,
342        new_name: &FsStr,
343        renamed: &FsNodeHandle,
344        replaced: Option<&FsNodeHandle>,
345    ) -> Result<(), Errno>
346    where
347        L: LockEqualOrBefore<FileOpsCore>,
348    {
349        let locked = locked.cast_locked::<FileOpsCore>();
350        self.ops.rename(
351            locked,
352            self,
353            current_task,
354            old_parent,
355            old_name,
356            new_parent,
357            new_name,
358            renamed,
359            replaced,
360        )
361    }
362
363    /// Exchanges `node1` and `node2`. Parent directory node and the corresponding names
364    /// for the two exchanged nodes are passed as `parent1`, `name1`, `parent2`, `name2`.
365    pub fn exchange(
366        &self,
367        current_task: &CurrentTask,
368        node1: &FsNodeHandle,
369        parent1: &FsNodeHandle,
370        name1: &FsStr,
371        node2: &FsNodeHandle,
372        parent2: &FsNodeHandle,
373        name2: &FsStr,
374    ) -> Result<(), Errno> {
375        self.ops.exchange(self, current_task, node1, parent1, name1, node2, parent2, name2)
376    }
377
378    /// Forces a FileSystem unmount.
379    // TODO(https://fxbug.dev/394694891): kernel shutdown should ideally unmount FileSystems via
380    // their drop impl, which should be triggered by Mount.unmount().
381    pub fn force_unmount_ops(&self) {
382        self.ops.unmount();
383    }
384
385    /// Returns the `statfs` for this filesystem.
386    ///
387    /// Each `FileSystemOps` impl is expected to override this to return the specific statfs for
388    /// the filesystem.
389    ///
390    /// Returns `ENOSYS` if the `FileSystemOps` don't implement `stat`.
391    pub fn statfs<L>(
392        &self,
393        locked: &mut Locked<L>,
394        current_task: &CurrentTask,
395    ) -> Result<statfs, Errno>
396    where
397        L: LockEqualOrBefore<FileOpsCore>,
398    {
399        security::sb_statfs(current_task, &self)?;
400        let locked = locked.cast_locked::<FileOpsCore>();
401        let mut stat = self.ops.statfs(locked, self, current_task)?;
402        if stat.f_frsize == 0 {
403            stat.f_frsize = stat.f_bsize as i64;
404        }
405        Ok(stat)
406    }
407
408    pub fn sync<L>(&self, locked: &mut Locked<L>, current_task: &CurrentTask) -> Result<(), Errno>
409    where
410        L: LockEqualOrBefore<FileOpsCore>,
411    {
412        self.ops.sync(locked.cast_locked::<FileOpsCore>(), self, current_task)
413    }
414
415    pub fn did_create_dir_entry(&self, entry: &DirEntryHandle) {
416        match &self.dcache {
417            DirEntryCache::Permanent(p) => {
418                p.lock().insert(ArcKey(entry.clone()));
419            }
420            DirEntryCache::Lru(LruCache { entries, .. }) => {
421                entries.lock().insert(ArcKey(entry.clone()), ());
422            }
423            DirEntryCache::Uncached => {}
424        }
425    }
426
427    pub fn will_destroy_dir_entry(&self, entry: &DirEntryHandle) {
428        match &self.dcache {
429            DirEntryCache::Permanent(p) => {
430                p.lock().remove(ArcKey::ref_cast(entry));
431            }
432            DirEntryCache::Lru(LruCache { entries, .. }) => {
433                entries.lock().remove(ArcKey::ref_cast(entry));
434            }
435            DirEntryCache::Uncached => {}
436        };
437    }
438
439    /// Informs the cache that the entry was used.
440    pub fn did_access_dir_entry(&self, entry: &DirEntryHandle) {
441        if let DirEntryCache::Lru(LruCache { entries, .. }) = &self.dcache {
442            entries.lock().get_refresh(ArcKey::ref_cast(entry));
443        }
444    }
445
446    /// Purges old entries from the cache. This is done as a separate step to avoid potential
447    /// deadlocks that could occur if done at admission time (where locks might be held that are
448    /// required when dropping old entries). This should be called after any new entries are
449    /// admitted with no locks held that might be required for dropping entries.
450    pub fn purge_old_entries(&self) {
451        if let DirEntryCache::Lru(l) = &self.dcache {
452            let mut purged = SmallVec::<[DirEntryHandle; 4]>::new();
453            {
454                let mut entries = l.entries.lock();
455                while entries.len() > l.capacity {
456                    purged.push(entries.pop_front().unwrap().0.0);
457                }
458            }
459            // Entries will get dropped here whilst we're not holding a lock.
460            std::mem::drop(purged);
461        }
462    }
463
464    /// Returns the `FileSystem`'s `FileSystemOps` as a `&T`, or `None` if the downcast fails.
465    pub fn downcast_ops<T: 'static>(&self) -> Option<&T> {
466        self.ops.as_ref().as_any().downcast_ref()
467    }
468
469    pub fn name(&self) -> &'static FsStr {
470        self.ops.name()
471    }
472
473    pub fn manages_timestamps(&self) -> bool {
474        self.ops.manages_timestamps()
475    }
476
477    /// Returns the crypt service associated with this filesystem, if any. The crypt service
478    /// implements the fuchsia.fxfs.Crypt protocol and maintains an internal structure that maps
479    /// each encryption key id to the actual key.
480    pub fn crypt_service(&self) -> Option<Arc<CryptService>> {
481        self.ops.crypt_service()
482    }
483}
484
/// The filesystem-implementation-specific data for FileSystem.
///
/// Only `statfs` and `name` are required; every other method has a default implementation.
pub trait FileSystemOps: AsAny + Send + Sync + 'static {
    /// Return information about this filesystem.
    ///
    /// A typical implementation looks like this:
    /// ```ignore
    /// Ok(statfs::default(FILE_SYSTEM_MAGIC))
    /// ```
    /// or, if the filesystem wants to customize fields:
    /// ```ignore
    /// Ok(statfs {
    ///     f_blocks: self.blocks,
    ///     ..statfs::default(FILE_SYSTEM_MAGIC)
    /// })
    /// ```
    fn statfs(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
    ) -> Result<statfs, Errno>;

    /// A static name for this filesystem implementation. `FileSystem::root` includes it in its
    /// panic message.
    fn name(&self) -> &'static FsStr;

    /// Whether this file system uses external node IDs.
    ///
    /// If this is true, then the file system is responsible for assigning node IDs to its nodes.
    /// Otherwise, the VFS will assign node IDs to the nodes.
    ///
    /// The default implementation returns `false`.
    fn uses_external_node_ids(&self) -> bool {
        false
    }

    /// Rename the given node.
    ///
    /// The node to be renamed is passed as "renamed". It currently has
    /// old_name in old_parent. After the rename operation, it should have
    /// new_name in new_parent.
    ///
    /// If new_parent already has a child named new_name, that node is passed as
    /// "replaced". In that case, both "renamed" and "replaced" will be
    /// directories and the rename operation should succeed only if "replaced"
    /// is empty. The VFS will check that there are no children of "replaced" in
    /// the DirEntry cache, but the implementation of this function is
    /// responsible for checking that there are no children of replaced that are
    /// known only to the file system implementation (e.g., present on-disk but
    /// not in the DirEntry cache).
    ///
    /// The default implementation fails with `EROFS`.
    fn rename(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
        _old_parent: &FsNodeHandle,
        _old_name: &FsStr,
        _new_parent: &FsNodeHandle,
        _new_name: &FsStr,
        _renamed: &FsNodeHandle,
        _replaced: Option<&FsNodeHandle>,
    ) -> Result<(), Errno> {
        error!(EROFS)
    }

    /// Exchange `_node1` (named `_name1` in `_parent1`) with `_node2` (named `_name2` in
    /// `_parent2`).
    ///
    /// The default implementation fails with `EINVAL`.
    fn exchange(
        &self,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
        _node1: &FsNodeHandle,
        _parent1: &FsNodeHandle,
        _name1: &FsStr,
        _node2: &FsNodeHandle,
        _parent2: &FsNodeHandle,
        _name2: &FsStr,
    ) -> Result<(), Errno> {
        error!(EINVAL)
    }

    /// Called when the filesystem is unmounted.
    ///
    /// Invoked both from `FileSystem::force_unmount_ops` and when the `FileSystem` is dropped.
    /// The default implementation does nothing.
    fn unmount(&self) {}

    /// Indicates if the filesystem can manage the timestamps (i.e. ctime and mtime).
    ///
    /// Starnix updates the timestamps in FsNode's `info` directly. However, if the filesystem can
    /// manage the timestamps, then Starnix does not need to do so. `info` will be refreshed with
    /// the timestamps from the filesystem by calling `fetch_and_refresh_info(..)` on the FsNode.
    ///
    /// The default implementation returns `false`.
    fn manages_timestamps(&self) -> bool {
        false
    }

    /// Returns the crypt service associated with this filesystem, if any.
    ///
    /// The default implementation returns `None`.
    fn crypt_service(&self) -> Option<Arc<CryptService>> {
        None
    }

    /// Called by `FileSystem::sync`.
    ///
    /// The default implementation does nothing and returns `Ok(())`.
    fn sync(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
    ) -> Result<(), Errno> {
        Ok(())
    }

    /// Returns true if the `FileSystemOps` is intrinsically read-only, as is the case for
    /// "remote_bundle", or the "remotefs" mounts to read-only directories.
    ///
    /// The default implementation returns `false`.
    // TODO: https://fxbug.dev/322875215 - Remove this workaround once non-bind MS_REMOUNT is
    // implemented.
    fn is_readonly(&self) -> bool {
        false
    }
}
594
595impl Drop for FileSystem {
596    fn drop(&mut self) {
597        self.ops.unmount();
598    }
599}
600
/// A shared, reference-counted handle to a `FileSystem`, as returned by `FileSystem::new`.
pub type FileSystemHandle = Arc<FileSystem>;