Skip to main content

starnix_core/vfs/
file_system.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::security;
6use crate::task::{CurrentTask, Kernel};
7use crate::vfs::fs_args::MountParams;
8use crate::vfs::fs_node_cache::FsNodeCache;
9use crate::vfs::{
10    DirEntry, DirEntryHandle, FsNode, FsNodeHandle, FsNodeInfo, FsNodeOps, FsStr, FsString,
11};
12use flyweights::FlyByteStr;
13use linked_hash_map::LinkedHashMap;
14use ref_cast::RefCast;
15use smallvec::SmallVec;
16use starnix_crypt::CryptService;
17use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, Mutex};
18use starnix_uapi::arc_key::ArcKey;
19use starnix_uapi::as_any::AsAny;
20use starnix_uapi::auth::FsCred;
21use starnix_uapi::device_id::DeviceId;
22use starnix_uapi::errors::Errno;
23use starnix_uapi::file_mode::mode;
24use starnix_uapi::mount_flags::FileSystemFlags;
25use starnix_uapi::{error, ino_t, statfs};
26use std::collections::HashSet;
27use std::ops::Range;
28use std::sync::{Arc, OnceLock, Weak};
29
/// A file system that can be mounted in a namespace.
///
/// Instances are created by `FileSystem::new`, which hands them out wrapped in an `Arc`
/// (see `FileSystemHandle`).
pub struct FileSystem {
    /// Weak reference to the kernel this file system was created for (cloned from
    /// `kernel.weak_self` in `FileSystem::new`).
    pub kernel: Weak<Kernel>,
    /// The root directory entry. Starts out unset and is filled in at most once by
    /// `set_root`; `root()` panics while it is still unset.
    root: OnceLock<DirEntryHandle>,
    /// The filesystem-implementation-specific operations this object delegates to.
    ops: Box<dyn FileSystemOps>,

    /// The options specified when mounting the filesystem. Saved here for display in
    /// /proc/[pid]/mountinfo.
    pub options: FileSystemOptions,

    /// The device ID of this filesystem. Returned in the st_dev field when stating an inode in
    /// this filesystem.
    pub dev_id: DeviceId,

    /// A file-system global mutex to serialize rename operations.
    ///
    /// This mutex is useful because the invariants enforced during a rename
    /// operation involve many DirEntry objects. In the future, we might be
    /// able to remove this mutex, but we will need to think carefully about
    /// how rename operations can interleave.
    ///
    /// See DirEntry::rename.
    pub rename_mutex: Mutex<()>,

    /// The FsNode cache for this file system.
    ///
    /// When two directory entries are hard links to the same underlying inode,
    /// this cache lets us re-use the same FsNode object for both directory
    /// entries.
    ///
    /// Rather than calling FsNode::new directly, file systems should call
    /// FileSystem::get_or_create_node to see if the FsNode already exists in
    /// the cache.
    node_cache: Arc<FsNodeCache>,

    /// DirEntryHandle cache for the filesystem. Holds strong references to DirEntry objects. For
    /// filesystems with permanent entries, this will hold a strong reference to every node to make
    /// sure it doesn't get freed without being explicitly unlinked. Otherwise, entries are
    /// maintained in an LRU cache.
    dcache: DirEntryCache,

    /// Holds security state for this file system, which is created and used by the Linux Security
    /// Modules subsystem hooks.
    pub security_state: security::FileSystemState,
}
75
76impl std::fmt::Debug for FileSystem {
77    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
78        write!(f, "FileSystem")
79    }
80}
81
/// The mount-time options recorded on a `FileSystem` (see `FileSystem::options`).
#[derive(Clone, Debug, Default)]
pub struct FileSystemOptions {
    /// The source string passed as the first argument to mount(), e.g. a block device.
    /// May be empty; `source_for_display` substitutes "none" in that case.
    pub source: FlyByteStr,
    /// Flags kept per-superblock.
    pub flags: FileSystemFlags,
    /// Filesystem options passed as the last argument to mount().
    pub params: MountParams,
}
91
92impl FileSystemOptions {
93    pub fn source_for_display(&self) -> &FsStr {
94        if self.source.is_empty() {
95            return "none".into();
96        }
97        self.source.as_ref()
98    }
99}
100
/// State backing `DirEntryCache::Lru`.
struct LruCache {
    /// Number of entries `FileSystem::purge_old_entries` trims the cache down to. The map may
    /// temporarily hold more than this between an insertion and the next purge.
    capacity: usize,
    /// The cached entries, used as an ordered set (the value is `()`). Entries are popped from
    /// the front when purging and refreshed via `get_refresh` when accessed.
    entries: Mutex<LinkedHashMap<ArcKey<DirEntry>, ()>>,
}
105
/// The `DirEntry` caching strategy of a `FileSystem`; built from a `CacheMode` in
/// `FileSystem::new`.
enum DirEntryCache {
    /// Every created entry is kept in this set until `will_destroy_dir_entry` removes it.
    Permanent(Mutex<HashSet<ArcKey<DirEntry>>>),
    /// Entries are kept in a bounded cache; see `LruCache`.
    Lru(LruCache),
    /// The file system keeps no references to created entries.
    Uncached,
}
111
/// Configuration for CacheMode::Cached.
pub struct CacheConfig {
    /// Number of directory entries the cache is trimmed down to by
    /// `FileSystem::purge_old_entries`.
    pub capacity: usize,
}
116
/// How a `FileSystem` should cache its directory entries; passed to `FileSystem::new`.
pub enum CacheMode {
    /// Entries are permanent, instead of a cache of the backing storage. An example is tmpfs: the
    /// DirEntry tree *is* the backing storage, as opposed to ext4, which uses the DirEntry tree as
    /// a cache and removes unused nodes from it.
    Permanent,
    /// Entries are cached, bounded by the capacity given in the `CacheConfig`.
    Cached(CacheConfig),
    /// Entries are uncached. This can be appropriate in cases where it is difficult for the
    /// filesystem to keep the cache coherent: e.g. the /proc/<pid>/task directory.
    Uncached,
}
128
129impl FileSystem {
130    /// Create a new filesystem.
131    pub fn new<L>(
132        locked: &mut Locked<L>,
133        kernel: &Kernel,
134        cache_mode: CacheMode,
135        ops: impl FileSystemOps,
136        mut options: FileSystemOptions,
137    ) -> Result<FileSystemHandle, Errno>
138    where
139        L: LockEqualOrBefore<FileOpsCore>,
140    {
141        let uses_external_node_ids = ops.uses_external_node_ids();
142        let node_cache = Arc::new(FsNodeCache::new(uses_external_node_ids));
143        assert_eq!(ops.uses_external_node_ids(), node_cache.uses_external_node_ids());
144
145        let mount_options = security::sb_eat_lsm_opts(&kernel, &mut options.params)?;
146        let security_state = security::file_system_init_security(&mount_options, &ops)?;
147
148        // TODO: https://fxbug.dev/322875215 - Remove this workaround once non-bind MS_REMOUNT is
149        // implemented.
150        if !ops.is_readonly() {
151            // Preserve the old behaviour, that only the per-mount MS_RDONLY flag took effect, by
152            // removing it from the `MountFlags` stored with the `FileSystem`.
153            options.flags &= !FileSystemFlags::RDONLY;
154        }
155
156        let file_system = Arc::new(FileSystem {
157            kernel: kernel.weak_self.clone(),
158            root: OnceLock::new(),
159            ops: Box::new(ops),
160            options,
161            dev_id: kernel.device_registry.next_anonymous_dev_id(locked),
162            rename_mutex: Mutex::new(()),
163            node_cache,
164            dcache: match cache_mode {
165                CacheMode::Permanent => DirEntryCache::Permanent(Mutex::new(HashSet::new())),
166                CacheMode::Cached(CacheConfig { capacity }) => DirEntryCache::Lru(LruCache {
167                    capacity,
168                    entries: Mutex::new(LinkedHashMap::new()),
169                }),
170                CacheMode::Uncached => DirEntryCache::Uncached,
171            },
172            security_state,
173        });
174
175        // TODO: https://fxbug.dev/366405587 - Workaround to allow SELinux to note that this
176        // `FileSystem` needs labeling, once a policy has been loaded.
177        security::file_system_post_init_security(kernel, &file_system);
178
179        Ok(file_system)
180    }
181
182    fn set_root(self: &FileSystemHandle, root: FsNodeHandle) {
183        // No need to cache the root directory, it is owned by the filesystem.
184        let root_dir = DirEntry::new_uncached(root, None, FsString::default());
185        assert!(
186            self.root.set(root_dir).is_ok(),
187            "FileSystem::set_root can't be called more than once"
188        );
189    }
190
191    pub fn has_permanent_entries(&self) -> bool {
192        matches!(self.dcache, DirEntryCache::Permanent(_))
193    }
194
195    /// The root directory entry of this file system.
196    ///
197    /// Panics if this file system does not have a root directory.
198    pub fn root(&self) -> &DirEntryHandle {
199        self.root.get().unwrap_or_else(|| panic!("FileSystem {} has no root", self.name()))
200    }
201
    /// The root directory entry of this `FileSystem`, if it has one.
    ///
    /// Returns `None` until a root has been installed via `set_root`. Unlike `root`, this
    /// never panics.
    pub fn maybe_root(&self) -> Option<&DirEntryHandle> {
        self.root.get()
    }
206
207    pub fn get_or_create_node<F>(
208        &self,
209        node_key: ino_t,
210        create_fn: F,
211    ) -> Result<FsNodeHandle, Errno>
212    where
213        F: FnOnce() -> Result<FsNodeHandle, Errno>,
214    {
215        self.get_and_validate_or_create_node(node_key, |_| true, create_fn)
216    }
217
    /// Get a node that is validated with the callback, or create an FsNode for
    /// this file system.
    ///
    /// This forwards `node_key`, `validate_fn` and `create_fn` unchanged to the
    /// file system's node cache, which does the actual work.
    ///
    /// The cache is checked to determine whether the node for `node_key` is
    /// already open. If so, the existing FsNode is returned if it passes the
    /// validation check. If no node exists, or a node does but fails the
    /// validation check, the given create_fn function is called to create the
    /// FsNode.
    ///
    /// NOTE(review): this comment used to describe an optional `node_id`
    /// (`Some`/`None`, with an id being assigned in the `None` case). The
    /// parameter is a plain `ino_t` here, so that case cannot be expressed
    /// through this signature; use `allocate_ino` to obtain a fresh id.
    ///
    /// Returns Err only if create_fn returns Err.
    pub fn get_and_validate_or_create_node<V, C>(
        &self,
        node_key: ino_t,
        validate_fn: V,
        create_fn: C,
    ) -> Result<FsNodeHandle, Errno>
    where
        V: Fn(&FsNodeHandle) -> bool,
        C: FnOnce() -> Result<FsNodeHandle, Errno>,
    {
        self.node_cache.get_and_validate_or_create_node(node_key, validate_fn, create_fn)
    }
244
245    /// File systems that produce their own IDs for nodes should invoke this
246    /// function. The ones who leave to this object to assign the IDs should
247    /// call |create_node_and_allocate_node_id|.
248    pub fn create_node(
249        self: &Arc<Self>,
250        ino: ino_t,
251        ops: impl Into<Box<dyn FsNodeOps>>,
252        info: FsNodeInfo,
253    ) -> FsNodeHandle {
254        let node = FsNode::new_uncached(ino, ops, self, info);
255        self.node_cache.insert_node(&node);
256        node
257    }
258
259    pub fn create_node_and_allocate_node_id(
260        self: &Arc<Self>,
261        ops: impl Into<Box<dyn FsNodeOps>>,
262        info: FsNodeInfo,
263    ) -> FsNodeHandle {
264        let ino = self.allocate_ino();
265        self.create_node(ino, ops, info)
266    }
267
268    /// Create a node for a directory that has no parent.
269    pub fn create_detached_node(
270        self: &Arc<Self>,
271        ino: ino_t,
272        ops: impl Into<Box<dyn FsNodeOps>>,
273        info: FsNodeInfo,
274    ) -> FsNodeHandle {
275        assert!(info.mode.is_dir());
276        let node = FsNode::new_uncached(ino, ops, self, info);
277        self.node_cache.insert_node(&node);
278        node
279    }
280
281    /// Create a root node for the filesystem.
282    ///
283    /// This is a convenience function that creates a root node with the default
284    /// directory mode and root credentials.
285    pub fn create_root(self: &Arc<Self>, ino: ino_t, ops: impl Into<Box<dyn FsNodeOps>>) {
286        let info = FsNodeInfo::new(mode!(IFDIR, 0o777), FsCred::root());
287        self.create_root_with_info(ino, ops, info);
288    }
289
290    pub fn create_root_with_info(
291        self: &Arc<Self>,
292        ino: ino_t,
293        ops: impl Into<Box<dyn FsNodeOps>>,
294        info: FsNodeInfo,
295    ) {
296        let node = self.create_detached_node(ino, ops, info);
297        self.set_root(node);
298    }
299
    /// Remove the given FsNode from the node cache.
    ///
    /// Called from the Release trait of FsNode. This only forwards to the node cache; it
    /// does not touch the DirEntry cache.
    pub fn remove_node(&self, node: &FsNode) {
        self.node_cache.remove_node(node);
    }
306
    /// Allocates a new node id from the node cache.
    ///
    /// # Panics
    ///
    /// Panics if the node cache cannot allocate an id. Per the panic message, that is the
    /// case for file systems that use external node IDs.
    pub fn allocate_ino(&self) -> ino_t {
        self.node_cache
            .allocate_ino()
            .expect("allocate_ino called on a filesystem that uses external node IDs")
    }
312
    /// Allocate a contiguous block of node ids.
    ///
    /// `size` is forwarded to the node cache as the requested number of ids.
    ///
    /// # Panics
    ///
    /// Panics if the node cache cannot allocate ids. Per the panic message, that is the case
    /// for file systems that use external node IDs.
    pub fn allocate_ino_range(&self, size: usize) -> Range<ino_t> {
        self.node_cache
            .allocate_ino_range(size)
            .expect("allocate_ino_range called on a filesystem that uses external node IDs")
    }
319
320    /// Move |renamed| that is at |old_name| in |old_parent| to |new_name| in |new_parent|
321    /// replacing |replaced|.
322    /// If |replaced| exists and is a directory, this function must check that |renamed| is n
323    /// directory and that |replaced| is empty.
324    pub fn rename<L>(
325        &self,
326        locked: &mut Locked<L>,
327        current_task: &CurrentTask,
328        old_parent: &FsNodeHandle,
329        old_name: &FsStr,
330        new_parent: &FsNodeHandle,
331        new_name: &FsStr,
332        renamed: &FsNodeHandle,
333        replaced: Option<&FsNodeHandle>,
334    ) -> Result<(), Errno>
335    where
336        L: LockEqualOrBefore<FileOpsCore>,
337    {
338        let locked = locked.cast_locked::<FileOpsCore>();
339        self.ops.rename(
340            locked,
341            self,
342            current_task,
343            old_parent,
344            old_name,
345            new_parent,
346            new_name,
347            renamed,
348            replaced,
349        )
350    }
351
    /// Exchanges `node1` and `node2`. Parent directory node and the corresponding names
    /// for the two exchanged nodes are passed as `parent1`, `name1`, `parent2`, `name2`.
    ///
    /// The work is delegated to `FileSystemOps::exchange`, whose default implementation in
    /// this file fails with `EINVAL`.
    pub fn exchange(
        &self,
        current_task: &CurrentTask,
        node1: &FsNodeHandle,
        parent1: &FsNodeHandle,
        name1: &FsStr,
        node2: &FsNodeHandle,
        parent2: &FsNodeHandle,
        name2: &FsStr,
    ) -> Result<(), Errno> {
        self.ops.exchange(self, current_task, node1, parent1, name1, node2, parent2, name2)
    }
366
    /// Forces a FileSystem unmount.
    ///
    /// Calls `FileSystemOps::unmount` directly. The `Drop` impl of `FileSystem` calls
    /// `unmount` as well, so an ops implementation may see the call more than once.
    // TODO(https://fxbug.dev/394694891): kernel shutdown should ideally unmount FileSystems via
    // their drop impl, which should be triggered by Mount.unmount().
    pub fn force_unmount_ops(&self) {
        self.ops.unmount();
    }
373
374    /// Returns the `statfs` for this filesystem.
375    ///
376    /// Each `FileSystemOps` impl is expected to override this to return the specific statfs for
377    /// the filesystem.
378    ///
379    /// Returns `ENOSYS` if the `FileSystemOps` don't implement `stat`.
380    pub fn statfs<L>(
381        &self,
382        locked: &mut Locked<L>,
383        current_task: &CurrentTask,
384    ) -> Result<statfs, Errno>
385    where
386        L: LockEqualOrBefore<FileOpsCore>,
387    {
388        security::sb_statfs(current_task, &self)?;
389        let locked = locked.cast_locked::<FileOpsCore>();
390        let mut stat = self.ops.statfs(locked, self, current_task)?;
391        if stat.f_frsize == 0 {
392            stat.f_frsize = stat.f_bsize as i64;
393        }
394        Ok(stat)
395    }
396
397    pub fn sync<L>(&self, locked: &mut Locked<L>, current_task: &CurrentTask) -> Result<(), Errno>
398    where
399        L: LockEqualOrBefore<FileOpsCore>,
400    {
401        self.ops.sync(locked.cast_locked::<FileOpsCore>(), self, current_task)
402    }
403
404    pub fn did_create_dir_entry(&self, entry: &DirEntryHandle) {
405        match &self.dcache {
406            DirEntryCache::Permanent(p) => {
407                p.lock().insert(ArcKey(entry.clone()));
408            }
409            DirEntryCache::Lru(LruCache { entries, .. }) => {
410                entries.lock().insert(ArcKey(entry.clone()), ());
411            }
412            DirEntryCache::Uncached => {}
413        }
414    }
415
416    pub fn will_destroy_dir_entry(&self, entry: &DirEntryHandle) {
417        match &self.dcache {
418            DirEntryCache::Permanent(p) => {
419                p.lock().remove(ArcKey::ref_cast(entry));
420            }
421            DirEntryCache::Lru(LruCache { entries, .. }) => {
422                entries.lock().remove(ArcKey::ref_cast(entry));
423            }
424            DirEntryCache::Uncached => {}
425        };
426    }
427
428    /// Informs the cache that the entry was used.
429    pub fn did_access_dir_entry(&self, entry: &DirEntryHandle) {
430        if let DirEntryCache::Lru(LruCache { entries, .. }) = &self.dcache {
431            entries.lock().get_refresh(ArcKey::ref_cast(entry));
432        }
433    }
434
435    /// Purges old entries from the cache. This is done as a separate step to avoid potential
436    /// deadlocks that could occur if done at admission time (where locks might be held that are
437    /// required when dropping old entries). This should be called after any new entries are
438    /// admitted with no locks held that might be required for dropping entries.
439    pub fn purge_old_entries(&self) {
440        if let DirEntryCache::Lru(l) = &self.dcache {
441            let mut purged = SmallVec::<[DirEntryHandle; 4]>::new();
442            {
443                let mut entries = l.entries.lock();
444                while entries.len() > l.capacity {
445                    purged.push(entries.pop_front().unwrap().0.0);
446                }
447            }
448            // Entries will get dropped here whilst we're not holding a lock.
449            std::mem::drop(purged);
450        }
451    }
452
    /// Returns the `FileSystem`'s `FileSystemOps` as a `&T`, or `None` if the downcast fails.
    ///
    /// `T` is the concrete ops type the caller expects this file system to use.
    pub fn downcast_ops<T: 'static>(&self) -> Option<&T> {
        // NOTE(review): `as_ref()` reaches through the `Box` before `as_any()` is called,
        // presumably so the downcast sees the ops object rather than the box; confirm.
        self.ops.as_ref().as_any().downcast_ref()
    }
457
    /// Returns the name reported by this file system's `FileSystemOps`.
    pub fn name(&self) -> &'static FsStr {
        self.ops.name()
    }
461
    /// Returns whether the `FileSystemOps` manages timestamps itself; see
    /// `FileSystemOps::manages_timestamps`.
    pub fn manages_timestamps(&self) -> bool {
        self.ops.manages_timestamps()
    }
465
    /// Returns the crypt service associated with this filesystem, if any. The crypt service
    /// implements the fuchsia.fxfs.Crypt protocol and maintains an internal structure that maps
    /// each encryption key id to the actual key.
    ///
    /// The value comes straight from `FileSystemOps::crypt_service`, which returns `None`
    /// unless an implementation overrides it.
    pub fn crypt_service(&self) -> Option<Arc<CryptService>> {
        self.ops.crypt_service()
    }
472}
473
/// The filesystem-implementation-specific data for FileSystem.
///
/// Only `statfs` and `name` are required; every other method has a default implementation.
pub trait FileSystemOps: AsAny + Send + Sync + 'static {
    /// Return information about this filesystem.
    ///
    /// A typical implementation looks like this:
    /// ```
    /// Ok(statfs::default(FILE_SYSTEM_MAGIC))
    /// ```
    /// or, if the filesystem wants to customize fields:
    /// ```
    /// Ok(statfs {
    ///     f_blocks: self.blocks,
    ///     ..statfs::default(FILE_SYSTEM_MAGIC)
    /// })
    /// ```
    fn statfs(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
    ) -> Result<statfs, Errno>;

    /// The name of this filesystem type. Exposed through `FileSystem::name`.
    fn name(&self) -> &'static FsStr;

    /// Whether this file system uses external node IDs.
    ///
    /// If this is true, then the file system is responsible for assigning node IDs to its nodes.
    /// Otherwise, the VFS will assign node IDs to the nodes.
    fn uses_external_node_ids(&self) -> bool {
        false
    }

    /// Rename the given node.
    ///
    /// The node to be renamed is passed as "renamed". It currently has
    /// old_name in old_parent. After the rename operation, it should have
    /// new_name in new_parent.
    ///
    /// If new_parent already has a child named new_name, that node is passed as
    /// "replaced". In that case, both "renamed" and "replaced" will be
    /// directories and the rename operation should succeed only if "replaced"
    /// is empty. The VFS will check that there are no children of "replaced" in
    /// the DirEntry cache, but the implementation of this function is
    /// responsible for checking that there are no children of replaced that are
    /// known only to the file system implementation (e.g., present on-disk but
    /// not in the DirEntry cache).
    ///
    /// The default implementation fails with `EROFS`.
    fn rename(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
        _old_parent: &FsNodeHandle,
        _old_name: &FsStr,
        _new_parent: &FsNodeHandle,
        _new_name: &FsStr,
        _renamed: &FsNodeHandle,
        _replaced: Option<&FsNodeHandle>,
    ) -> Result<(), Errno> {
        error!(EROFS)
    }

    /// Exchange two nodes. `_node1` is named `_name1` in `_parent1`, and `_node2` is named
    /// `_name2` in `_parent2` (see `FileSystem::exchange`).
    ///
    /// The default implementation fails with `EINVAL`.
    fn exchange(
        &self,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
        _node1: &FsNodeHandle,
        _parent1: &FsNodeHandle,
        _name1: &FsStr,
        _node2: &FsNodeHandle,
        _parent2: &FsNodeHandle,
        _name2: &FsStr,
    ) -> Result<(), Errno> {
        error!(EINVAL)
    }

    /// Called when the filesystem is unmounted.
    ///
    /// In this file it is invoked from both `FileSystem::force_unmount_ops` and the `Drop`
    /// impl of `FileSystem`. The default implementation does nothing.
    fn unmount(&self) {}

    /// Indicates if the filesystem can manage the timestamps (i.e. ctime and mtime).
    ///
    /// Starnix updates the timestamps in FsNode's `info` directly. However, if the filesystem can
    /// manage the timestamps, then Starnix does not need to do so. `info` will be refreshed with
    /// the timestamps from the filesystem by calling `fetch_and_refresh_info(..)` on the FsNode.
    fn manages_timestamps(&self) -> bool {
        false
    }

    /// Returns the crypt service associated with this filesystem, if any.
    fn crypt_service(&self) -> Option<Arc<CryptService>> {
        None
    }

    /// Sync this filesystem. Invoked through `FileSystem::sync`.
    ///
    /// The default implementation does nothing and reports success.
    fn sync(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _fs: &FileSystem,
        _current_task: &CurrentTask,
    ) -> Result<(), Errno> {
        Ok(())
    }

    /// Returns true if the `FileSystemOps` is intrinsically read-only, as is the case for
    /// "remote_bundle", or the "remotefs" mounts to read-only directories.
    // TODO: https://fxbug.dev/322875215 - Remove this workaround once non-bind MS_REMOUNT is
    // implemented.
    fn is_readonly(&self) -> bool {
        false
    }
}
583
/// Notifies the `FileSystemOps` that the file system is going away.
impl Drop for FileSystem {
    fn drop(&mut self) {
        // `force_unmount_ops` calls `unmount` too, so ops may already have seen this call.
        self.ops.unmount();
    }
}
589
/// Shared, reference-counted handle to a `FileSystem`; the type returned by `FileSystem::new`.
pub type FileSystemHandle = Arc<FileSystem>;