Skip to main content

starnix_core/vfs/
namespace.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mutable_state::{state_accessor, state_implementation};
6use crate::security;
7use crate::task::{CurrentTask, EventHandler, Kernel, Task, WaitCanceler, Waiter};
8use crate::time::utc;
9use crate::vfs::fs_registry::FsRegistry;
10use crate::vfs::pseudo::dynamic_file::{DynamicFile, DynamicFileBuf, DynamicFileSource};
11use crate::vfs::pseudo::simple_file::SimpleFileNode;
12use crate::vfs::socket::{SocketAddress, SocketHandle, UnixSocket};
13use crate::vfs::{
14    CheckAccessReason, DirEntry, DirEntryHandle, FileHandle, FileObject, FileOps, FileSystemHandle,
15    FileSystemOptions, FileWriteGuardMode, FsContext, FsNode, FsNodeHandle, FsNodeOps, FsStr,
16    FsString, PathBuilder, RenameFlags, SymlinkTarget, UnlinkKind, fileops_impl_dataless,
17    fileops_impl_delegate_read_write_and_seek, fileops_impl_nonseekable, fileops_impl_noop_sync,
18    fs_node_impl_not_dir,
19};
20use fuchsia_rcu::RcuReadScope;
21use macro_rules_attribute::apply;
22use ref_cast::RefCast;
23use starnix_logging::log_warn;
24use starnix_rcu::RcuHashMap;
25use starnix_sync::{
26    BeforeFsNodeAppend, FileOpsCore, LockDepMutex, LockEqualOrBefore, Locked, NamespaceFlagsLock,
27    RwLock, Unlocked,
28};
29use starnix_uapi::arc_key::{ArcKey, PtrKey, WeakKey};
30use starnix_uapi::auth::Credentials;
31use starnix_uapi::device_id::DeviceId;
32use starnix_uapi::errors::Errno;
33use starnix_uapi::file_mode::{AccessCheck, FileMode};
34use starnix_uapi::inotify_mask::InotifyMask;
35use starnix_uapi::mount_flags::{
36    AtomicMountpointFlags, FileSystemFlags, MountFlags, MountpointFlags,
37};
38use starnix_uapi::open_flags::OpenFlags;
39use starnix_uapi::unmount_flags::UnmountFlags;
40use starnix_uapi::vfs::{FdEvents, ResolveFlags};
41use starnix_uapi::{NAME_MAX, errno, error};
42use std::borrow::Borrow;
43use std::collections::HashSet;
44use std::fmt;
45use std::hash::{Hash, Hasher};
46use std::ops::{Deref, DerefMut};
47use std::sync::atomic::Ordering;
48use std::sync::{Arc, Weak};
49
/// A mount namespace.
///
/// The namespace records at which entries filesystems are mounted. It does so through a tree of
/// mounts, of which only the top-most one is stored here.
#[derive(Debug)]
pub struct Namespace {
    /// The mount at the top of this namespace's mount tree. `root()` is resolved through it and
    /// `clone_namespace()` clones the tree starting from it.
    root_mount: MountHandle,

    /// Unique ID of this namespace, taken from `Kernel::get_next_namespace_id()` on creation.
    pub id: u64,
}
60
61impl Namespace {
62    pub fn new(fs: FileSystemHandle) -> Arc<Namespace> {
63        Self::new_with_flags(fs, MountpointFlags::empty())
64    }
65
66    pub fn new_with_flags(fs: FileSystemHandle, flags: MountpointFlags) -> Arc<Namespace> {
67        let kernel = fs.kernel.upgrade().expect("can't create namespace without a kernel");
68        let root_mount = Mount::new(WhatToMount::Fs(fs), flags);
69        Arc::new(Self { root_mount, id: kernel.get_next_namespace_id() })
70    }
71
72    pub fn root(&self) -> NamespaceNode {
73        self.root_mount.root()
74    }
75
76    pub fn clone_namespace(&self) -> Arc<Namespace> {
77        let kernel =
78            self.root_mount.fs.kernel.upgrade().expect("can't clone namespace without a kernel");
79        Arc::new(Self {
80            root_mount: self.root_mount.clone_mount_recursive(),
81            id: kernel.get_next_namespace_id(),
82        })
83    }
84
85    /// Assuming new_ns is a clone of the namespace that node is from, return the equivalent of
86    /// node in new_ns. If this assumption is violated, returns None.
87    pub fn translate_node(mut node: NamespaceNode, new_ns: &Namespace) -> Option<NamespaceNode> {
88        // Collect the list of mountpoints that leads to this node's mount
89        let mut mountpoints = vec![];
90        let mut mount = node.mount;
91        while let Some(mountpoint) = mount.as_ref().and_then(|m| m.read().mountpoint()) {
92            mountpoints.push(mountpoint.entry);
93            mount = mountpoint.mount;
94        }
95
96        // Follow the same path in the new namespace
97        let mut mount = Arc::clone(&new_ns.root_mount);
98        for mountpoint in mountpoints.iter().rev() {
99            let next_mount =
100                mount.read().submounts.get(ArcKey::ref_cast(mountpoint))?.mount.clone();
101            mount = next_mount;
102        }
103        node.mount = Some(mount).into();
104        Some(node)
105    }
106}
107
108impl FsNodeOps for Arc<Namespace> {
109    fs_node_impl_not_dir!();
110
111    fn create_file_ops(
112        &self,
113        _locked: &mut Locked<FileOpsCore>,
114        _node: &FsNode,
115        _current_task: &CurrentTask,
116        _flags: OpenFlags,
117    ) -> Result<Box<dyn FileOps>, Errno> {
118        Ok(Box::new(MountNamespaceFile(self.clone())))
119    }
120}
121
/// File operations object created when a `Namespace` node is opened. It holds a reference to the
/// namespace it was created for.
pub struct MountNamespaceFile(pub Arc<Namespace>);
123
// `MountNamespaceFile` defines no operations of its own: every `FileOps` method is supplied by
// the `fileops_impl_*` helper macros invoked below.
impl FileOps for MountNamespaceFile {
    fileops_impl_nonseekable!();
    fileops_impl_dataless!();
    fileops_impl_noop_sync!();
}
129
/// An empty struct that we use to track the number of active clients for a mount.
///
/// Each active client takes a reference to this object. The unmount operation fails
/// if there are any active clients of the mount, unless `UnmountFlags::DETACH` is passed.
///
/// `Mount::active_clients` derives the client count from the strong count of this `Arc`, minus
/// the one reference held by the `Mount` itself.
type MountClientMarker = Arc<()>;
135
/// An instance of a filesystem mounted in a namespace.
///
/// At a mount, path traversal switches from one filesystem to another.
/// The client sees a composed directory structure that glues together the
/// directories from the underlying FsNodes from those filesystems.
///
/// The mounts in a namespace form a mount tree, with `mountpoint` pointing to the parent and
/// `submounts` pointing to the children.
pub struct Mount {
    /// The directory entry that serves as the root of this mount. For a bind mount this can be
    /// any descendant of the source mount's root.
    root: DirEntryHandle,
    /// The filesystem that `root` belongs to.
    fs: FileSystemHandle,

    /// Holds the flags specific to this mount of the underlying filesystem.
    flags: AtomicMountpointFlags,

    /// Lock used to serialize updates of `flags` to ensure consistency during remount operations.
    flags_lock: LockDepMutex<(), NamespaceFlagsLock>,

    /// A unique identifier for this mount reported in /proc/pid/mountinfo.
    id: u64,

    /// A count of the number of active clients.
    active_client_counter: MountClientMarker,

    /// The mutable part of the mount: its position in the mount tree (`mountpoint`, `submounts`)
    /// and its peer group / upstream membership.
    // Lock ordering: mount -> submount
    state: RwLock<MountState>,
    // Mount used to contain a Weak<Namespace>. It no longer does because since the mount point
    // hash was moved from Namespace to Mount, nothing actually uses it. Now that
    // Namespace::clone_namespace() is implemented in terms of Mount::clone_mount_recursive, it
    // won't be trivial to add it back. If you end up needing to find a Mount's Namespace, I
    // recommend turning the mountpoint field into an enum of Mountpoint or Namespace, maybe called
    // "parent", and then you can traverse up to the top of the tree.
}
/// Shared, reference-counted handle to a `Mount`.
type MountHandle = Arc<Mount>;
170
/// Public representation of the mount options.
///
/// Wraps an optional mount handle and dereferences to it. Two `MountInfo` values compare equal
/// when they refer to the same `Mount` allocation, or when both are detached.
#[derive(Clone, Debug)]
pub struct MountInfo {
    /// The mount this information is about, or `None` for an element that is not tied to any
    /// mount (see `MountInfo::detached`).
    handle: Option<MountHandle>,
}
176
177impl MountInfo {
178    /// `MountInfo` for a element that is not tied to a given mount. Mount flags will be considered
179    /// empty.
180    pub fn detached() -> Self {
181        None.into()
182    }
183
184    /// The mount flags of the represented mount.
185    pub fn flags(&self) -> MountFlags {
186        if let Some(handle) = &self.handle {
187            handle.flags()
188        } else {
189            // Consider not mounted node have the NOATIME flags.
190            MountFlags::NOATIME
191        }
192    }
193
194    /// Checks whether this `MountInfo` represents a writable file system mount.
195    pub fn check_readonly_filesystem(&self) -> Result<(), Errno> {
196        if self.flags().contains(MountFlags::RDONLY) {
197            return error!(EROFS);
198        }
199        Ok(())
200    }
201
202    /// Checks whether this `MountInfo` represents an executable file system mount.
203    pub fn check_noexec_filesystem(&self) -> Result<(), Errno> {
204        if self.flags().contains(MountFlags::NOEXEC) {
205            return error!(EACCES);
206        }
207        Ok(())
208    }
209}
210
211impl Deref for MountInfo {
212    type Target = Option<MountHandle>;
213
214    fn deref(&self) -> &Self::Target {
215        &self.handle
216    }
217}
218
219impl DerefMut for MountInfo {
220    fn deref_mut(&mut self) -> &mut Self::Target {
221        &mut self.handle
222    }
223}
224
225impl std::cmp::PartialEq for MountInfo {
226    fn eq(&self, other: &Self) -> bool {
227        self.handle.as_ref().map(Arc::as_ptr) == other.handle.as_ref().map(Arc::as_ptr)
228    }
229}
230
// Equality on `MountInfo` is pointer identity of the wrapped handle (see the `PartialEq` impl),
// which is reflexive, so `Eq` can be asserted.
impl std::cmp::Eq for MountInfo {}
232
233impl Into<MountInfo> for Option<MountHandle> {
234    fn into(self) -> MountInfo {
235        MountInfo { handle: self }
236    }
237}
238
/// The lock-protected, mutable part of a `Mount`: where it is attached, what is attached to it,
/// and how it takes part in mount propagation.
#[derive(Default)]
pub struct MountState {
    /// The namespace node that this mount is mounted on. This is a tuple instead of a
    /// NamespaceNode because the Mount pointer has to be weak because this is the pointer to the
    /// parent mount, the parent has a pointer to the children too, and making both strong would be
    /// a cycle.
    mountpoint: Option<(Weak<Mount>, DirEntryHandle)>,

    // The set is keyed by the mountpoints which are always descendants of this mount's root.
    // Conceptually, the set is more akin to a map: `DirEntry -> MountHandle`, but we use a set
    // instead because `Submount` has a drop implementation that needs both the key and value.
    //
    // Each directory entry can only have one mount attached. Mount shadowing works by using the
    // root of the inner mount as a mountpoint. For example, if filesystem A is mounted at /foo,
    // mounting filesystem B on /foo will create the mount as a child of the A mount, attached to
    // A's root, instead of the root mount.
    /// The mounts attached below this mount, looked up by the directory entry they cover.
    submounts: HashSet<Submount>,

    /// The membership of this mount in its peer group. Do not access directly. Instead use
    /// peer_group(), take_from_peer_group(), and set_peer_group().
    // TODO(tbodt): Refactor the links into some kind of extra struct or something? This is hard
    // because setting this field requires the Arc<Mount>.
    peer_group_: Option<(Arc<PeerGroup>, PtrKey<Mount>)>,
    /// The membership of this mount in a PeerGroup's downstream. Do not access directly. Instead
    /// use upstream(), take_from_upstream(), and set_upstream().
    upstream_: Option<(Weak<PeerGroup>, PtrKey<Mount>)>,
}
266
/// A group of mounts. Setting MS_SHARED on a mount puts it in its own peer group. Any bind mounts
/// of a mount in the group are also added to the group. A mount created in any mount in a peer
/// group will be automatically propagated (recreated) in every other mount in the group.
#[derive(Default)]
struct PeerGroup {
    /// Identifier of the group; `make_shared` allocates it from
    /// `Kernel::get_next_peer_group_id()`.
    id: u64,
    /// The member and downstream sets, behind their own lock.
    state: RwLock<PeerGroupState>,
}
/// The lock-protected contents of a `PeerGroup`. Both sets hold their mounts weakly.
#[derive(Default)]
struct PeerGroupState {
    /// The mounts that are members of the group (maintained by `PeerGroup::add` / `remove`).
    mounts: HashSet<WeakKey<Mount>>,
    /// The mounts that have this group as their upstream (maintained by
    /// `PeerGroup::add_downstream` / `remove_downstream`).
    downstream: HashSet<WeakKey<Mount>>,
}
280
/// Describes the source of a new mount, as consumed by `Mount::new`.
pub enum WhatToMount {
    /// Mount the root of the given filesystem.
    Fs(FileSystemHandle),
    /// Bind mount: create a mount rooted at the given node by cloning the mount the node belongs
    /// to. `Mount::new` panics if the node is not part of a mount.
    Bind(NamespaceNode),
}
285
/// Describes the mount that `Mount::create_submount` should attach.
enum WhatSubmount {
    /// Build a fresh mount with `Mount::new` from the given source and flags.
    New(WhatToMount, MountpointFlags),
    /// Attach a mount that already exists (used by `Mount::move_mount`).
    Existing(MountHandle),
}
290
291impl Mount {
292    pub fn new(what: WhatToMount, mut flags: MountpointFlags) -> MountHandle {
293        match what {
294            WhatToMount::Fs(fs) => {
295                // If `flags` does not explicitly specify an access-time flag then default to `RELATIME`.
296                flags.default_atime_from(MountpointFlags::RELATIME);
297                Self::new_with_root(fs.root().clone(), flags)
298            }
299            WhatToMount::Bind(node) => {
300                let mount = node.mount.as_ref().expect("can't bind mount from an anonymous node");
301                mount.clone_mount(&node.entry, flags.into())
302            }
303        }
304    }
305
306    fn new_with_root(root: DirEntryHandle, flags: MountpointFlags) -> MountHandle {
307        let fs = root.node.fs();
308        let kernel = fs.kernel.upgrade().expect("can't create mount without kernel");
309        Arc::new(Self {
310            id: kernel.get_next_mount_id(),
311            flags: (flags & MountpointFlags::STORED_ON_MOUNT).into(),
312            flags_lock: LockDepMutex::new(()),
313            root,
314            active_client_counter: Default::default(),
315            fs,
316            state: Default::default(),
317        })
318    }
319
320    /// A namespace node referring to the root of the mount.
321    pub fn root(self: &MountHandle) -> NamespaceNode {
322        NamespaceNode::new(Arc::clone(self), Arc::clone(&self.root))
323    }
324
    /// Create the specified mount as a child. Also propagate it to the mount's peer group.
    ///
    /// `dir` is the directory entry to attach to and `what` says whether to build a new mount or
    /// attach an existing one. Every propagation target other than `self` receives its own
    /// recursive clone of the mount; `self` receives the mount itself. If `self` is shared, the
    /// new mount is made shared before it is cloned.
    fn create_submount(self: &MountHandle, dir: &DirEntryHandle, what: WhatSubmount) {
        // TODO(b/482453480): Making a copy here is necessary for lock ordering, because the peer
        // group lock nests inside all mount locks (it would be impractical to reverse this because
        // you need to lock a mount to get its peer group.) But it opens the door to race conditions
        // where if a peer is concurrently being added, the mount might not get propagated to the
        // new peer. The only true solution to this is bigger locks, somehow using the same lock for
        // the peer group and all of the mounts in the group. Since peer groups are fluid and can
        // have mounts constantly joining and leaving and then joining other groups, the only
        // sensible locking option is to use a single global lock for all mounts and peer groups.
        // This is almost impossible to express in rust. Help.
        //
        // Update: Also necessary to make a copy to prevent excess replication, see the comment on
        // the following Mount::new call.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        // Create the mount after copying the peer list, because in the case of creating a bind
        // mount inside itself, the new mount would get added to our peer group during the
        // Mount::new call, but we don't want to replicate into it already. For an example see
        // MountTest.QuizBRecursion.
        let mount = match what {
            WhatSubmount::Existing(mount) => mount,
            WhatSubmount::New(what, flags) => Mount::new(what, flags),
        };

        // A mount created under a shared mount is itself shared.
        if self.read().is_shared() {
            mount.write().make_shared();
        }

        for peer in peers {
            // `self` is part of its own propagation targets; it gets the original mount below.
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            let clone = mount.clone_mount_recursive();
            peer.write().add_submount_internal(dir, clone);
        }

        self.write().add_submount_internal(dir, mount)
    }
367
    /// Detaches the submount keyed by `mount_hash_key` from this mount and, on a best-effort
    /// basis, from every other propagation target of this mount's peer group.
    ///
    /// Failures on the propagation targets are ignored; the returned result is that of the removal
    /// from `self`, which is `EINVAL` when `self` has no submount under that key.
    fn remove_submount(self: &MountHandle, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
        // create_submount explains why we need to make a copy of peers.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        for peer in peers {
            // `self` is handled after the loop.
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            // mount_namespaces(7): If B is shared, then all most-recently-mounted mounts at b on
            // mounts that receive propagation from mount B and do not have submounts under them are
            // unmounted.
            let mut peer = peer.write();
            if let Some(submount) = peer.submounts.get(mount_hash_key) {
                // Leave the peer's copy in place when something is still mounted below it.
                if !submount.mount.read().submounts.is_empty() {
                    continue;
                }
            }
            // The peer may not have a mount under this key at all; that is not an error here.
            let _ = peer.remove_submount_internal(mount_hash_key);
        }

        self.write().remove_submount_internal(mount_hash_key)
    }
393
    /// Moves `source_mount` from its current mountpoint to `target_dir` inside `target_mount`.
    ///
    /// # Errors
    ///
    /// * `EIO` if `source_mount` is not currently mounted anywhere.
    /// * `EINVAL` if the parent of `source_mount` is in a peer group, or if the parent does not
    ///   list `source_mount` under its recorded mountpoint.
    ///
    /// # Panics
    ///
    /// Panics if the recorded mountpoint of `source_mount` is not part of a mount.
    pub fn move_mount(
        source_mount: &MountHandle,
        target_mount: &MountHandle,
        target_dir: &DirEntryHandle,
    ) -> Result<(), Errno> {
        // TODO(b/482453480): Moving a mount is supposed to be atomic, but this isn't. Trying to
        // think of a way to ensure full atomicity in the current locking model led to a train of
        // thought of spiraling complexity (you need to lock source_parent before source_mount, but
        // you need to lock source_mount in order to get a reference to source_parent, and someone
        // could move the mount again in between these operations, so you need to retry this.) So
        // I'm settling for not trying for atomicity, plus a TODO comment.
        let source_mountpoint = source_mount.read().mountpoint().ok_or_else(|| errno!(EIO))?;
        let source_parent =
            source_mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");

        // First, disconnect the mount from its parent.
        {
            // Parent is locked before the child, matching the mount -> submount lock order.
            let mut source_parent = source_parent.write();
            if source_parent.peer_group().is_some() {
                // Sayeth mount(2):
                // EINVAL A move operation (MS_MOVE) was attempted, but the parent mount of source
                //        mount has propagation type MS_SHARED.
                return error!(EINVAL);
            }
            let mut source_mount = source_mount.write();
            source_parent.remove_submount_internal(source_mountpoint.mount_hash_key())?;
            // Clearing the mountpoint lets the mount be attached again below.
            source_mount.mountpoint = None;
        }

        // Then attach it at the target; this also propagates it to the target's peers.
        target_mount.create_submount(target_dir, WhatSubmount::Existing(Arc::clone(source_mount)));
        Ok(())
    }
426
    /// Create a new mount with the same filesystem, flags, and peer group. Used to implement bind
    /// mounts.
    ///
    /// `new_root` becomes the root of the clone. Of `flags`, only `MountFlags::REC` is consulted:
    /// when set, each submount of `self` is cloned recursively and offered to the clone.
    ///
    /// # Panics
    ///
    /// Panics if `new_root` is not a descendant of this mount's root.
    fn clone_mount(
        self: &MountHandle,
        new_root: &DirEntryHandle,
        flags: MountFlags,
    ) -> MountHandle {
        assert!(new_root.is_descendant_of(&self.root));
        // According to mount(2) on bind mounts, all flags other than MS_REC are ignored when doing
        // a bind mount.
        let clone = Self::new_with_root(Arc::clone(new_root), self.mount_flags());

        if flags.contains(MountFlags::REC) {
            // This is two steps because the alternative (locking clone.state while iterating over
            // self.state.submounts) trips tracing_mutex. The lock ordering is parent -> child, and
            // if the clone is eventually made a child of self, this looks like an ordering
            // violation. I'm not convinced it's a real issue, but I can't convince myself it's not
            // either.
            let mut submounts = vec![];
            for Submount { dir, mount } in &self.state.read().submounts {
                submounts.push((dir.clone(), mount.clone_mount_recursive()));
            }
            let mut clone_state = clone.write();
            for (dir, submount) in submounts {
                // `add_submount_internal` ignores entries that do not lie below the clone's root.
                clone_state.add_submount_internal(&dir, submount);
            }
        }

        // Put the clone in the same peer group
        let peer_group = self.state.read().peer_group().map(Arc::clone);
        if let Some(peer_group) = peer_group {
            clone.write().set_peer_group(peer_group);
        }

        clone
    }
463
464    /// Do a clone of the full mount hierarchy below this mount. Used for creating mount
465    /// namespaces and creating copies to use for propagation.
466    fn clone_mount_recursive(self: &MountHandle) -> MountHandle {
467        self.clone_mount(&self.root, MountFlags::REC)
468    }
469
470    pub fn change_propagation(self: &MountHandle, flag: MountFlags, recursive: bool) {
471        let mut state = self.write();
472        match flag {
473            MountFlags::SHARED => state.make_shared(),
474            MountFlags::PRIVATE => state.make_private(),
475            MountFlags::DOWNSTREAM => state.make_downstream(),
476            _ => {
477                log_warn!("mount propagation {:?}", flag);
478                return;
479            }
480        }
481
482        if recursive {
483            for submount in &state.submounts {
484                submount.mount.change_propagation(flag, recursive);
485            }
486        }
487    }
488
489    /// Returns the effective flags for the `Mount`, calculated as the union of the mount flags
490    /// associated with the `FileSystem`, and with the `Mount` itself.
491    fn flags(&self) -> MountFlags {
492        MountFlags::from(self.mount_flags()) | self.fs_flags().into()
493    }
494
495    /// Returns the mount flags stored unique to this `Mount`.
496    fn mount_flags(&self) -> MountpointFlags {
497        self.flags.load(Ordering::Relaxed)
498    }
499
500    /// Returns the mount flags for the `FileSystem` of this `Mount`.
501    fn fs_flags(&self) -> FileSystemFlags {
502        self.fs.options.flags.load(Ordering::Relaxed)
503    }
504
505    /// Updates the `Mount` with the per-mount flags specified in `flags`, while preserving the
506    /// existing access-time flag if no access-time flag is set in `flags`.
507    pub fn update_flags(self: &MountHandle, mut flags: MountpointFlags) {
508        let _lock = self.flags_lock.lock();
509        // Since Linux 3.17, if none of MS_NOATIME, MS_NODIRATIME,
510        // MS_RELATIME, or MS_STRICTATIME is specified in mountflags, then
511        // the remount operation preserves the existing values of these
512        // flags (rather than defaulting to MS_RELATIME).
513        flags.default_atime_from(self.flags.load(Ordering::Relaxed));
514        flags &= MountpointFlags::STORED_ON_MOUNT;
515        self.flags.store(flags, Ordering::Relaxed);
516    }
517
518    /// The number of active clients of this mount.
519    ///
520    /// The mount cannot be unmounted if there are any active clients.
521    fn active_clients(&self) -> usize {
522        // We need to subtract one for our own reference. We are not a real client.
523        Arc::strong_count(&self.active_client_counter) - 1
524    }
525
526    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
527        if !flags.contains(UnmountFlags::DETACH) {
528            if self.active_clients() > 0 || !self.state.read().submounts.is_empty() {
529                return error!(EBUSY);
530            }
531        }
532        let mountpoint = self.state.read().mountpoint().ok_or_else(|| errno!(EINVAL))?;
533        let parent_mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
534        parent_mount.remove_submount(mountpoint.mount_hash_key())
535    }
536
537    /// Returns the security state of the fs.
538    pub fn security_state(&self) -> &security::FileSystemState {
539        &self.fs.security_state
540    }
541
542    /// Returns the name of the fs.
543    pub fn fs_name(&self) -> &'static FsStr {
544        self.fs.name()
545    }
546
547    /// Reconfigures the flags for the `FileSystem` backing this mount point.
548    pub fn reconfigure_fs(
549        &self,
550        current_task: &CurrentTask,
551        flags: FileSystemFlags,
552    ) -> Result<(), Errno> {
553        self.fs.update_flags(current_task, flags)
554    }
555
    // Accessors for the `state` lock. Presumably this is what provides the `read()` and `write()`
    // methods called on mount handles throughout this file -- confirm against the macro
    // definition in `crate::mutable_state`.
    state_accessor!(Mount, state, Arc<Mount>);
557}
558
559impl MountState {
560    /// Returns true if there is a submount on top of `dir_entry`.
561    pub fn has_submount(&self, dir_entry: &DirEntryHandle) -> bool {
562        self.submounts.contains(ArcKey::ref_cast(dir_entry))
563    }
564
565    /// The NamespaceNode on which this Mount is mounted.
566    fn mountpoint(&self) -> Option<NamespaceNode> {
567        let (mount, entry) = self.mountpoint.as_ref()?;
568        Some(NamespaceNode::new(mount.upgrade()?, entry.clone()))
569    }
570
571    /// Return this mount's current peer group.
572    fn peer_group(&self) -> Option<&Arc<PeerGroup>> {
573        let (group, _) = self.peer_group_.as_ref()?;
574        Some(group)
575    }
576
577    /// Remove this mount from its peer group and return the peer group.
578    fn take_from_peer_group(&mut self) -> Option<Arc<PeerGroup>> {
579        let (old_group, old_mount) = self.peer_group_.take()?;
580        old_group.remove(old_mount);
581        if let Some(upstream) = self.take_from_upstream() {
582            let next_mount =
583                old_group.state.read().mounts.iter().next().map(|w| w.0.upgrade().unwrap());
584            if let Some(next_mount) = next_mount {
585                // TODO(https://fxbug.dev/42065259): Fix the lock ordering here. We've locked next_mount
586                // while self is locked, and since the propagation tree and mount tree are
587                // separate, this could violate the mount -> submount order previously established.
588                next_mount.write().set_upstream(upstream);
589            }
590        }
591        Some(old_group)
592    }
593
594    fn upstream(&self) -> Option<Arc<PeerGroup>> {
595        self.upstream_.as_ref().and_then(|g| g.0.upgrade())
596    }
597
598    fn take_from_upstream(&mut self) -> Option<Arc<PeerGroup>> {
599        let (old_upstream, old_mount) = self.upstream_.take()?;
600        // TODO(tbodt): Reason about whether the upgrade() could possibly return None, and what we
601        // should actually do in that case.
602        let old_upstream = old_upstream.upgrade()?;
603        old_upstream.remove_downstream(old_mount);
604        Some(old_upstream)
605    }
606}
607
608#[apply(state_implementation!)]
609impl MountState<Base = Mount, BaseType = Arc<Mount>> {
    /// Add a child mount *without propagating it to the peer group*. For internal use only.
    ///
    /// `dir` is the directory entry to attach `mount` to. The call does nothing (and `mount` is
    /// dropped) when `dir` is not a descendant of this mount's root. If another mount is already
    /// attached at `dir`, that mount is re-attached underneath the new one.
    ///
    /// # Panics
    ///
    /// Panics if `mount` already has a mountpoint, or if the kernel referenced by its filesystem
    /// is no longer alive.
    fn add_submount_internal(&mut self, dir: &DirEntryHandle, mount: MountHandle) {
        if !dir.is_descendant_of(&self.base.root) {
            return;
        }

        let submount = mount.fs.kernel.upgrade().unwrap().mounts.register_mount(dir, mount.clone());
        // Point the new mount at its parent (weakly, to avoid a reference cycle).
        let old_mountpoint =
            mount.state.write().mountpoint.replace((Arc::downgrade(self.base), Arc::clone(dir)));
        assert!(old_mountpoint.is_none(), "add_submount can only take a newly created mount");
        // Mount shadowing is implemented by mounting onto the root of the first mount, not by
        // creating two mounts on the same mountpoint.
        let old_mount = self.submounts.replace(submount);

        // In rare cases, mount propagation might result in a request to mount on a directory where
        // something is already mounted. MountTest.LotsOfShadowing will trigger this. Linux handles
        // this by inserting the new mount between the old mount and the current mount.
        if let Some(mut old_mount) = old_mount {
            // Previous state: self[dir] = old_mount
            // New state: self[dir] = new_mount, new_mount[new_mount.root] = old_mount
            // The new mount has already been inserted into self, now just update the old mount to
            // be a child of the new mount.
            // NOTE(review): the entry recorded as the old mount's mountpoint is `dir`, while the
            // key it is stored under in the new mount is `mount.root` -- confirm this asymmetry is
            // intended.
            old_mount.mount.write().mountpoint = Some((Arc::downgrade(&mount), Arc::clone(dir)));
            old_mount.dir = ArcKey(mount.root.clone());
            mount.write().submounts.insert(old_mount);
        }
    }
637
638    fn remove_submount_internal(&mut self, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
639        if self.submounts.remove(mount_hash_key) { Ok(()) } else { error!(EINVAL) }
640    }
641
642    /// Set this mount's peer group.
643    fn set_peer_group(&mut self, group: Arc<PeerGroup>) {
644        self.take_from_peer_group();
645        group.add(self.base);
646        self.peer_group_ = Some((group, Arc::as_ptr(self.base).into()));
647    }
648
649    fn set_upstream(&mut self, group: Arc<PeerGroup>) {
650        self.take_from_upstream();
651        group.add_downstream(self.base);
652        self.upstream_ = Some((Arc::downgrade(&group), Arc::as_ptr(self.base).into()));
653    }
654
655    /// Is the mount in a peer group? Corresponds to MS_SHARED.
656    pub fn is_shared(&self) -> bool {
657        self.peer_group().is_some()
658    }
659
660    /// Put the mount in a peer group. Implements MS_SHARED.
661    pub fn make_shared(&mut self) {
662        if self.is_shared() {
663            return;
664        }
665        let kernel =
666            self.base.fs.kernel.upgrade().expect("can't create new peer group without kernel");
667        self.set_peer_group(PeerGroup::new(kernel.get_next_peer_group_id()));
668    }
669
670    /// Take the mount out of its peer group, also remove upstream if any. Implements MS_PRIVATE.
671    pub fn make_private(&mut self) {
672        self.take_from_peer_group();
673        self.take_from_upstream();
674    }
675
676    /// Take the mount out of its peer group and make it downstream instead. Implements
677    /// MountFlags::DOWNSTREAM (MS_SLAVE).
678    pub fn make_downstream(&mut self) {
679        if let Some(peer_group) = self.take_from_peer_group() {
680            self.set_upstream(peer_group);
681        }
682    }
683}
684
685impl PeerGroup {
686    fn new(id: u64) -> Arc<Self> {
687        Arc::new(Self { id, state: Default::default() })
688    }
689
690    fn add(&self, mount: &Arc<Mount>) {
691        self.state.write().mounts.insert(WeakKey::from(mount));
692    }
693
694    fn remove(&self, mount: PtrKey<Mount>) {
695        self.state.write().mounts.remove(&mount);
696    }
697
698    fn add_downstream(&self, mount: &Arc<Mount>) {
699        self.state.write().downstream.insert(WeakKey::from(mount));
700    }
701
702    fn remove_downstream(&self, mount: PtrKey<Mount>) {
703        self.state.write().downstream.remove(&mount);
704    }
705
706    fn copy_propagation_targets(&self) -> Vec<MountHandle> {
707        let mut buf = vec![];
708        self.collect_propagation_targets(&mut buf);
709        buf
710    }
711
712    fn collect_propagation_targets(&self, buf: &mut Vec<MountHandle>) {
713        let downstream_mounts: Vec<_> = {
714            let state = self.state.read();
715            buf.extend(state.mounts.iter().filter_map(|m| m.0.upgrade()));
716            state.downstream.iter().filter_map(|m| m.0.upgrade()).collect()
717        };
718        for mount in downstream_mounts {
719            let peer_group = mount.read().peer_group().map(Arc::clone);
720            match peer_group {
721                Some(group) => group.collect_propagation_targets(buf),
722                None => buf.push(mount),
723            }
724        }
725    }
726}
727
728impl Drop for Mount {
729    fn drop(&mut self) {
730        let state = self.state.get_mut();
731        state.take_from_peer_group();
732        state.take_from_upstream();
733    }
734}
735
736impl fmt::Debug for Mount {
737    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
738        let state = self.state.read();
739        f.debug_struct("Mount")
740            .field("id", &(self as *const Mount))
741            .field("root", &self.root)
742            .field("mountpoint", &state.mountpoint)
743            .field("submounts", &state.submounts)
744            .finish()
745    }
746}
747
impl Kernel {
    /// Returns the next value from the kernel's `next_mount_id` generator.
    ///
    /// NOTE(review): presumably each call yields a fresh, unique id; confirm against the
    /// generator type.
    pub fn get_next_mount_id(&self) -> u64 {
        self.next_mount_id.next()
    }

    /// Returns the next value from the kernel's `next_peer_group_id` generator.
    pub fn get_next_peer_group_id(&self) -> u64 {
        self.next_peer_group_id.next()
    }

    /// Returns the next value from the kernel's `next_namespace_id` generator.
    pub fn get_next_namespace_id(&self) -> u64 {
        self.next_namespace_id.next()
    }
}
761
762impl CurrentTask {
763    pub fn create_filesystem(
764        &self,
765        locked: &mut Locked<Unlocked>,
766        fs_type: &FsStr,
767        options: FileSystemOptions,
768    ) -> Result<FileSystemHandle, Errno> {
769        // Please register new file systems via //src/starnix/modules/lib.rs, even if the file
770        // system is implemented inside starnix_core.
771        //
772        // Most file systems should be implemented as modules. The VFS provides various traits that
773        // let starnix_core integrate file systems without needing to depend on the file systems
774        // directly.
775        self.kernel()
776            .expando
777            .get::<FsRegistry>()
778            .create(locked, self, fs_type, options)
779            .ok_or_else(|| errno!(ENODEV, fs_type))?
780    }
781}
782
/// Content generator for `ProcMountsFile`.
///
/// Holds a weak reference to the task whose mounts are listed; generation fails if the task
/// can no longer be upgraded.
struct ProcMountsFileSource(Weak<Task>);
784
785impl DynamicFileSource for ProcMountsFileSource {
786    fn generate(
787        &self,
788        _current_task: &CurrentTask,
789        sink: &mut DynamicFileBuf,
790    ) -> Result<(), Errno> {
791        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
792        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
793        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
794        // extra work to maintain it.
795        let task = Task::from_weak(&self.0)?;
796        let task_fs = task.running_state()?.fs.read();
797        let root = task_fs.root();
798        let ns = task_fs.namespace();
799        for_each_mount(&ns.root_mount, &mut |mount| {
800            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
801            if !mountpoint.is_descendant_of(&root) {
802                return Ok(());
803            }
804            write!(
805                sink,
806                "{} {} {} {}{}",
807                mount.fs.options.source_for_display(),
808                mountpoint.path(&task_fs),
809                mount.fs.name(),
810                // Report the union of the FileSystem and Mount flags, as well as any FileSystem-
811                // or LSM-specific options.
812                mount.flags(),
813                security::sb_show_options(&task.kernel(), &mount.fs)?,
814            )?;
815            writeln!(sink, " 0 0")?;
816            Ok(())
817        })?;
818        Ok(())
819    }
820}
821
/// File object whose contents are produced by `ProcMountsFileSource`.
pub struct ProcMountsFile {
    /// Backing dynamic file; read, write and seek are delegated to it by the `FileOps` impl.
    dynamic_file: DynamicFile<ProcMountsFileSource>,
}

impl ProcMountsFile {
    /// Returns node ops that create a new `ProcMountsFile` for `task` each time the node's
    /// factory closure is invoked.
    pub fn new_node(task: Weak<Task>) -> impl FsNodeOps {
        SimpleFileNode::new(move |_, _| {
            Ok(Self { dynamic_file: DynamicFile::new(ProcMountsFileSource(task.clone())) })
        })
    }
}
833
impl FileOps for ProcMountsFile {
    // Reads, writes and seeks are forwarded to the wrapped dynamic file.
    fileops_impl_delegate_read_write_and_seek!(self, self.dynamic_file);
    fileops_impl_noop_sync!();

    /// Registers a placeholder wait on `waiter` and returns its canceler.
    ///
    /// The requested events and handler are ignored.
    fn wait_async(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        waiter: &Waiter,
        _events: FdEvents,
        _handler: EventHandler,
    ) -> Option<WaitCanceler> {
        // Polling this file gives notifications when any change to mounts occurs. This is not
        // implemented yet, but stubbed for Android init.
        Some(waiter.fake_wait())
    }

    /// Always reports an empty event set, consistent with mount-change notification not being
    /// implemented (see `wait_async`).
    fn query_events(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
    ) -> Result<FdEvents, Errno> {
        Ok(FdEvents::empty())
    }
}
861
/// Dynamic file source that lists a task's mounts in the mountinfo line format (see the
/// `DynamicFileSource` impl for the exact fields).
///
/// Holds a weak reference to the task whose mounts are listed.
#[derive(Clone)]
pub struct ProcMountinfoFile(Weak<Task>);
impl ProcMountinfoFile {
    /// Returns node ops for a dynamic file backed by a `ProcMountinfoFile` for `task`.
    pub fn new_node(task: Weak<Task>) -> impl FsNodeOps {
        DynamicFile::new_node(Self(task))
    }
}
869impl DynamicFileSource for ProcMountinfoFile {
870    fn generate(
871        &self,
872        _current_task: &CurrentTask,
873        sink: &mut DynamicFileBuf,
874    ) -> Result<(), Errno> {
875        // Returns path to the `dir` from the root of the file system.
876        fn path_from_fs_root(dir: &DirEntryHandle) -> FsString {
877            let mut path = PathBuilder::new();
878            if dir.is_dead() {
879                // Return `/foo/dir//deleted` if the dir was deleted.
880                path.prepend_element("/deleted".into());
881            }
882            let scope = RcuReadScope::new();
883            let mut current = dir.deref();
884            while let Some(parent) = current.parent_ref(&scope) {
885                path.prepend_element(current.local_name(&scope));
886                current = parent;
887            }
888            path.build_absolute()
889        }
890
891        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
892        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
893        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
894        // extra work to maintain it.
895        let task = Task::from_weak(&self.0)?;
896        let task_fs = task.running_state()?.fs.read();
897        let root = task_fs.root();
898        let ns = task_fs.namespace();
899        for_each_mount(&ns.root_mount, &mut |mount| {
900            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
901            if !mountpoint.is_descendant_of(&root) {
902                return Ok(());
903            }
904            // Can't fail, mountpoint() and root() can't return a NamespaceNode with no mount
905            let parent = mountpoint.mount.as_ref().unwrap();
906            write!(
907                sink,
908                "{} {} {} {} {} {}",
909                mount.id,
910                parent.id,
911                mount.root.node.fs().dev_id,
912                path_from_fs_root(&mount.root),
913                mountpoint.path(&task_fs),
914                mount.mount_flags(),
915            )?;
916            if let Some(peer_group) = mount.read().peer_group() {
917                write!(sink, " shared:{}", peer_group.id)?;
918            }
919            if let Some(upstream) = mount.read().upstream() {
920                write!(sink, " master:{}", upstream.id)?;
921            }
922            writeln!(
923                sink,
924                " - {} {} {}{}",
925                mount.fs.name(),
926                mount.fs.options.source_for_display(),
927                mount.fs_flags(),
928                // LSM options are associated with the FileSystem rather than the Mount.
929                security::sb_show_options(&task.kernel(), &mount.fs)?
930            )?;
931            Ok(())
932        })?;
933        Ok(())
934    }
935}
936
937fn for_each_mount<E>(
938    mount: &MountHandle,
939    callback: &mut impl FnMut(&MountHandle) -> Result<(), E>,
940) -> Result<(), E> {
941    callback(mount)?;
942    // Collect list first to avoid self deadlock when ProcMountinfoFile::read_at tries to call
943    // NamespaceNode::path()
944    let submounts: Vec<_> = mount.read().submounts.iter().map(|s| s.mount.clone()).collect();
945    for submount in submounts {
946        for_each_mount(&submount, callback)?;
947    }
948    Ok(())
949}
950
/// The `SymlinkMode` enum encodes how a symlink in the final component of a path is treated
/// during path traversal.
#[derive(Default, PartialEq, Eq, Copy, Clone, Debug)]
pub enum SymlinkMode {
    /// Follow a symlink at the end of a path resolution. This is the default mode.
    #[default]
    Follow,

    /// Do not follow a symlink at the end of a path resolution.
    NoFollow,
}
961
/// The maximum number of symlink traversals that can be made during path resolution.
///
/// This is the initial value of `LookupContext::remaining_follows`.
pub const MAX_SYMLINK_FOLLOWS: u8 = 40;
964
/// The context passed during namespace lookups.
///
/// Namespace lookups need to mutate a shared context in order to correctly
/// count the number of remaining symlink traversals.
pub struct LookupContext {
    /// The SymlinkMode for the lookup.
    ///
    /// Controls whether a symlink at the end of the path is followed.
    pub symlink_mode: SymlinkMode,

    /// The number of symlinks remaining to follow.
    ///
    /// Each time path resolution calls readlink, this value is decremented. Once it reaches
    /// zero, attempting to follow another symlink fails with `ELOOP`.
    pub remaining_follows: u8,

    /// Whether the result of the lookup must be a directory.
    ///
    /// For example, if the path ends with a `/` or if userspace passes
    /// O_DIRECTORY. This flag can be set to true if the lookup encounters a
    /// symlink that ends with a `/`.
    pub must_be_directory: bool,

    /// Resolve flags passed to `openat2`. Empty if the lookup originated in any other syscall.
    pub resolve_flags: ResolveFlags,

    /// Base directory for the lookup. Set only when either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT`
    /// is passed to `openat2`.
    pub resolve_base: ResolveBase,
}
995
/// Used to specify base directory in `LookupContext` for lookups originating in the `openat2`
/// syscall with either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT` flag.
#[derive(Clone, Eq, PartialEq)]
pub enum ResolveBase {
    /// No base directory restriction applies; the task's own root is used.
    None,

    /// The lookup is not allowed to traverse any node that's not beneath the specified node.
    Beneath(NamespaceNode),

    /// The lookup should be handled as if the specified node is the file-system root.
    InRoot(NamespaceNode),
}
1008
1009impl LookupContext {
1010    pub fn new(symlink_mode: SymlinkMode) -> LookupContext {
1011        LookupContext {
1012            symlink_mode,
1013            remaining_follows: MAX_SYMLINK_FOLLOWS,
1014            must_be_directory: false,
1015            resolve_flags: ResolveFlags::empty(),
1016            resolve_base: ResolveBase::None,
1017        }
1018    }
1019
1020    pub fn with(&self, symlink_mode: SymlinkMode) -> LookupContext {
1021        LookupContext { symlink_mode, resolve_base: self.resolve_base.clone(), ..*self }
1022    }
1023
1024    pub fn update_for_path(&mut self, path: &FsStr) {
1025        if path.last() == Some(&b'/') {
1026            // The last path element must resolve to a directory. This is because a trailing slash
1027            // was found in the path.
1028            self.must_be_directory = true;
1029            // If the last path element is a symlink, we should follow it.
1030            // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
1031            self.symlink_mode = SymlinkMode::Follow;
1032        }
1033    }
1034}
1035
1036impl Default for LookupContext {
1037    fn default() -> Self {
1038        LookupContext::new(SymlinkMode::Follow)
1039    }
1040}
1041
1042/// Whether the path is reachable from the given root.
1043pub enum PathWithReachability {
1044    /// The path is reachable from the given root.
1045    Reachable(FsString),
1046
1047    /// The path is not reachable from the given root.
1048    Unreachable(FsString),
1049}
1050
1051impl PathWithReachability {
1052    pub fn into_path(self) -> FsString {
1053        match self {
1054            PathWithReachability::Reachable(path) => path,
1055            PathWithReachability::Unreachable(path) => path,
1056        }
1057    }
1058}
1059
/// A node in a mount namespace.
///
/// This tree is a composite of the mount tree and the FsNode tree.
///
/// These nodes are used when traversing paths in a namespace in order to
/// present the client the directory structure that includes the mounted
/// filesystems.
#[derive(Clone)]
pub struct NamespaceNode {
    /// The mount where this namespace node is mounted.
    ///
    /// A given FsNode can be mounted in multiple places in a namespace. This
    /// field distinguishes between them. It holds no mount for anonymous nodes
    /// (see `new_anonymous`).
    pub mount: MountInfo,

    /// The directory entry for this namespace entry; the underlying FsNode is
    /// reachable through it.
    pub entry: DirEntryHandle,
}
1078
1079impl NamespaceNode {
    /// Create a namespace node for `entry` as seen through `mount`.
    pub fn new(mount: MountHandle, entry: DirEntryHandle) -> Self {
        Self { mount: Some(mount).into(), entry }
    }
1083
    /// Create a namespace node that is not mounted in a namespace.
    ///
    /// The resulting node's `mount` holds no mount.
    pub fn new_anonymous(entry: DirEntryHandle) -> Self {
        Self { mount: None.into(), entry }
    }
1088
    /// Create a namespace node that is not mounted in a namespace and that refers to a node that
    /// is not rooted in a hierarchy and has no name.
    ///
    /// A fresh unrooted `DirEntry` is created for `node` and passed to the security module's
    /// dentry-based initialization hook before being wrapped in an anonymous namespace node.
    pub fn new_anonymous_unrooted(current_task: &CurrentTask, node: FsNodeHandle) -> Self {
        let dir_entry = DirEntry::new_unrooted(node);
        // NOTE(review): the result of the security hook is deliberately discarded here, so a
        // failure to initialize the node's security state is not reported to the caller.
        let _ = security::fs_node_init_with_dentry_no_xattr(current_task, &dir_entry);
        Self::new_anonymous(dir_entry)
    }
1096
    /// Create a FileObject corresponding to this namespace node.
    ///
    /// This function is the primary way of instantiating FileObjects. Each
    /// FileObject records the NamespaceNode that created it in order to
    /// remember its path in the Namespace.
    ///
    /// The underlying FsNode is opened first with `flags` and `access_check`; any error from
    /// that step, or from constructing the `FileObject`, is returned.
    pub fn open(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        flags: OpenFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        let ops = self.entry.node.open(locked, current_task, self, flags, access_check)?;
        // The FileObject keeps its own clone of this namespace node.
        FileObject::new(locked, current_task, ops, self.clone(), flags)
    }
1112
1113    /// Create or open a node in the file system.
1114    ///
1115    /// Works for any type of node other than a symlink.
1116    ///
1117    /// Will return an existing node unless `flags` contains `OpenFlags::EXCL`.
1118    pub fn open_create_node<L>(
1119        &self,
1120        locked: &mut Locked<L>,
1121        current_task: &CurrentTask,
1122        name: &FsStr,
1123        mode: FileMode,
1124        dev: DeviceId,
1125        flags: OpenFlags,
1126    ) -> Result<NamespaceNode, Errno>
1127    where
1128        L: LockEqualOrBefore<FileOpsCore>,
1129    {
1130        let owner = current_task.current_fscred();
1131        let mode = current_task.fs().apply_umask(mode);
1132        let create_fn =
1133            |locked: &mut Locked<L>, dir: &FsNodeHandle, mount: &MountInfo, name: &_| {
1134                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
1135            };
1136        let entry = if flags.contains(OpenFlags::EXCL) {
1137            self.entry.create_entry(locked, current_task, &self.mount, name, create_fn)
1138        } else {
1139            self.entry.get_or_create_entry(locked, current_task, &self.mount, name, create_fn)
1140        }?;
1141        Ok(self.with_new_entry(entry))
1142    }
1143
    /// Consumes this node and wraps it in an `ActiveNamespaceNode`.
    pub fn into_active(self) -> ActiveNamespaceNode {
        ActiveNamespaceNode::new(self)
    }
1147
    /// Consumes this node, converting it to an `ActiveNamespaceNode` and then into a
    /// `FileMapping` using the optional write-guard `mode`.
    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
        self.into_active().into_mapping(mode)
    }
1151
    /// Create a node in the file system.
    ///
    /// Works for any type of node other than a symlink.
    ///
    /// Does not return an existing node.
    ///
    /// The requested `mode` is filtered through the task's umask and the node is owned by the
    /// task's current fs credentials. On success, returns the namespace node for the new entry
    /// in the same mount as `self`.
    pub fn create_node<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        mode: FileMode,
        dev: DeviceId,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let mode = current_task.fs().apply_umask(mode);
        let entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
            },
        )?;
        Ok(self.with_new_entry(entry))
    }
1181
    /// Create a symlink named `name` in this directory pointing at `target`.
    ///
    /// To create another type of node, use `create_node`.
    ///
    /// The symlink is owned by the task's current fs credentials. On success, returns the
    /// namespace node for the new entry in the same mount as `self`.
    pub fn create_symlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        target: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                dir.create_symlink(locked, current_task, mount, name, target, owner)
            },
        )?;
        Ok(self.with_new_entry(entry))
    }
1207
1208    /// Creates an anonymous file.
1209    ///
1210    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
1211    ///
1212    /// Used by O_TMPFILE.
1213    pub fn create_tmpfile<L>(
1214        &self,
1215        locked: &mut Locked<L>,
1216        current_task: &CurrentTask,
1217        mode: FileMode,
1218        flags: OpenFlags,
1219    ) -> Result<NamespaceNode, Errno>
1220    where
1221        L: LockEqualOrBefore<FileOpsCore>,
1222    {
1223        let owner = current_task.current_fscred();
1224        let mode = current_task.fs().apply_umask(mode);
1225        Ok(self.with_new_entry(self.entry.create_tmpfile(
1226            locked,
1227            current_task,
1228            &self.mount,
1229            mode,
1230            owner,
1231            flags,
1232        )?))
1233    }
1234
    /// Creates a new entry named `name` in this directory that links to the existing node
    /// `child`.
    ///
    /// Goes through `create_entry`, the same path used by `create_node`. On success, returns the
    /// namespace node for the new entry in the same mount as `self`.
    pub fn link<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        child: &FsNodeHandle,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let dir_entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| dir.link(locked, current_task, mount, name, child),
        )?;
        Ok(self.with_new_entry(dir_entry))
    }
1254
1255    pub fn bind_socket<L>(
1256        &self,
1257        locked: &mut Locked<L>,
1258        current_task: &CurrentTask,
1259        name: &FsStr,
1260        socket: SocketHandle,
1261        socket_address: SocketAddress,
1262        mode: FileMode,
1263    ) -> Result<NamespaceNode, Errno>
1264    where
1265        L: LockEqualOrBefore<FileOpsCore>,
1266    {
1267        let dir_entry = self.entry.create_entry(
1268            locked,
1269            current_task,
1270            &self.mount,
1271            name,
1272            |locked, dir, mount, name| {
1273                let node = dir.create_node(
1274                    locked,
1275                    current_task,
1276                    mount,
1277                    name,
1278                    mode,
1279                    DeviceId::NONE,
1280                    current_task.current_fscred(),
1281                )?;
1282                if let Some(unix_socket) = socket.downcast_socket::<UnixSocket>() {
1283                    unix_socket.bind_socket_to_node(&socket, socket_address, &node)?;
1284                } else {
1285                    return error!(ENOTSUP);
1286                }
1287                Ok(node)
1288            },
1289        )?;
1290        Ok(self.with_new_entry(dir_entry))
1291    }
1292
1293    pub fn unlink<L>(
1294        &self,
1295        locked: &mut Locked<L>,
1296        current_task: &CurrentTask,
1297        name: &FsStr,
1298        kind: UnlinkKind,
1299        must_be_directory: bool,
1300    ) -> Result<(), Errno>
1301    where
1302        L: LockEqualOrBefore<FileOpsCore>,
1303    {
1304        if DirEntry::is_reserved_name(name) {
1305            match kind {
1306                UnlinkKind::Directory => {
1307                    if name == ".." {
1308                        error!(ENOTEMPTY)
1309                    } else if self.parent().is_none() {
1310                        // The client is attempting to remove the root.
1311                        error!(EBUSY)
1312                    } else {
1313                        error!(EINVAL)
1314                    }
1315                }
1316                UnlinkKind::NonDirectory => error!(ENOTDIR),
1317            }
1318        } else {
1319            self.entry.unlink(locked, current_task, &self.mount, name, kind, must_be_directory)
1320        }
1321    }
1322
1323    // Resolve the current node.
1324    //
1325    // Depending on context, this will resolve symlink and mount point.
1326    fn resolve<L>(
1327        self,
1328        locked: &mut Locked<L>,
1329        current_task: &CurrentTask,
1330        context: &mut LookupContext,
1331    ) -> Result<NamespaceNode, Errno>
1332    where
1333        L: LockEqualOrBefore<FileOpsCore>,
1334    {
1335        let mut node = self;
1336
1337        loop {
1338            if !node.entry.node.is_lnk() || context.symlink_mode == SymlinkMode::NoFollow {
1339                break;
1340            }
1341            if context.remaining_follows == 0
1342                || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
1343            {
1344                return error!(ELOOP);
1345            }
1346            context.remaining_follows -= 1;
1347            node = match node.readlink(locked, current_task)? {
1348                SymlinkTarget::Path(link_target) => {
1349                    let link_directory = if link_target[0] == b'/' {
1350                        // If the path is absolute, we'll resolve the root directory.
1351                        match &context.resolve_base {
1352                            ResolveBase::None => current_task.fs().root(),
1353                            ResolveBase::Beneath(_) => return error!(EXDEV),
1354                            ResolveBase::InRoot(root) => root.clone(),
1355                        }
1356                    } else {
1357                        // If the path is not absolute, it's a relative directory.
1358                        // Let's try to get the parent of the current node, or in the case that
1359                        // the node is the root we can just use that directly.
1360                        node.parent().unwrap_or(node)
1361                    };
1362                    current_task.lookup_path(
1363                        locked,
1364                        context,
1365                        link_directory,
1366                        link_target.as_ref(),
1367                    )?
1368                }
1369                SymlinkTarget::Node(node) => {
1370                    if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS) {
1371                        return error!(ELOOP);
1372                    }
1373                    node
1374                }
1375            };
1376        }
1377        Ok(node.enter_mount())
1378    }
1379
    /// Traverse down a parent-to-child link in the namespace.
    ///
    /// Convenience wrapper around `lookup_children` for a single path component.
    pub fn lookup_child<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        context: &mut LookupContext,
        basename: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.lookup_children(locked, current_task, context, &[basename])
    }
1393
    /// Traverse down a sequence of parent-to-child links in the namespace.
    ///
    /// Starting from `self`, each element of `basenames` is looked up in turn, resolving
    /// symlinks and mount points along the way (see `resolve`). Empty components and `.` are
    /// skipped; `..` moves to the parent without leaving the effective root.
    ///
    /// Errors:
    /// - `ENAMETOOLONG` if any component is longer than `NAME_MAX` (checked up front);
    /// - `ENOTDIR` if a component has to be looked up in a non-directory, or if
    ///   `context.must_be_directory` is set and a visited node is not a directory;
    /// - `EXDEV` if `..` would leave a `ResolveBase::Beneath` base, or if `RESOLVE_NO_XDEV` is
    ///   set and the walk lands on a mount other than `self.mount`;
    /// - any error from the underlying component lookup or from `resolve`.
    pub fn lookup_children<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        context: &mut LookupContext,
        mut basenames: &[&FsStr],
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        // Validate every component before doing any lookup.
        for name in basenames {
            if name.len() > NAME_MAX as usize {
                return error!(ENAMETOOLONG);
            }
        }

        let mut current_namespace_node = self.clone();

        // `basenames` always holds the components that remain to be processed.
        while basenames.len() > 0 {
            if !current_namespace_node.entry.node.is_dir() {
                return error!(ENOTDIR);
            }

            let basename = basenames[0];
            if basename.is_empty() || basename == "." {
                basenames = &basenames[1..];
                continue;
            }
            if basename == ".." {
                // Determine the node that `..` must not climb above.
                let root = match &context.resolve_base {
                    ResolveBase::None => current_task.fs().root(),
                    ResolveBase::Beneath(node) => {
                        // Do not allow traversal out of the 'node'.
                        if current_namespace_node == *node {
                            return error!(EXDEV);
                        }
                        current_task.fs().root()
                    }
                    ResolveBase::InRoot(root) => root.clone(),
                };

                // Make sure this can't escape a chroot.
                if current_namespace_node != root {
                    current_namespace_node =
                        current_namespace_node.parent().unwrap_or(current_namespace_node)
                }
                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
                    && current_namespace_node.mount != self.mount
                {
                    return error!(EXDEV);
                }

                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
                    return error!(ENOTDIR);
                }
                basenames = &basenames[1..];
                continue;
            }
            // Single-component path: used for the last component, or whenever the node's ops do
            // not report support for pipelined lookup.
            if basenames.len() == 1
                || !current_namespace_node.entry.node.ops().has_lookup_pipelined()
            {
                current_namespace_node = current_namespace_node.with_new_entry(
                    current_namespace_node.entry.component_lookup(
                        locked,
                        current_task,
                        &current_namespace_node.mount,
                        basename,
                    )?,
                );

                current_namespace_node =
                    current_namespace_node.resolve(locked, current_task, context)?;

                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
                    && current_namespace_node.mount != self.mount
                {
                    return error!(EXDEV);
                }

                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
                    return error!(ENOTDIR);
                }

                basenames = &basenames[1..];
                continue;
            }

            // Pipelined path: hand over the run of ordinary names up to (but excluding) the next
            // empty, `.` or `..` component, which the loop above handles specially.
            let pipelined_basenames = if let Some(pos) =
                basenames.iter().position(|&name| name.is_empty() || name == "." || name == "..")
            {
                &basenames[..pos]
            } else {
                basenames
            };
            let precomputed_entries = current_namespace_node.entry.get_children_pipelined(
                locked,
                current_task,
                &current_namespace_node.mount,
                pipelined_basenames,
            );
            for entry in precomputed_entries {
                // Each yielded entry consumes one component, even if it turns out to be an error.
                basenames = &basenames[1..];
                let child = current_namespace_node.with_new_entry(entry?);

                current_namespace_node = child.clone().resolve(locked, current_task, context)?;

                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
                    && current_namespace_node.mount != self.mount
                {
                    return error!(EXDEV);
                }

                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
                    return error!(ENOTDIR);
                }

                // `resolve` moved us somewhere else (a symlink was followed or a mount was
                // entered), so the remaining precomputed entries are abandoned and the outer
                // loop continues from the new node.
                if current_namespace_node != child {
                    break;
                }
            }
        }

        Ok(current_namespace_node)
    }
1519
1520    /// Traverse up a child-to-parent link in the namespace.
1521    ///
1522    /// This traversal matches the child-to-parent link in the underlying
1523    /// FsNode except at mountpoints, where the link switches from one
1524    /// filesystem to another.
1525    pub fn parent(&self) -> Option<NamespaceNode> {
1526        let mountpoint_or_self = self.escape_mount();
1527        let parent = mountpoint_or_self.entry.parent()?;
1528        Some(mountpoint_or_self.with_new_entry(parent))
1529    }
1530
1531    /// Returns the parent, but does not escape mounts i.e. returns None if this node
1532    /// is the root of a mount.
1533    pub fn parent_within_mount(&self) -> Option<DirEntryHandle> {
1534        if let Ok(_) = self.mount_if_root() {
1535            return None;
1536        }
1537        self.entry.parent()
1538    }
1539
1540    /// Whether this namespace node is a descendant of the given node.
1541    ///
1542    /// Walks up the namespace node tree looking for ancestor. If ancestor is
1543    /// found, returns true. Otherwise, returns false.
1544    pub fn is_descendant_of(&self, ancestor: &NamespaceNode) -> bool {
1545        let ancestor = ancestor.escape_mount();
1546        let mut current = self.escape_mount();
1547        while current != ancestor {
1548            if let Some(parent) = current.parent() {
1549                current = parent.escape_mount();
1550            } else {
1551                return false;
1552            }
1553        }
1554        true
1555    }
1556
1557    /// If this is a mount point, return the root of the mount. Otherwise return self.
1558    fn enter_mount(&self) -> NamespaceNode {
1559        // While the child is a mountpoint, replace child with the mount's root.
1560        fn enter_one_mount(node: &NamespaceNode) -> Option<NamespaceNode> {
1561            if let Some(mount) = node.mount.deref() {
1562                if let Some(submount) =
1563                    mount.state.read().submounts.get(ArcKey::ref_cast(&node.entry))
1564                {
1565                    return Some(submount.mount.root());
1566                }
1567            }
1568            None
1569        }
1570        let mut inner = self.clone();
1571        while let Some(inner_root) = enter_one_mount(&inner) {
1572            inner = inner_root;
1573        }
1574        inner
1575    }
1576
1577    /// If this is the root of a mount, return the mount point. Otherwise return self.
1578    ///
1579    /// This is not exactly the same as parent(). If parent() is called on a root, it will escape
1580    /// the mount, but then return the parent of the mount point instead of the mount point.
1581    fn escape_mount(&self) -> NamespaceNode {
1582        let mut mountpoint_or_self = self.clone();
1583        while let Some(mountpoint) = mountpoint_or_self.mountpoint() {
1584            mountpoint_or_self = mountpoint;
1585        }
1586        mountpoint_or_self
1587    }
1588
1589    /// If this node is the root of a mount, return it. Otherwise EINVAL.
1590    pub fn mount_if_root(&self) -> Result<&MountHandle, Errno> {
1591        if let Some(mount) = self.mount.deref() {
1592            if Arc::ptr_eq(&self.entry, &mount.root) {
1593                return Ok(mount);
1594            }
1595        }
1596        error!(EINVAL)
1597    }
1598
1599    /// Returns the mountpoint at this location in the namespace.
1600    ///
1601    /// If this node is mounted in another node, this function returns the node
1602    /// at which this node is mounted. Otherwise, returns None.
1603    fn mountpoint(&self) -> Option<NamespaceNode> {
1604        self.mount_if_root().ok()?.read().mountpoint()
1605    }
1606
1607    /// The path from the filesystem root to this node.
1608    pub fn path(&self, fs: &FsContext) -> FsString {
1609        self.path_from_root(Some(&fs.root())).into_path()
1610    }
1611
1612    /// The path from the root of the namespace to this node.
1613    pub fn path_escaping_chroot(&self) -> FsString {
1614        self.path_from_root(None).into_path()
1615    }
1616
1617    /// Returns the path to this node, accounting for a custom root.
1618    /// A task may have a custom root set by `chroot`.
1619    pub fn path_from_root(&self, root: Option<&NamespaceNode>) -> PathWithReachability {
1620        if self.mount.is_none() {
1621            return self.unrooted_path();
1622        }
1623
1624        let mut path = PathBuilder::new();
1625        let mut current = self.escape_mount();
1626        if let Some(root) = root {
1627            let scope = RcuReadScope::new();
1628            // The current node is expected to intersect with the custom root as we travel up the tree.
1629            let root = root.escape_mount();
1630            while current != root {
1631                if let Some(parent) = current.parent() {
1632                    path.prepend_element(current.entry.local_name(&scope));
1633                    current = parent.escape_mount();
1634                } else {
1635                    // This node hasn't intersected with the custom root and has reached the namespace root.
1636                    let mut absolute_path = path.build_absolute();
1637                    if self.entry.is_dead() {
1638                        absolute_path.extend_from_slice(b" (deleted)");
1639                    }
1640
1641                    return PathWithReachability::Unreachable(absolute_path);
1642                }
1643            }
1644        } else {
1645            // No custom root, so travel up the tree to the namespace root.
1646            let scope = RcuReadScope::new();
1647            while let Some(parent) = current.parent() {
1648                path.prepend_element(current.entry.local_name(&scope));
1649                current = parent.escape_mount();
1650            }
1651        }
1652
1653        let mut absolute_path = path.build_absolute();
1654        if self.entry.is_dead() {
1655            absolute_path.extend_from_slice(b" (deleted)");
1656        }
1657
1658        PathWithReachability::Reachable(absolute_path)
1659    }
1660
1661    fn unrooted_path(&self) -> PathWithReachability {
1662        let scope = RcuReadScope::new();
1663        let mode = self.entry.node.info().mode;
1664        let local_name = self.entry.local_name(&scope);
1665        let path = if !local_name.is_empty() {
1666            format!("anon_inode:{}", local_name)
1667        } else if mode.is_sock() {
1668            format!("socket:[{}]", self.entry.node.ino)
1669        } else if mode.is_fifo() {
1670            format!("pipe:[{}]", self.entry.node.ino)
1671        } else {
1672            format!("file:[{}]", self.entry.node.ino)
1673        };
1674        PathWithReachability::Reachable(path.into())
1675    }
1676
1677    pub fn mount(&self, what: WhatToMount, flags: MountpointFlags) -> Result<(), Errno> {
1678        let mountpoint = self.enter_mount();
1679        let mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
1680        mount.create_submount(&mountpoint.entry, WhatSubmount::New(what, flags));
1681        Ok(())
1682    }
1683
1684    /// If this is the root of a filesystem, unmount. Otherwise return EINVAL.
1685    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
1686        let mount = self.enter_mount().mount_if_root()?.clone();
1687        mount.unmount(flags)
1688    }
1689
    /// Renames `old_name` under `old_parent` to `new_name` under `new_parent`.
    ///
    /// This is a thin forwarder: it hands the directory entry and mount of each parent,
    /// together with the names and `flags`, to `DirEntry::rename` and returns its result
    /// unchanged.
    pub fn rename<L>(
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        old_parent: &NamespaceNode,
        old_name: &FsStr,
        new_parent: &NamespaceNode,
        new_name: &FsStr,
        flags: RenameFlags,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        DirEntry::rename(
            locked,
            current_task,
            &old_parent.entry,
            &old_parent.mount,
            old_name,
            &new_parent.entry,
            &new_parent.mount,
            new_name,
            flags,
        )
    }
1714
1715    fn with_new_entry(&self, entry: DirEntryHandle) -> NamespaceNode {
1716        Self { mount: self.mount.clone(), entry }
1717    }
1718
    /// Views this node's directory entry as an `ArcKey`, the same key form `enter_mount` uses
    /// to look up a mount's submounts.
    fn mount_hash_key(&self) -> &ArcKey<DirEntry> {
        ArcKey::ref_cast(&self.entry)
    }
1722
1723    pub fn apply_suid_and_sgid(&self, creds: &mut Credentials) {
1724        // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
1725        //
1726        //   The aforementioned transformations of the effective IDs are not
1727        //   performed ... if ... the underlying filesystem is mounted nosuid
1728        //   (the MS_NOSUID flag for mount(2)).
1729        if self.mount.flags().contains(MountFlags::NOSUID) {
1730            return;
1731        }
1732        self.entry.node.info().apply_suid_and_sgid(creds)
1733    }
1734
1735    pub fn update_atime(&self) {
1736        // Do not update the atime of this node if it is mounted with the NOATIME flag.
1737        if !self.mount.flags().contains(MountFlags::NOATIME) {
1738            self.entry.node.update_info(|info| {
1739                let now = utc::utc_now();
1740                info.time_access = now;
1741                info.pending_time_access_update = true;
1742            });
1743        }
1744    }
1745
    /// Reads the symlink target of this node.
    ///
    /// Calls `update_atime` first (which does nothing on `NOATIME` mounts) and then delegates
    /// to `readlink` on the underlying node, returning its result unchanged.
    pub fn readlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<SymlinkTarget, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.update_atime();
        self.entry.node.readlink(locked, current_task)
    }
1757
1758    pub fn notify(&self, event_mask: InotifyMask) {
1759        if self.mount.is_some() {
1760            self.entry.notify(event_mask);
1761        }
1762    }
1763
    /// Check whether the node can be accessed in the current context with the specified access
    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
    /// owner or is in the file's group.
    ///
    /// Delegates to `check_access` on the underlying node, additionally passing this node's
    /// mount, the `reason` for the check, and this `NamespaceNode` itself.
    pub fn check_access<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        permission_flags: impl Into<security::PermissionFlags>,
        reason: CheckAccessReason,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.entry.node.check_access(
            locked,
            current_task,
            &self.mount,
            permission_flags,
            reason,
            self,
        )
    }
1786
    /// Checks whether `O_NOATIME` is allowed for `current_task` on this node.
    ///
    /// Delegates to `check_o_noatime_allowed` on the underlying node and returns its result.
    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
        self.entry.node.check_o_noatime_allowed(current_task)
    }
1791
1792    pub fn truncate<L>(
1793        &self,
1794        locked: &mut Locked<L>,
1795        current_task: &CurrentTask,
1796        length: u64,
1797    ) -> Result<(), Errno>
1798    where
1799        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1800    {
1801        self.entry.node.truncate(locked, current_task, &self.mount, length)?;
1802        self.entry.notify_ignoring_excl_unlink(InotifyMask::MODIFY);
1803        Ok(())
1804    }
1805}
1806
1807impl fmt::Debug for NamespaceNode {
1808    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1809        f.debug_struct("NamespaceNode")
1810            .field("path", &self.path_escaping_chroot())
1811            .field("mount", &self.mount)
1812            .field("entry", &self.entry)
1813            .finish()
1814    }
1815}
1816
1817// Eq/Hash impls intended for the MOUNT_POINTS hash
1818impl PartialEq for NamespaceNode {
1819    fn eq(&self, other: &Self) -> bool {
1820        self.mount.as_ref().map(Arc::as_ptr).eq(&other.mount.as_ref().map(Arc::as_ptr))
1821            && Arc::ptr_eq(&self.entry, &other.entry)
1822    }
1823}
1824impl Eq for NamespaceNode {}
1825impl Hash for NamespaceNode {
1826    fn hash<H: Hasher>(&self, state: &mut H) {
1827        self.mount.as_ref().map(Arc::as_ptr).hash(state);
1828        Arc::as_ptr(&self.entry).hash(state);
1829    }
1830}
1831
/// A namespace node that keeps the underlying mount busy.
#[derive(Debug, Clone)]
pub struct ActiveNamespaceNode {
    /// The underlying namespace node.
    name: NamespaceNode,

    /// Adds a reference to the mount client marker to prevent the mount from
    /// being removed while the NamespaceNode is active. Is None iff mount is
    /// None.
    _marker: Option<MountClientMarker>,
}
1843
1844impl ActiveNamespaceNode {
1845    pub fn new(name: NamespaceNode) -> Self {
1846        let marker = name.mount.as_ref().map(|mount| mount.active_client_counter.clone());
1847        Self { name, _marker: marker }
1848    }
1849
1850    pub fn to_passive(&self) -> NamespaceNode {
1851        self.deref().clone()
1852    }
1853
1854    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
1855        if let Some(mode) = mode {
1856            self.entry.node.write_guard_state.lock().acquire(mode)?;
1857        }
1858        Ok(Arc::new(FileMapping { name: self, mode }))
1859    }
1860}
1861
/// Gives read access to the wrapped `NamespaceNode`, so its fields and methods can be used
/// directly on an `ActiveNamespaceNode`.
impl Deref for ActiveNamespaceNode {
    type Target = NamespaceNode;

    fn deref(&self) -> &Self::Target {
        &self.name
    }
}
1869
1870impl PartialEq for ActiveNamespaceNode {
1871    fn eq(&self, other: &Self) -> bool {
1872        self.deref().eq(other.deref())
1873    }
1874}
1875impl Eq for ActiveNamespaceNode {}
1876impl Hash for ActiveNamespaceNode {
1877    fn hash<H: Hasher>(&self, state: &mut H) {
1878        self.deref().hash(state)
1879    }
1880}
1881
/// An `ActiveNamespaceNode` paired with the write-guard mode, if any, that was acquired on its
/// node when the mapping was created (see `ActiveNamespaceNode::into_mapping`). The guard is
/// released again when the `FileMapping` is dropped.
//
// NOTE(review): `Clone` is derived while `Drop` releases the write guard. Cloning a
// `FileMapping` value directly (as opposed to cloning the `Arc` returned by `into_mapping`)
// looks like it would release the guard once per copy without re-acquiring it -- confirm that
// no caller does this.
#[derive(Debug, Clone, PartialEq, Eq)]
#[must_use]
pub struct FileMapping {
    /// The node this mapping refers to.
    pub name: ActiveNamespaceNode,
    /// The write-guard mode to release on drop; `None` if no guard was acquired.
    mode: Option<FileWriteGuardMode>,
}
1888
1889impl Drop for FileMapping {
1890    fn drop(&mut self) {
1891        if let Some(mode) = self.mode {
1892            self.name.entry.node.write_guard_state.lock().release(mode);
1893        }
1894    }
1895}
1896
/// Tracks all mounts, keyed by mount point.
pub struct Mounts {
    /// Maps a mountpoint's directory entry to the list of mounts registered on it. Entries are
    /// added by `register_mount` and removed by `unregister_mount`, `unmount` and `clear`.
    mounts: RcuHashMap<WeakKey<DirEntry>, Vec<ArcKey<Mount>>>,
}
1901
1902impl Mounts {
1903    pub fn new() -> Self {
1904        Mounts { mounts: RcuHashMap::default() }
1905    }
1906
1907    /// Registers the mount in the global mounts map.
1908    fn register_mount(&self, dir_entry: &Arc<DirEntry>, mount: MountHandle) -> Submount {
1909        let mut mounts = self.mounts.lock();
1910        let key = WeakKey::from(dir_entry);
1911        let mut vec = mounts.get(&key).unwrap_or_else(|| {
1912            dir_entry.set_has_mounts(true);
1913            Vec::new()
1914        });
1915        vec.push(ArcKey(mount.clone()));
1916        mounts.insert(key, vec);
1917        Submount { dir: ArcKey(dir_entry.clone()), mount }
1918    }
1919
1920    /// Unregisters the mount.  This is called by `Submount::drop`.
1921    fn unregister_mount(&self, dir_entry: &Arc<DirEntry>, mount: &MountHandle) {
1922        let mut mounts = self.mounts.lock();
1923        let key = WeakKey::from(dir_entry);
1924        if let Some(mut vec) = mounts.get(&key) {
1925            let index = vec.iter().position(|e| e == ArcKey::ref_cast(mount)).unwrap();
1926            if vec.len() == 1 {
1927                mounts.remove(&key);
1928                dir_entry.set_has_mounts(false);
1929            } else {
1930                vec.swap_remove(index);
1931                mounts.insert(key, vec);
1932            }
1933        }
1934    }
1935
1936    /// Unmounts all mounts associated with `dir_entry`.  This is called when `dir_entry` is
1937    /// unlinked (which would normally result in EBUSY, but not if it isn't mounted in the local
1938    /// namespace).
1939    pub fn unmount(&self, dir_entry: &DirEntry) {
1940        let mounts = self.mounts.lock().remove(&PtrKey::from(dir_entry as *const _));
1941        if let Some(mounts) = mounts {
1942            for mount in mounts {
1943                // Ignore errors.
1944                let _ = mount.unmount(UnmountFlags::DETACH);
1945            }
1946        }
1947    }
1948
1949    /// Drain mounts. For each drained mount, force a FileSystem unmount.
1950    // TODO(https://fxbug.dev/295073633): Graceful shutdown should try to first unmount the mounts
1951    // and only force a FileSystem unmount on failure.
1952    pub fn clear(&self) {
1953        for (_dir_entry, mounts) in self.mounts.lock().drain() {
1954            for mount in mounts {
1955                mount.fs.force_unmount_ops();
1956            }
1957        }
1958    }
1959
1960    pub fn sync_all(
1961        &self,
1962        locked: &mut Locked<Unlocked>,
1963        current_task: &CurrentTask,
1964    ) -> Result<(), Errno> {
1965        let mut filesystems = Vec::new();
1966        {
1967            let scope = RcuReadScope::new();
1968            let mut seen = HashSet::new();
1969            for (_dir_entry, m_list) in self.mounts.iter(&scope) {
1970                for m in m_list {
1971                    if seen.insert(Arc::as_ptr(&m.fs)) {
1972                        filesystems.push(m.fs.clone());
1973                    }
1974                }
1975            }
1976        }
1977
1978        for fs in filesystems {
1979            if let Err(e) = fs.sync(locked, current_task) {
1980                log_warn!("sync failed for filesystem {:?}: {:?}", fs.name(), e);
1981            }
1982        }
1983        Ok(())
1984    }
1985}
1986
/// A RAII object that unregisters a mount when dropped.
#[derive(Debug)]
struct Submount {
    /// The directory entry the mount is attached to (the mountpoint).
    dir: ArcKey<DirEntry>,
    /// The mount attached at `dir`.
    mount: MountHandle,
}
1993
1994impl Drop for Submount {
1995    fn drop(&mut self) {
1996        self.mount.fs.kernel.upgrade().unwrap().mounts.unregister_mount(&self.dir, &self.mount)
1997    }
1998}
1999
/// Submount is stored in a mount's submounts hash set, which is keyed by the mountpoint.
impl Eq for Submount {}
impl PartialEq<Self> for Submount {
    fn eq(&self, other: &Self) -> bool {
        // Only the mountpoint is compared; the `mount` field plays no part in equality.
        self.dir == other.dir
    }
}
// Hashes only the mountpoint, matching the `PartialEq` impl which compares only `dir`.
impl Hash for Submount {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.dir.hash(state)
    }
}
2012
// Exposes the mountpoint key so a set of `Submount`s can be queried with an
// `ArcKey<DirEntry>` alone, as `enter_mount` does via `submounts.get(..)`.
impl Borrow<ArcKey<DirEntry>> for Submount {
    fn borrow(&self) -> &ArcKey<DirEntry> {
        &self.dir
    }
}
2018
2019#[cfg(test)]
2020mod test {
2021    use crate::fs::tmpfs::TmpFs;
2022    use crate::testing::spawn_kernel_and_run;
2023    use crate::vfs::namespace::DeviceId;
2024    use crate::vfs::{
2025        CallbackSymlinkNode, FsNodeInfo, LookupContext, MountInfo, Namespace, NamespaceNode,
2026        RenameFlags, SymlinkMode, SymlinkTarget, UnlinkKind, WhatToMount,
2027    };
2028    use starnix_uapi::mount_flags::MountpointFlags;
2029    use starnix_uapi::{errno, mode};
2030    use std::sync::Arc;
2031
    /// Mounts a second tmpfs on `dev` and checks that `parent()` walks back out of the mount:
    /// the parent of `pts` is `dev`, and the parent of `dev` is the namespace root.
    #[::fuchsia::test]
    async fn test_namespace() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            // Root filesystem with a `dev` directory that will serve as the mountpoint.
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            // Second filesystem, containing `pts`, to be mounted on `dev`.
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
                .expect("failed to mount dev root node");

            // Look `dev` up again now that the mount is in place, then descend into it.
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            let pts_parent =
                pts.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of pts");
            assert!(Arc::ptr_eq(&pts_parent.entry, &dev.entry));

            let dev_parent =
                dev.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of dev");
            assert!(Arc::ptr_eq(&dev_parent.entry, &ns.root().entry));
        })
        .await;
    }
2075
    /// Checks that a node obtained before a mount keeps referring to the covered directory:
    /// a fresh lookup of `dev` yields a different node in which `pts` is visible, while the
    /// old handle still fails to find `pts`.
    #[::fuchsia::test]
    async fn test_mount_does_not_upgrade() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            // Only the filesystem that gets mounted on `dev` contains `pts`.
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            // `dev` is looked up before the mount and kept around afterwards.
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
                .expect("failed to mount dev root node");
            let mut context = LookupContext::default();
            let new_dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev again");
            // The post-mount lookup must produce a different entry and a different node.
            assert!(!Arc::ptr_eq(&dev.entry, &new_dev.entry));
            assert_ne!(&dev, &new_dev);

            let mut context = LookupContext::default();
            let _new_pts = new_dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            let mut context = LookupContext::default();
            // The pre-mount handle must not see the contents of the mounted filesystem.
            assert!(dev.lookup_child(locked, &current_task, &mut context, "pts".into()).is_err());
        })
        .await;
    }
2116
    /// Checks `path_escaping_chroot` for the namespace root, for a mountpoint (`/dev`) and for
    /// a directory inside the mounted filesystem (`/dev/pts`).
    #[::fuchsia::test]
    async fn test_path() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
                .expect("failed to mount dev root node");

            // Re-resolve `dev` after mounting so that the lookups below go through the mount.
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");

            assert_eq!("/", ns.root().path_escaping_chroot());
            assert_eq!("/dev", dev.path_escaping_chroot());
            assert_eq!("/dev/pts", pts.path_escaping_chroot());
        })
        .await;
    }
2157
    /// Checks that a later mount on the same location shadows an earlier one, and that a
    /// namespace cloned before the later mount keeps resolving to the earlier filesystem.
    #[::fuchsia::test]
    async fn test_shadowing() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // First mount: `foo` must now resolve to the root of `foofs1`.
            let foofs1 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs1.clone()), MountpointFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // Snapshot the namespace while only `foofs1` is mounted.
            let ns_clone = ns.clone_namespace();

            // Second mount on the freshly resolved `foo`: it must shadow `foofs1` in `ns`.
            let foofs2 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs2.clone()), MountpointFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs2.root()
            ));

            // The clone was taken before the second mount and must still see `foofs1`.
            assert!(Arc::ptr_eq(
                &ns_clone
                    .root()
                    .lookup_child(
                        locked,
                        &current_task,
                        &mut LookupContext::default(),
                        "foo".into()
                    )
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
        })
        .await;
    }
2211
    /// Checks unlinking of a directory that is a mountpoint in one namespace only: the unlink
    /// fails with `EBUSY` in the namespace holding the mount, succeeds from a second namespace
    /// over the same filesystem, and the directory is then gone from the first one as well.
    #[::fuchsia::test]
    async fn test_unlink_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            // Two independent namespaces over the same root filesystem.
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // The mount is made through a node resolved in ns1.
            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountpointFlags::empty()).unwrap();

            // Trying to unlink from ns1 should fail.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(EBUSY),
            );

            // But unlinking from ns2 should succeed.
            ns2.root()
                .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                .expect("unlink failed");

            // And it should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(ENOENT),
            );
        })
        .await;
    }
2250
    /// Checks renaming to and from a directory that is a mountpoint in one namespace only:
    /// both directions fail with `EBUSY` in the namespace holding the mount, while a second
    /// namespace over the same filesystem may rename the directory and rename over it.
    #[::fuchsia::test]
    async fn test_rename_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            // Two independent namespaces over the same root filesystem.
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let _bar_node = root_fs.root().create_dir(locked, &current_task, "bar".into()).unwrap();
            let _baz_node = root_fs.root().create_dir(locked, &current_task, "baz".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // The mount on `foo` is made through a node resolved in ns1.
            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountpointFlags::empty()).unwrap();

            // Trying to rename over foo from ns1 should fail.
            let root = ns1.root();
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "bar".into(),
                    &root,
                    "foo".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );
            // Likewise the other way.
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "foo".into(),
                    &root,
                    "bar".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );

            // But renaming from ns2 should succeed.
            let root = ns2.root();

            // First rename the directory with the mount.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "foo".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // Renaming over a directory with a mount should also work.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "baz".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // "foo" and "baz" should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "baz".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
        })
        .await;
    }
2341
2342    /// Symlinks which need to be traversed across types (nodes and paths), as well as across
2343    /// owning directories, can be tricky to get right.
2344    #[::fuchsia::test]
2345    async fn test_lookup_with_symlink_chain() {
2346        spawn_kernel_and_run(async |locked, current_task| {
2347            // Set up the root filesystem
2348            let kernel = current_task.kernel();
2349            let root_fs = TmpFs::new_fs(locked, &kernel);
2350            let root_node = Arc::clone(root_fs.root());
2351            let _first_subdir_node = root_node
2352                .create_dir(locked, &current_task, "first_subdir".into())
2353                .expect("failed to mkdir dev");
2354            let _second_subdir_node = root_node
2355                .create_dir(locked, &current_task, "second_subdir".into())
2356                .expect("failed to mkdir dev");
2357
2358            // Set up two subdirectories under the root filesystem
2359            let first_subdir_fs = TmpFs::new_fs(locked, &kernel);
2360            let second_subdir_fs = TmpFs::new_fs(locked, &kernel);
2361
2362            let ns = Namespace::new(root_fs);
2363            let mut context = LookupContext::default();
2364            let first_subdir = ns
2365                .root()
2366                .lookup_child(locked, &current_task, &mut context, "first_subdir".into())
2367                .expect("failed to lookup first_subdir");
2368            first_subdir
2369                .mount(WhatToMount::Fs(first_subdir_fs), MountpointFlags::empty())
2370                .expect("failed to mount first_subdir fs node");
2371            let second_subdir = ns
2372                .root()
2373                .lookup_child(locked, &current_task, &mut context, "second_subdir".into())
2374                .expect("failed to lookup second_subdir");
2375            second_subdir
2376                .mount(WhatToMount::Fs(second_subdir_fs), MountpointFlags::empty())
2377                .expect("failed to mount second_subdir fs node");
2378
2379            // Create the symlink structure. To trigger potential symlink traversal bugs, we're going
2380            // for the following directory structure:
2381            // / (root)
2382            //     + first_subdir/
2383            //         - real_file
2384            //         - path_symlink (-> real_file)
2385            //     + second_subdir/
2386            //         - node_symlink (-> path_symlink)
2387            let real_file_node = first_subdir
2388                .create_node(
2389                    locked,
2390                    &current_task,
2391                    "real_file".into(),
2392                    mode!(IFREG, 0o777),
2393                    DeviceId::NONE,
2394                )
2395                .expect("failed to create real_file");
2396            first_subdir
2397                .create_symlink(locked, &current_task, "path_symlink".into(), "real_file".into())
2398                .expect("failed to create path_symlink");
2399
2400            let mut no_follow_lookup_context = LookupContext::new(SymlinkMode::NoFollow);
2401            let path_symlink_node = first_subdir
2402                .lookup_child(
2403                    locked,
2404                    &current_task,
2405                    &mut no_follow_lookup_context,
2406                    "path_symlink".into(),
2407                )
2408                .expect("Failed to lookup path_symlink");
2409
2410            // The second symlink needs to be of type SymlinkTarget::Node in order to trip the sensitive
2411            // code path. There's no easy method for creating this type of symlink target, so we'll need
2412            // to construct a node from scratch and insert it into the directory manually.
2413            let node_symlink_node = second_subdir.entry.node.fs().create_node_and_allocate_node_id(
2414                CallbackSymlinkNode::new(move || {
2415                    let node = path_symlink_node.clone();
2416                    Ok(SymlinkTarget::Node(node))
2417                }),
2418                FsNodeInfo::new(mode!(IFLNK, 0o777), current_task.current_fscred()),
2419            );
2420            second_subdir
2421                .entry
2422                .create_entry(
2423                    locked,
2424                    &current_task,
2425                    &MountInfo::detached(),
2426                    "node_symlink".into(),
2427                    move |_locked, _dir, _mount, _name| Ok(node_symlink_node),
2428                )
2429                .expect("failed to create node_symlink entry");
2430
2431            // Finally, exercise the lookup under test.
2432            let mut follow_lookup_context = LookupContext::new(SymlinkMode::Follow);
2433            let node_symlink_resolution = second_subdir
2434                .lookup_child(
2435                    locked,
2436                    &current_task,
2437                    &mut follow_lookup_context,
2438                    "node_symlink".into(),
2439                )
2440                .expect("lookup with symlink chain failed");
2441
2442            // The lookup resolution should have correctly followed the symlinks to the real_file node.
2443            assert!(node_symlink_resolution.entry.node.ino == real_file_node.entry.node.ino);
2444        })
2445        .await;
2446    }
2447}