// starnix_core/vfs/namespace.rs
1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mutable_state::{state_accessor, state_implementation};
6use crate::security;
7use crate::task::{CurrentTask, EventHandler, Kernel, Task, WaitCanceler, Waiter};
8use crate::time::utc;
9use crate::vfs::fs_registry::FsRegistry;
10use crate::vfs::pseudo::dynamic_file::{DynamicFile, DynamicFileBuf, DynamicFileSource};
11use crate::vfs::pseudo::simple_file::SimpleFileNode;
12use crate::vfs::socket::{SocketAddress, SocketHandle, UnixSocket};
13use crate::vfs::{
14    CheckAccessReason, DirEntry, DirEntryHandle, FileHandle, FileObject, FileOps, FileSystemHandle,
15    FileSystemOptions, FileWriteGuardMode, FsContext, FsNode, FsNodeHandle, FsNodeOps, FsStr,
16    FsString, PathBuilder, RenameFlags, SymlinkTarget, UnlinkKind, fileops_impl_dataless,
17    fileops_impl_delegate_read_write_and_seek, fileops_impl_nonseekable, fileops_impl_noop_sync,
18    fs_node_impl_not_dir,
19};
20use fuchsia_rcu::RcuReadScope;
21use macro_rules_attribute::apply;
22use ref_cast::RefCast;
23use starnix_logging::log_warn;
24use starnix_rcu::RcuHashMap;
25use starnix_sync::{
26    BeforeFsNodeAppend, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, Mutex, RwLock, Unlocked,
27};
28use starnix_types::ownership::WeakRef;
29use starnix_uapi::arc_key::{ArcKey, PtrKey, WeakKey};
30use starnix_uapi::auth::UserAndOrGroupId;
31use starnix_uapi::device_type::DeviceType;
32use starnix_uapi::errors::Errno;
33use starnix_uapi::file_mode::{AccessCheck, FileMode};
34use starnix_uapi::inotify_mask::InotifyMask;
35use starnix_uapi::mount_flags::MountFlags;
36use starnix_uapi::open_flags::OpenFlags;
37use starnix_uapi::unmount_flags::UnmountFlags;
38use starnix_uapi::vfs::{FdEvents, ResolveFlags};
39use starnix_uapi::{NAME_MAX, errno, error};
40use std::borrow::Borrow;
41use std::collections::HashSet;
42use std::fmt;
43use std::hash::{Hash, Hasher};
44use std::ops::{Deref, DerefMut};
45use std::sync::{Arc, Weak};
46
/// A mount namespace.
///
/// The namespace records at which entries filesystems are mounted.
#[derive(Debug)]
pub struct Namespace {
    /// The mount at the root of this namespace's mount tree.
    root_mount: MountHandle,

    // Unique ID of this namespace.
    pub id: u64,
}
57
impl Namespace {
    /// Creates a namespace whose root mount is `fs`, mounted with empty flags.
    pub fn new(fs: FileSystemHandle) -> Arc<Namespace> {
        Self::new_with_flags(fs, MountFlags::empty())
    }

    /// Creates a namespace whose root mount is `fs`, mounted with `flags`.
    ///
    /// # Panics
    ///
    /// Panics if the filesystem's kernel has already been dropped.
    pub fn new_with_flags(fs: FileSystemHandle, flags: MountFlags) -> Arc<Namespace> {
        let kernel = fs.kernel.upgrade().expect("can't create namespace without a kernel");
        let root_mount = Mount::new(WhatToMount::Fs(fs), flags);
        Arc::new(Self { root_mount, id: kernel.get_next_namespace_id() })
    }

    /// Returns a `NamespaceNode` referring to the root of this namespace.
    pub fn root(&self) -> NamespaceNode {
        self.root_mount.root()
    }

    /// Creates a deep copy of this namespace: the whole mount tree is cloned
    /// recursively, and the copy gets a fresh namespace ID.
    pub fn clone_namespace(&self) -> Arc<Namespace> {
        let kernel =
            self.root_mount.fs.kernel.upgrade().expect("can't clone namespace without a kernel");
        Arc::new(Self {
            root_mount: self.root_mount.clone_mount_recursive(),
            id: kernel.get_next_namespace_id(),
        })
    }

    /// Assuming new_ns is a clone of the namespace that node is from, return the equivalent of
    /// node in new_ns. If this assumption is violated, returns None.
    pub fn translate_node(mut node: NamespaceNode, new_ns: &Namespace) -> Option<NamespaceNode> {
        // Collect the list of mountpoints that leads to this node's mount
        let mut mountpoints = vec![];
        let mut mount = node.mount;
        while let Some(mountpoint) = mount.as_ref().and_then(|m| m.read().mountpoint()) {
            mountpoints.push(mountpoint.entry);
            mount = mountpoint.mount;
        }

        // Follow the same path in the new namespace
        let mut mount = Arc::clone(&new_ns.root_mount);
        for mountpoint in mountpoints.iter().rev() {
            // If the corresponding entry is not a mountpoint in `new_ns`, the "clone"
            // assumption was violated; the `?` bails out with None.
            let next_mount =
                mount.read().submounts.get(ArcKey::ref_cast(mountpoint))?.mount.clone();
            mount = next_mount;
        }
        node.mount = Some(mount).into();
        Some(node)
    }
}
104
impl FsNodeOps for Arc<Namespace> {
    fs_node_impl_not_dir!();

    /// Opening the node yields a `MountNamespaceFile` holding a strong reference
    /// to this namespace, keeping it alive while the file is open.
    fn create_file_ops(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _node: &FsNode,
        _current_task: &CurrentTask,
        _flags: OpenFlags,
    ) -> Result<Box<dyn FileOps>, Errno> {
        Ok(Box::new(MountNamespaceFile(self.clone())))
    }
}
118
119pub struct MountNamespaceFile(pub Arc<Namespace>);
120
impl FileOps for MountNamespaceFile {
    // The file carries no data (reads/writes are dataless no-ops); it exists only
    // to pin the namespace referenced by the wrapped Arc.
    fileops_impl_nonseekable!();
    fileops_impl_dataless!();
    fileops_impl_noop_sync!();
}
126
/// An empty struct that we use to track the number of active clients for a mount.
///
/// Each active client takes a reference to this object. The unmount operation fails
/// if there are any active clients of the mount.
///
/// The count is derived from `Arc::strong_count`; see `Mount::active_clients`.
type MountClientMarker = Arc<()>;
132
/// An instance of a filesystem mounted in a namespace.
///
/// At a mount, path traversal switches from one filesystem to another.
/// The client sees a composed directory structure that glues together the
/// directories from the underlying FsNodes from those filesystems.
///
/// The mounts in a namespace form a mount tree, with `mountpoint` pointing to the parent and
/// `submounts` pointing to the children.
pub struct Mount {
    /// The directory entry at the root of this mount.
    root: DirEntryHandle,
    /// Mount flags; only flags in `MountFlags::STORED_ON_MOUNT` are kept here
    /// (asserted in `new_with_root`, masked in `update_flags`).
    flags: Mutex<MountFlags>,
    /// The mounted filesystem.
    fs: FileSystemHandle,

    /// A unique identifier for this mount reported in /proc/pid/mountinfo.
    id: u64,

    /// A count of the number of active clients.
    active_client_counter: MountClientMarker,

    // Lock ordering: mount -> submount
    state: RwLock<MountState>,
    // Mount used to contain a Weak<Namespace>. It no longer does because since the mount point
    // hash was moved from Namespace to Mount, nothing actually uses it. Now that
    // Namespace::clone_namespace() is implemented in terms of Mount::clone_mount_recursive, it
    // won't be trivial to add it back. I recommend turning the mountpoint field into an enum of
    // Mountpoint or Namespace, maybe called "parent", and then traverse up to the top of the tree
    // if you need to find a Mount's Namespace.
}
type MountHandle = Arc<Mount>;
162
/// Public representation of the mount options.
#[derive(Clone, Debug)]
pub struct MountInfo {
    /// The underlying mount, or `None` for an element not tied to any mount.
    handle: Option<MountHandle>,
}
168
169impl MountInfo {
170    /// `MountInfo` for a element that is not tied to a given mount. Mount flags will be considered
171    /// empty.
172    pub fn detached() -> Self {
173        None.into()
174    }
175
176    /// The mount flags of the represented mount.
177    pub fn flags(&self) -> MountFlags {
178        if let Some(handle) = &self.handle {
179            handle.flags()
180        } else {
181            // Consider not mounted node have the NOATIME flags.
182            MountFlags::NOATIME
183        }
184    }
185
186    /// Checks whether this `MountInfo` represents a writable file system mount.
187    pub fn check_readonly_filesystem(&self) -> Result<(), Errno> {
188        if self.flags().contains(MountFlags::RDONLY) {
189            return error!(EROFS);
190        }
191        Ok(())
192    }
193
194    /// Checks whether this `MountInfo` represents an executable file system mount.
195    pub fn check_noexec_filesystem(&self) -> Result<(), Errno> {
196        if self.flags().contains(MountFlags::NOEXEC) {
197            return error!(EACCES);
198        }
199        Ok(())
200    }
201}
202
// `MountInfo` derefs to its optional handle so callers can use it anywhere an
// `Option<MountHandle>` is expected.
impl Deref for MountInfo {
    type Target = Option<MountHandle>;

    fn deref(&self) -> &Self::Target {
        &self.handle
    }
}

impl DerefMut for MountInfo {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.handle
    }
}
216
217impl std::cmp::PartialEq for MountInfo {
218    fn eq(&self, other: &Self) -> bool {
219        self.handle.as_ref().map(Arc::as_ptr) == other.handle.as_ref().map(Arc::as_ptr)
220    }
221}
222
223impl std::cmp::Eq for MountInfo {}
224
225impl Into<MountInfo> for Option<MountHandle> {
226    fn into(self) -> MountInfo {
227        MountInfo { handle: self }
228    }
229}
230
/// The mutable parts of a `Mount`: its position in the mount tree and its
/// propagation (peer group / upstream) membership.
#[derive(Default)]
pub struct MountState {
    /// The namespace node that this mount is mounted on. This is a tuple instead of a
    /// NamespaceNode because the Mount pointer has to be weak because this is the pointer to the
    /// parent mount, the parent has a pointer to the children too, and making both strong would be
    /// a cycle.
    mountpoint: Option<(Weak<Mount>, DirEntryHandle)>,

    // The set is keyed by the mountpoints which are always descendants of this mount's root.
    // Conceptually, the set is more akin to a map: `DirEntry -> MountHandle`, but we use a set
    // instead because `Submount` has a drop implementation that needs both the key and value.
    //
    // Each directory entry can only have one mount attached. Mount shadowing works by using the
    // root of the inner mount as a mountpoint. For example, if filesystem A is mounted at /foo,
    // mounting filesystem B on /foo will create the mount as a child of the A mount, attached to
    // A's root, instead of the root mount.
    submounts: HashSet<Submount>,

    /// The membership of this mount in its peer group. Do not access directly. Instead use
    /// peer_group(), take_from_peer_group(), and set_peer_group().
    // TODO(tbodt): Refactor the links into, some kind of extra struct or something? This is hard
    // because setting this field requires the Arc<Mount>.
    peer_group_: Option<(Arc<PeerGroup>, PtrKey<Mount>)>,
    /// The membership of this mount in a PeerGroup's downstream. Do not access directly. Instead
    /// use upstream(), take_from_upstream(), and set_upstream().
    upstream_: Option<(Weak<PeerGroup>, PtrKey<Mount>)>,
}
258
/// A group of mounts. Setting MS_SHARED on a mount puts it in its own peer group. Any bind mounts
/// of a mount in the group are also added to the group. A mount created in any mount in a peer
/// group will be automatically propagated (recreated) in every other mount in the group.
#[derive(Default)]
struct PeerGroup {
    /// Unique ID, allocated via `Kernel::get_next_peer_group_id`.
    id: u64,
    state: RwLock<PeerGroupState>,
}
#[derive(Default)]
struct PeerGroupState {
    /// Members of the group. Held weakly so the group does not keep mounts alive.
    mounts: HashSet<WeakKey<Mount>>,
    /// Mounts that receive propagation from this group (MS_SLAVE downstream).
    downstream: HashSet<WeakKey<Mount>>,
}
272
/// What to attach at a mountpoint: the root of a filesystem, or a bind mount of
/// an existing namespace node.
pub enum WhatToMount {
    Fs(FileSystemHandle),
    Bind(NamespaceNode),
}
277
impl Mount {
    /// Creates a mount of `what`: a fresh mount of a filesystem's root, or a bind
    /// mount of an existing namespace node.
    ///
    /// # Panics
    ///
    /// Panics if `what` is a bind of a node that has no mount (an anonymous node).
    pub fn new(what: WhatToMount, flags: MountFlags) -> MountHandle {
        match what {
            WhatToMount::Fs(fs) => Self::new_with_root(fs.root().clone(), flags),
            WhatToMount::Bind(node) => {
                let mount = node.mount.as_ref().expect("can't bind mount from an anonymous node");
                mount.clone_mount(&node.entry, flags)
            }
        }
    }

    /// Creates a new mount rooted at `root`.
    ///
    /// # Panics
    ///
    /// Panics if `flags` contains flags outside `MountFlags::STORED_ON_MOUNT`, or
    /// if the filesystem's kernel has been dropped.
    fn new_with_root(root: DirEntryHandle, flags: MountFlags) -> MountHandle {
        let known_flags = MountFlags::STORED_ON_MOUNT;
        assert!(
            !flags.intersects(!known_flags),
            "mount created with extra flags {:?}",
            flags - known_flags
        );
        let fs = root.node.fs();
        let kernel = fs.kernel.upgrade().expect("can't create mount without kernel");
        Arc::new(Self {
            id: kernel.get_next_mount_id(),
            flags: Mutex::new(flags),
            root,
            active_client_counter: Default::default(),
            fs,
            state: Default::default(),
        })
    }

    /// A namespace node referring to the root of the mount.
    pub fn root(self: &MountHandle) -> NamespaceNode {
        NamespaceNode::new(Arc::clone(self), Arc::clone(&self.root))
    }

    /// Create the specified mount as a child. Also propagate it to the mount's peer group.
    fn create_submount(
        self: &MountHandle,
        dir: &DirEntryHandle,
        what: WhatToMount,
        flags: MountFlags,
    ) {
        // TODO(tbodt): Making a copy here is necessary for lock ordering, because the peer group
        // lock nests inside all mount locks (it would be impractical to reverse this because you
        // need to lock a mount to get its peer group.) But it opens the door to race conditions
        // where if a peer are concurrently being added, the mount might not get propagated to the
        // new peer. The only true solution to this is bigger locks, somehow using the same lock
        // for the peer group and all of the mounts in the group. Since peer groups are fluid and
        // can have mounts constantly joining and leaving and then joining other groups, the only
        // sensible locking option is to use a single global lock for all mounts and peer groups.
        // This is almost impossible to express in rust. Help.
        //
        // Update: Also necessary to make a copy to prevent excess replication, see the comment on
        // the following Mount::new call.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        // Create the mount after copying the peer groups, because in the case of creating a bind
        // mount inside itself, the new mount would get added to our peer group during the
        // Mount::new call, but we don't want to replicate into it already. For an example see
        // MountTest.QuizBRecursion.
        let mount = Mount::new(what, flags);

        // A child created under a shared mount is itself shared.
        if self.read().is_shared() {
            mount.write().make_shared();
        }

        // Replicate the new mount into every peer, skipping ourselves.
        for peer in peers {
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            let clone = mount.clone_mount_recursive();
            peer.write().add_submount_internal(dir, clone);
        }

        self.write().add_submount_internal(dir, mount)
    }

    /// Removes the submount keyed by `mount_hash_key`, propagating the unmount to
    /// the peer group as described in mount_namespaces(7).
    fn remove_submount(self: &MountHandle, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
        // create_submount explains why we need to make a copy of peers.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        for peer in peers {
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            // mount_namespaces(7): If B is shared, then all most-recently-mounted mounts at b on
            // mounts that receive propagation from mount B and do not have submounts under them are
            // unmounted.
            let mut peer = peer.write();
            if let Some(submount) = peer.submounts.get(mount_hash_key) {
                if !submount.mount.read().submounts.is_empty() {
                    continue;
                }
            }
            // A missing submount on a peer is not an error; ignore the result.
            let _ = peer.remove_submount_internal(mount_hash_key);
        }

        self.write().remove_submount_internal(mount_hash_key)
    }

    /// Create a new mount with the same filesystem, flags, and peer group. Used to implement bind
    /// mounts.
    fn clone_mount(
        self: &MountHandle,
        new_root: &DirEntryHandle,
        flags: MountFlags,
    ) -> MountHandle {
        assert!(new_root.is_descendant_of(&self.root));
        // According to mount(2) on bind mounts, all flags other than MS_REC are ignored when doing
        // a bind mount.
        let clone = Self::new_with_root(Arc::clone(new_root), self.flags());

        if flags.contains(MountFlags::REC) {
            // This is two steps because the alternative (locking clone.state while iterating over
            // self.state.submounts) trips tracing_mutex. The lock ordering is parent -> child, and
            // if the clone is eventually made a child of self, this looks like an ordering
            // violation. I'm not convinced it's a real issue, but I can't convince myself it's not
            // either.
            let mut submounts = vec![];
            for Submount { dir, mount } in &self.state.read().submounts {
                submounts.push((dir.clone(), mount.clone_mount_recursive()));
            }
            let mut clone_state = clone.write();
            for (dir, submount) in submounts {
                clone_state.add_submount_internal(&dir, submount);
            }
        }

        // Put the clone in the same peer group
        let peer_group = self.state.read().peer_group().map(Arc::clone);
        if let Some(peer_group) = peer_group {
            clone.write().set_peer_group(peer_group);
        }

        clone
    }

    /// Do a clone of the full mount hierarchy below this mount. Used for creating mount
    /// namespaces and creating copies to use for propagation.
    fn clone_mount_recursive(self: &MountHandle) -> MountHandle {
        self.clone_mount(&self.root, MountFlags::REC)
    }

    /// Changes the propagation type of this mount (MountFlags::SHARED, PRIVATE, or
    /// DOWNSTREAM), optionally for the whole subtree. Unknown flags are logged and
    /// ignored.
    pub fn change_propagation(self: &MountHandle, flag: MountFlags, recursive: bool) {
        let mut state = self.write();
        match flag {
            MountFlags::SHARED => state.make_shared(),
            MountFlags::PRIVATE => state.make_private(),
            MountFlags::DOWNSTREAM => state.make_downstream(),
            _ => {
                log_warn!("mount propagation {:?}", flag);
                return;
            }
        }

        if recursive {
            // Lock order parent -> child: we hold our own lock while locking submounts.
            for submount in &state.submounts {
                submount.mount.change_propagation(flag, recursive);
            }
        }
    }

    /// The current mount flags.
    fn flags(&self) -> MountFlags {
        *self.flags.lock()
    }

    /// Replaces the stored mount flags (as for remount). Flags not stored on the
    /// mount are masked off; atime flags are preserved when none are specified.
    pub fn update_flags(self: &MountHandle, mut flags: MountFlags) {
        flags &= MountFlags::STORED_ON_MOUNT;
        let atime_flags = MountFlags::NOATIME
            | MountFlags::NODIRATIME
            | MountFlags::RELATIME
            | MountFlags::STRICTATIME;
        let mut stored_flags = self.flags.lock();
        if !flags.intersects(atime_flags) {
            // Since Linux 3.17, if none of MS_NOATIME, MS_NODIRATIME,
            // MS_RELATIME, or MS_STRICTATIME is specified in mountflags, then
            // the remount operation preserves the existing values of these
            // flags (rather than defaulting to MS_RELATIME).
            flags |= *stored_flags & atime_flags;
        }
        // The "effect [of MS_STRICTATIME] is to clear the MS_NOATIME and MS_RELATIME flags."
        flags &= !MountFlags::STRICTATIME;
        *stored_flags = flags;
    }

    /// The number of active clients of this mount.
    ///
    /// The mount cannot be unmounted if there are any active clients.
    fn active_clients(&self) -> usize {
        // We need to subtract one for our own reference. We are not a real client.
        Arc::strong_count(&self.active_client_counter) - 1
    }

    /// Detaches this mount from its parent. Without UnmountFlags::DETACH, fails
    /// with EBUSY if there are active clients or submounts; fails with EINVAL if
    /// the mount has no mountpoint (e.g. it is a namespace root).
    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
        if !flags.contains(UnmountFlags::DETACH) {
            if self.active_clients() > 0 || !self.state.read().submounts.is_empty() {
                return error!(EBUSY);
            }
        }
        let mountpoint = self.state.read().mountpoint().ok_or_else(|| errno!(EINVAL))?;
        let parent_mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
        parent_mount.remove_submount(mountpoint.mount_hash_key())
    }

    /// Returns the security state of the fs.
    pub fn security_state(&self) -> &security::FileSystemState {
        &self.fs.security_state
    }

    /// Returns the name of the fs.
    pub fn fs_name(&self) -> &'static FsStr {
        self.fs.name()
    }

    state_accessor!(Mount, state, Arc<Mount>);
}
500
impl MountState {
    /// Returns true if there is a submount on top of `dir_entry`.
    pub fn has_submount(&self, dir_entry: &DirEntryHandle) -> bool {
        self.submounts.contains(ArcKey::ref_cast(dir_entry))
    }

    /// The NamespaceNode on which this Mount is mounted.
    ///
    /// Returns None if this mount has no mountpoint, or if the parent mount has
    /// already been dropped (the parent link is weak).
    fn mountpoint(&self) -> Option<NamespaceNode> {
        let (mount, entry) = self.mountpoint.as_ref()?;
        Some(NamespaceNode::new(mount.upgrade()?, entry.clone()))
    }

    /// Return this mount's current peer group.
    fn peer_group(&self) -> Option<&Arc<PeerGroup>> {
        let (group, _) = self.peer_group_.as_ref()?;
        Some(group)
    }

    /// Remove this mount from its peer group and return the peer group.
    fn take_from_peer_group(&mut self) -> Option<Arc<PeerGroup>> {
        let (old_group, old_mount) = self.peer_group_.take()?;
        old_group.remove(old_mount);
        if let Some(upstream) = self.take_from_upstream() {
            // Hand our upstream over to an arbitrary remaining member of the group, so
            // the group as a whole keeps receiving propagation.
            let next_mount =
                old_group.state.read().mounts.iter().next().map(|w| w.0.upgrade().unwrap());
            if let Some(next_mount) = next_mount {
                // TODO(https://fxbug.dev/42065259): Fix the lock ordering here. We've locked next_mount
                // while self is locked, and since the propagation tree and mount tree are
                // separate, this could violate the mount -> submount order previously established.
                next_mount.write().set_upstream(upstream);
            }
        }
        Some(old_group)
    }

    /// The peer group this mount receives propagation from, if still alive.
    fn upstream(&self) -> Option<Arc<PeerGroup>> {
        self.upstream_.as_ref().and_then(|g| g.0.upgrade())
    }

    /// Remove this mount from its upstream peer group and return that group.
    fn take_from_upstream(&mut self) -> Option<Arc<PeerGroup>> {
        let (old_upstream, old_mount) = self.upstream_.take()?;
        // TODO(tbodt): Reason about whether the upgrade() could possibly return None, and what we
        // should actually do in that case.
        let old_upstream = old_upstream.upgrade()?;
        old_upstream.remove_downstream(old_mount);
        Some(old_upstream)
    }
}
549
#[apply(state_implementation!)]
impl MountState<Base = Mount, BaseType = Arc<Mount>> {
    /// Add a child mount *without propagating it to the peer group*. For internal use only.
    fn add_submount_internal(&mut self, dir: &DirEntryHandle, mount: MountHandle) {
        if !dir.is_descendant_of(&self.base.root) {
            // The mountpoint is not inside this mount; silently ignore the request.
            return;
        }

        let submount = mount.fs.kernel.upgrade().unwrap().mounts.register_mount(dir, mount.clone());
        let old_mountpoint =
            mount.state.write().mountpoint.replace((Arc::downgrade(self.base), Arc::clone(dir)));
        assert!(old_mountpoint.is_none(), "add_submount can only take a newly created mount");
        // Mount shadowing is implemented by mounting onto the root of the first mount, not by
        // creating two mounts on the same mountpoint.
        let old_mount = self.submounts.replace(submount);

        // In rare cases, mount propagation might result in a request to mount on a directory where
        // something is already mounted. MountTest.LotsOfShadowing will trigger this. Linux handles
        // this by inserting the new mount between the old mount and the current mount.
        if let Some(mut old_mount) = old_mount {
            // Previous state: self[dir] = old_mount
            // New state: self[dir] = new_mount, new_mount[new_mount.root] = old_mount
            // The new mount has already been inserted into self, now just update the old mount to
            // be a child of the new mount.
            old_mount.mount.write().mountpoint = Some((Arc::downgrade(&mount), Arc::clone(dir)));
            old_mount.dir = ArcKey(mount.root.clone());
            mount.write().submounts.insert(old_mount);
        }
    }

    /// Remove the child mount keyed by `mount_hash_key`; EINVAL if there is none.
    fn remove_submount_internal(&mut self, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
        if self.submounts.remove(mount_hash_key) { Ok(()) } else { error!(EINVAL) }
    }

    /// Set this mount's peer group.
    fn set_peer_group(&mut self, group: Arc<PeerGroup>) {
        self.take_from_peer_group();
        group.add(self.base);
        self.peer_group_ = Some((group, Arc::as_ptr(self.base).into()));
    }

    /// Make this mount receive propagation from `group`.
    fn set_upstream(&mut self, group: Arc<PeerGroup>) {
        self.take_from_upstream();
        group.add_downstream(self.base);
        self.upstream_ = Some((Arc::downgrade(&group), Arc::as_ptr(self.base).into()));
    }

    /// Is the mount in a peer group? Corresponds to MS_SHARED.
    pub fn is_shared(&self) -> bool {
        self.peer_group().is_some()
    }

    /// Put the mount in a peer group. Implements MS_SHARED.
    pub fn make_shared(&mut self) {
        if self.is_shared() {
            // Already shared; keep the existing peer group.
            return;
        }
        let kernel =
            self.base.fs.kernel.upgrade().expect("can't create new peer group without kernel");
        self.set_peer_group(PeerGroup::new(kernel.get_next_peer_group_id()));
    }

    /// Take the mount out of its peer group, also remove upstream if any. Implements MS_PRIVATE.
    pub fn make_private(&mut self) {
        self.take_from_peer_group();
        self.take_from_upstream();
    }

    /// Take the mount out of its peer group and make it downstream instead. Implements
    /// MountFlags::DOWNSTREAM (MS_SLAVE).
    pub fn make_downstream(&mut self) {
        if let Some(peer_group) = self.take_from_peer_group() {
            self.set_upstream(peer_group);
        }
    }
}
626
627impl PeerGroup {
628    fn new(id: u64) -> Arc<Self> {
629        Arc::new(Self { id, state: Default::default() })
630    }
631
632    fn add(&self, mount: &Arc<Mount>) {
633        self.state.write().mounts.insert(WeakKey::from(mount));
634    }
635
636    fn remove(&self, mount: PtrKey<Mount>) {
637        self.state.write().mounts.remove(&mount);
638    }
639
640    fn add_downstream(&self, mount: &Arc<Mount>) {
641        self.state.write().downstream.insert(WeakKey::from(mount));
642    }
643
644    fn remove_downstream(&self, mount: PtrKey<Mount>) {
645        self.state.write().downstream.remove(&mount);
646    }
647
648    fn copy_propagation_targets(&self) -> Vec<MountHandle> {
649        let mut buf = vec![];
650        self.collect_propagation_targets(&mut buf);
651        buf
652    }
653
654    fn collect_propagation_targets(&self, buf: &mut Vec<MountHandle>) {
655        let downstream_mounts: Vec<_> = {
656            let state = self.state.read();
657            buf.extend(state.mounts.iter().filter_map(|m| m.0.upgrade()));
658            state.downstream.iter().filter_map(|m| m.0.upgrade()).collect()
659        };
660        for mount in downstream_mounts {
661            let peer_group = mount.read().peer_group().map(Arc::clone);
662            match peer_group {
663                Some(group) => group.collect_propagation_targets(buf),
664                None => buf.push(mount),
665            }
666        }
667    }
668}
669
impl Drop for Mount {
    fn drop(&mut self) {
        // `get_mut` needs no lock: in drop we hold the only reference.
        let state = self.state.get_mut();
        // Detach from the propagation topology so peer groups don't retain stale
        // entries for this mount.
        state.take_from_peer_group();
        state.take_from_upstream();
    }
}
677
678impl fmt::Debug for Mount {
679    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
680        let state = self.state.read();
681        f.debug_struct("Mount")
682            .field("id", &(self as *const Mount))
683            .field("root", &self.root)
684            .field("mountpoint", &state.mountpoint)
685            .field("submounts", &state.submounts)
686            .finish()
687    }
688}
689
impl Kernel {
    /// Allocates the next unique mount ID (reported in /proc/pid/mountinfo).
    pub fn get_next_mount_id(&self) -> u64 {
        self.next_mount_id.next()
    }

    /// Allocates the next unique peer group ID.
    pub fn get_next_peer_group_id(&self) -> u64 {
        self.next_peer_group_id.next()
    }

    /// Allocates the next unique mount namespace ID.
    pub fn get_next_namespace_id(&self) -> u64 {
        self.next_namespace_id.next()
    }
}
703
impl CurrentTask {
    /// Creates a filesystem of type `fs_type` with `options` by looking the type
    /// up in the kernel's `FsRegistry`. Fails with ENODEV when the type is not
    /// registered.
    pub fn create_filesystem(
        &self,
        locked: &mut Locked<Unlocked>,
        fs_type: &FsStr,
        options: FileSystemOptions,
    ) -> Result<FileSystemHandle, Errno> {
        // Please register new file systems via //src/starnix/modules/lib.rs, even if the file
        // system is implemented inside starnix_core.
        //
        // Most file systems should be implemented as modules. The VFS provides various traits that
        // let starnix_core integrate file systems without needing to depend on the file systems
        // directly.
        self.kernel()
            .expando
            .get::<FsRegistry>()
            .create(locked, self, fs_type, options)
            .ok_or_else(|| errno!(ENODEV, fs_type))?
    }
}
724
// Writes to `sink` the mount flags and LSM mount options for the given `mount`:
// first the flags' Display form, then whatever the security module appends.
fn write_mount_info(task: &Task, sink: &mut DynamicFileBuf, mount: &Mount) -> Result<(), Errno> {
    write!(sink, "{}", mount.flags())?;
    security::sb_show_options(&task.kernel(), sink, &mount)
}
730
731struct ProcMountsFileSource(WeakRef<Task>);
732
impl DynamicFileSource for ProcMountsFileSource {
    /// Writes one line per mount reachable from the task's root, in the form
    /// `<source> <mountpoint> <fstype> <flags and options> 0 0`.
    fn generate(
        &self,
        _current_task: &CurrentTask,
        sink: &mut DynamicFileBuf,
    ) -> Result<(), Errno> {
        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
        // extra work to maintain it.
        let task = Task::from_weak(&self.0)?;
        let task_fs = task.live()?.fs.read();
        let root = task_fs.root();
        let ns = task_fs.namespace();
        for_each_mount(&ns.root_mount, &mut |mount| {
            // A mount with no mountpoint (the namespace root) is reported at its own root.
            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
            if !mountpoint.is_descendant_of(&root) {
                // Skip mounts not reachable from the task's root directory.
                return Ok(());
            }
            write!(
                sink,
                "{} {} {} ",
                mount.fs.options.source_for_display(),
                mountpoint.path(&task_fs),
                mount.fs.name(),
            )?;
            write_mount_info(&task, sink, mount)?;
            writeln!(sink, " 0 0")?;
            Ok(())
        })?;
        Ok(())
    }
}
766
/// The proc `mounts` file: a `DynamicFile` that regenerates the mount list when read.
pub struct ProcMountsFile {
    dynamic_file: DynamicFile<ProcMountsFileSource>,
}
770
impl ProcMountsFile {
    /// Returns node ops that build a fresh `ProcMountsFile` for `task` on each open.
    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
        SimpleFileNode::new(move |_, _| {
            Ok(Self { dynamic_file: DynamicFile::new(ProcMountsFileSource(task.clone())) })
        })
    }
}
778
impl FileOps for ProcMountsFile {
    // Reads/writes/seeks are served by the wrapped dynamic file.
    fileops_impl_delegate_read_write_and_seek!(self, self.dynamic_file);
    fileops_impl_noop_sync!();

    fn wait_async(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        waiter: &Waiter,
        _events: FdEvents,
        _handler: EventHandler,
    ) -> Option<WaitCanceler> {
        // Polling this file gives notifications when any change to mounts occurs. This is not
        // implemented yet, but stubbed for Android init.
        Some(waiter.fake_wait())
    }

    fn query_events(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
    ) -> Result<FdEvents, Errno> {
        // Consistent with the stubbed wait_async above: no events are ever signaled.
        Ok(FdEvents::empty())
    }
}
806
/// `DynamicFileSource` for `/proc/<pid>/mountinfo`: holds a weak reference to
/// the task whose mount namespace is rendered.
#[derive(Clone)]
pub struct ProcMountinfoFile(WeakRef<Task>);
809impl ProcMountinfoFile {
810    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
811        DynamicFile::new_node(Self(task))
812    }
813}
impl DynamicFileSource for ProcMountinfoFile {
    /// Writes one line per mount reachable from the task's root, in the
    /// `/proc/<pid>/mountinfo` format (mount id, parent id, dev, fs-root path,
    /// mountpoint, options, optional peer/master fields, fstype, source, flags).
    fn generate(
        &self,
        _current_task: &CurrentTask,
        sink: &mut DynamicFileBuf,
    ) -> Result<(), Errno> {
        // Returns path to the `dir` from the root of the file system.
        fn path_from_fs_root(dir: &DirEntryHandle) -> FsString {
            let mut path = PathBuilder::new();
            if dir.is_dead() {
                // Return `/foo/dir//deleted` if the dir was deleted.
                path.prepend_element("/deleted".into());
            }
            // Walk parent links under an RCU read scope to build the path bottom-up.
            let scope = RcuReadScope::new();
            let mut current = dir.deref();
            while let Some(parent) = current.parent_ref(&scope) {
                path.prepend_element(current.local_name(&scope));
                current = parent;
            }
            path.build_absolute()
        }

        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
        // extra work to maintain it.
        let task = Task::from_weak(&self.0)?;
        let task_fs = task.live()?.fs.read();
        let root = task_fs.root();
        let ns = task_fs.namespace();
        for_each_mount(&ns.root_mount, &mut |mount| {
            // A mount with no mountpoint (e.g. the namespace root) is reported at its own root.
            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
            // Mounts outside the task's root (e.g. hidden by chroot) are not listed.
            if !mountpoint.is_descendant_of(&root) {
                return Ok(());
            }
            // Can't fail, mountpoint() and root() can't return a NamespaceNode with no mount
            let parent = mountpoint.mount.as_ref().unwrap();
            write!(
                sink,
                "{} {} {} {} {} ",
                mount.id,
                parent.id,
                mount.root.node.fs().dev_id,
                path_from_fs_root(&mount.root),
                mountpoint.path(&task_fs),
            )?;
            write_mount_info(&task, sink, mount)?;
            // Optional fields: shared peer group and propagation master, when present.
            if let Some(peer_group) = mount.read().peer_group() {
                write!(sink, " shared:{}", peer_group.id)?;
            }
            if let Some(upstream) = mount.read().upstream() {
                write!(sink, " master:{}", upstream.id)?;
            }
            writeln!(
                sink,
                " - {} {} {}",
                mount.fs.name(),
                mount.fs.options.source_for_display(),
                mount.fs.options.flags,
            )?;
            Ok(())
        })?;
        Ok(())
    }
}
879
880fn for_each_mount<E>(
881    mount: &MountHandle,
882    callback: &mut impl FnMut(&MountHandle) -> Result<(), E>,
883) -> Result<(), E> {
884    callback(mount)?;
885    // Collect list first to avoid self deadlock when ProcMountinfoFile::read_at tries to call
886    // NamespaceNode::path()
887    let submounts: Vec<_> = mount.read().submounts.iter().map(|s| s.mount.clone()).collect();
888    for submount in submounts {
889        for_each_mount(&submount, callback)?;
890    }
891    Ok(())
892}
893
/// The `SymlinkMode` enum encodes how symlinks are followed during path traversal.
///
/// Only the final path component is affected; intermediate symlinks are always
/// followed during resolution.
#[derive(Default, PartialEq, Eq, Copy, Clone, Debug)]
pub enum SymlinkMode {
    /// Follow a symlink at the end of a path resolution. This is the default.
    #[default]
    Follow,

    /// Do not follow a symlink at the end of a path resolution.
    NoFollow,
}
904
/// The maximum number of symlink traversals that can be made during path resolution.
///
/// Exhausting this budget makes the lookup fail with `ELOOP`. The value 40
/// matches Linux's `MAXSYMLINKS` limit.
pub const MAX_SYMLINK_FOLLOWS: u8 = 40;
907
/// The context passed during namespace lookups.
///
/// Namespace lookups need to mutate a shared context in order to correctly
/// count the number of remaining symlink traversals.
pub struct LookupContext {
    /// The SymlinkMode for the lookup.
    ///
    /// As the lookup proceeds, the follow count is decremented each time the
    /// lookup traverses a symlink.
    pub symlink_mode: SymlinkMode,

    /// The number of symlinks remaining to follow.
    ///
    /// Each time path resolution calls readlink, this value is decremented.
    /// When it reaches zero, further symlink traversal fails with `ELOOP`.
    pub remaining_follows: u8,

    /// Whether the result of the lookup must be a directory.
    ///
    /// For example, if the path ends with a `/` or if userspace passes
    /// O_DIRECTORY. This flag can be set to true if the lookup encounters a
    /// symlink that ends with a `/`.
    pub must_be_directory: bool,

    /// Resolve flags passed to `openat2`. Empty if the lookup originated in any other syscall.
    pub resolve_flags: ResolveFlags,

    /// Base directory for the lookup. Set only when either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT`
    /// is passed to `openat2`.
    pub resolve_base: ResolveBase,
}
938
/// Used to specify base directory in `LookupContext` for lookups originating in the `openat2`
/// syscall with either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT` flag.
#[derive(Clone, Eq, PartialEq)]
pub enum ResolveBase {
    /// No restriction: the lookup resolves against the task's usual root.
    None,

    /// The lookup is not allowed to traverse any node that's not beneath the specified node.
    Beneath(NamespaceNode),

    /// The lookup should be handled as if the specified node were the file-system root.
    InRoot(NamespaceNode),
}
951
952impl LookupContext {
953    pub fn new(symlink_mode: SymlinkMode) -> LookupContext {
954        LookupContext {
955            symlink_mode,
956            remaining_follows: MAX_SYMLINK_FOLLOWS,
957            must_be_directory: false,
958            resolve_flags: ResolveFlags::empty(),
959            resolve_base: ResolveBase::None,
960        }
961    }
962
963    pub fn with(&self, symlink_mode: SymlinkMode) -> LookupContext {
964        LookupContext { symlink_mode, resolve_base: self.resolve_base.clone(), ..*self }
965    }
966
967    pub fn update_for_path(&mut self, path: &FsStr) {
968        if path.last() == Some(&b'/') {
969            // The last path element must resolve to a directory. This is because a trailing slash
970            // was found in the path.
971            self.must_be_directory = true;
972            // If the last path element is a symlink, we should follow it.
973            // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
974            self.symlink_mode = SymlinkMode::Follow;
975        }
976    }
977}
978
979impl Default for LookupContext {
980    fn default() -> Self {
981        LookupContext::new(SymlinkMode::Follow)
982    }
983}
984
/// Whether the path is reachable from the given root.
///
/// Produced by `NamespaceNode::path_from_root`; both variants carry the
/// computed path so callers can still use it when unreachable.
pub enum PathWithReachability {
    /// The path is reachable from the given root.
    Reachable(FsString),

    /// The path is not reachable from the given root.
    Unreachable(FsString),
}
993
994impl PathWithReachability {
995    pub fn into_path(self) -> FsString {
996        match self {
997            PathWithReachability::Reachable(path) => path,
998            PathWithReachability::Unreachable(path) => path,
999        }
1000    }
1001}
1002
/// A node in a mount namespace.
///
/// This tree is a composite of the mount tree and the FsNode tree.
///
/// These nodes are used when traversing paths in a namespace in order to
/// present the client the directory structure that includes the mounted
/// filesystems.
#[derive(Clone)]
pub struct NamespaceNode {
    /// The mount where this namespace node is mounted.
    ///
    /// A given FsNode can be mounted in multiple places in a namespace. This
    /// field distinguishes between them.
    ///
    /// `None` (inside `MountInfo`) for anonymous nodes that are not part of
    /// any namespace.
    pub mount: MountInfo,

    /// The FsNode that corresponds to this namespace entry.
    pub entry: DirEntryHandle,
}
1021
1022impl NamespaceNode {
1023    pub fn new(mount: MountHandle, entry: DirEntryHandle) -> Self {
1024        Self { mount: Some(mount).into(), entry }
1025    }
1026
    /// Create a namespace node that is not mounted in a namespace.
    ///
    /// Anonymous nodes have no mount; their path is reported as the node's
    /// internal name (see `path_from_root`).
    pub fn new_anonymous(entry: DirEntryHandle) -> Self {
        Self { mount: None.into(), entry }
    }
1031
    /// Create a namespace node that is not mounted in a namespace and that refers to a node that
    /// is not rooted in a hierarchy and has no name.
    pub fn new_anonymous_unrooted(current_task: &CurrentTask, node: FsNodeHandle) -> Self {
        let dir_entry = DirEntry::new_unrooted(node);
        // Security-label initialization is best effort here; failures are ignored.
        let _ = security::fs_node_init_with_dentry_no_xattr(current_task, &dir_entry);
        Self::new_anonymous(dir_entry)
    }
1039
    /// Create a FileObject corresponding to this namespace node.
    ///
    /// This function is the primary way of instantiating FileObjects. Each
    /// FileObject records the NamespaceNode that created it in order to
    /// remember its path in the Namespace.
    ///
    /// Errors propagate from the underlying `FsNode::open` (permission checks,
    /// node-specific open behavior) and from `FileObject::new`.
    pub fn open(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        flags: OpenFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        let ops = self.entry.node.open(locked, current_task, self, flags, access_check)?;
        FileObject::new(locked, current_task, ops, self.clone(), flags)
    }
1055
    /// Create or open a node in the file system.
    ///
    /// Works for any type of node other than a symlink.
    ///
    /// Will return an existing node unless `flags` contains `OpenFlags::EXCL`.
    /// The requested `mode` has the task's umask applied before creation.
    pub fn open_create_node<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        mode: FileMode,
        dev: DeviceType,
        flags: OpenFlags,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let mode = current_task.fs().apply_umask(mode);
        let create_fn =
            |locked: &mut Locked<L>, dir: &FsNodeHandle, mount: &MountInfo, name: &_| {
                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
            };
        // With O_EXCL an already-existing entry is an error; otherwise it is returned as-is.
        let entry = if flags.contains(OpenFlags::EXCL) {
            self.entry.create_entry(locked, current_task, &self.mount, name, create_fn)
        } else {
            self.entry.get_or_create_entry(locked, current_task, &self.mount, name, create_fn)
        }?;
        Ok(self.with_new_entry(entry))
    }
1086
    /// Wraps this node in an `ActiveNamespaceNode`.
    pub fn into_active(self) -> ActiveNamespaceNode {
        ActiveNamespaceNode::new(self)
    }
1090
    /// Creates a `FileMapping` for this node, going through its active form.
    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
        self.into_active().into_mapping(mode)
    }
1094
    /// Create a node in the file system.
    ///
    /// Works for any type of node other than a symlink.
    ///
    /// Does not return an existing node. The requested `mode` has the task's
    /// umask applied before creation.
    pub fn create_node<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        mode: FileMode,
        dev: DeviceType,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let mode = current_task.fs().apply_umask(mode);
        let entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
            },
        )?;
        Ok(self.with_new_entry(entry))
    }
1124
    /// Create a symlink in the file system.
    ///
    /// To create another type of node, use `create_node`.
    ///
    /// `target` is stored verbatim; no umask is applied to symlinks.
    pub fn create_symlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        target: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                dir.create_symlink(locked, current_task, mount, name, target, owner)
            },
        )?;
        Ok(self.with_new_entry(entry))
    }
1150
    /// Creates an anonymous file.
    ///
    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
    ///
    /// Used by O_TMPFILE. The requested `mode` has the task's umask applied
    /// before creation.
    pub fn create_tmpfile<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        mode: FileMode,
        flags: OpenFlags,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let mode = current_task.fs().apply_umask(mode);
        Ok(self.with_new_entry(self.entry.create_tmpfile(
            locked,
            current_task,
            &self.mount,
            mode,
            owner,
            flags,
        )?))
    }
1177
    /// Creates a hard link named `name` to `child` in this directory.
    ///
    /// Returns the namespace node for the newly created directory entry.
    pub fn link<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        child: &FsNodeHandle,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let dir_entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| dir.link(locked, current_task, mount, name, child),
        )?;
        Ok(self.with_new_entry(dir_entry))
    }
1197
    /// Creates a file-system node named `name` and binds `socket` to it, as
    /// for `bind(2)` on an address in the file system.
    ///
    /// Only Unix-domain sockets can be bound to a node; any other socket type
    /// fails with `ENOTSUP`.
    pub fn bind_socket<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        socket: SocketHandle,
        socket_address: SocketAddress,
        mode: FileMode,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let dir_entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                let node = dir.create_node(
                    locked,
                    current_task,
                    mount,
                    name,
                    mode,
                    DeviceType::NONE,
                    current_task.current_fscred(),
                )?;
                if let Some(unix_socket) = socket.downcast_socket::<UnixSocket>() {
                    unix_socket.bind_socket_to_node(&socket, socket_address, &node)?;
                } else {
                    return error!(ENOTSUP);
                }
                Ok(node)
            },
        )?;
        Ok(self.with_new_entry(dir_entry))
    }
1235
    /// Removes the entry `name` from this directory.
    ///
    /// Reserved names are rejected up front: unlinking `..` from a directory
    /// yields `ENOTEMPTY`, removing the root yields `EBUSY`, removing `.`
    /// yields `EINVAL`, and reserved names are never valid non-directory
    /// targets (`ENOTDIR`). Otherwise the operation is delegated to the entry.
    pub fn unlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        kind: UnlinkKind,
        must_be_directory: bool,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        if DirEntry::is_reserved_name(name) {
            match kind {
                UnlinkKind::Directory => {
                    if name == ".." {
                        error!(ENOTEMPTY)
                    } else if self.parent().is_none() {
                        // The client is attempting to remove the root.
                        error!(EBUSY)
                    } else {
                        // Removing "." is invalid.
                        error!(EINVAL)
                    }
                }
                UnlinkKind::NonDirectory => error!(ENOTDIR),
            }
        } else {
            self.entry.unlink(locked, current_task, &self.mount, name, kind, must_be_directory)
        }
    }
1265
    /// Traverse down a parent-to-child link in the namespace.
    ///
    /// Handles `.` and `..` (without escaping a chroot or a `RESOLVE_BENEATH`
    /// base), follows symlinks according to `context`, enters any mount at the
    /// resulting entry, and enforces `RESOLVE_NO_XDEV` and the
    /// must-be-directory constraint.
    ///
    /// Errors include `ENOTDIR` (self is not a directory, or the result is not
    /// one when required), `ENAMETOOLONG`, `ELOOP` (follow budget exhausted or
    /// symlinks/magic-links forbidden), and `EXDEV` (escaping the resolve base
    /// or crossing mounts under `RESOLVE_NO_XDEV`).
    pub fn lookup_child<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        context: &mut LookupContext,
        basename: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        if !self.entry.node.is_dir() {
            return error!(ENOTDIR);
        }

        if basename.len() > NAME_MAX as usize {
            return error!(ENAMETOOLONG);
        }

        let child = if basename.is_empty() || basename == "." {
            self.clone()
        } else if basename == ".." {
            let root = match &context.resolve_base {
                ResolveBase::None => current_task.fs().root(),
                ResolveBase::Beneath(node) => {
                    // Do not allow traversal out of the 'node'.
                    if *self == *node {
                        return error!(EXDEV);
                    }
                    current_task.fs().root()
                }
                ResolveBase::InRoot(root) => root.clone(),
            };

            // Make sure this can't escape a chroot.
            if *self == root { root } else { self.parent().unwrap_or_else(|| self.clone()) }
        } else {
            let mut child = self.with_new_entry(self.entry.component_lookup(
                locked,
                current_task,
                &self.mount,
                basename,
            )?);
            // Resolve chains of symlinks; each hop consumes one unit of the follow budget.
            while child.entry.node.is_lnk() {
                match context.symlink_mode {
                    SymlinkMode::NoFollow => {
                        break;
                    }
                    SymlinkMode::Follow => {
                        if context.remaining_follows == 0
                            || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
                        {
                            return error!(ELOOP);
                        }
                        context.remaining_follows -= 1;
                        child = match child.readlink(locked, current_task)? {
                            SymlinkTarget::Path(link_target) => {
                                let link_directory = if link_target[0] == b'/' {
                                    // If the path is absolute, we'll resolve the root directory.
                                    match &context.resolve_base {
                                        ResolveBase::None => current_task.fs().root(),
                                        ResolveBase::Beneath(_) => return error!(EXDEV),
                                        ResolveBase::InRoot(root) => root.clone(),
                                    }
                                } else {
                                    // If the path is not absolute, it's a relative directory. Let's
                                    // try to get the parent of the current child, or in the case
                                    // that the child is the root we can just use that directly.
                                    child.parent().unwrap_or(child)
                                };
                                current_task.lookup_path(
                                    locked,
                                    context,
                                    link_directory,
                                    link_target.as_ref(),
                                )?
                            }
                            SymlinkTarget::Node(node) => {
                                // Magic links (e.g. /proc fd links) can be forbidden by openat2.
                                if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS) {
                                    return error!(ELOOP);
                                }
                                node
                            }
                        }
                    }
                };
            }

            // If the child is a mountpoint, descend into the mounted file system.
            child.enter_mount()
        };

        if context.resolve_flags.contains(ResolveFlags::NO_XDEV) && child.mount != self.mount {
            return error!(EXDEV);
        }

        if context.must_be_directory && !child.entry.node.is_dir() {
            return error!(ENOTDIR);
        }

        Ok(child)
    }
1367
    /// Traverse up a child-to-parent link in the namespace.
    ///
    /// This traversal matches the child-to-parent link in the underlying
    /// FsNode except at mountpoints, where the link switches from one
    /// filesystem to another.
    ///
    /// Returns `None` only at the namespace root (after escaping any mounts).
    pub fn parent(&self) -> Option<NamespaceNode> {
        // First hop to the mountpoint if this node is a mount root, then take
        // the entry's parent within that (outer) mount.
        let mountpoint_or_self = self.escape_mount();
        let parent = mountpoint_or_self.entry.parent()?;
        Some(mountpoint_or_self.with_new_entry(parent))
    }
1378
1379    /// Returns the parent, but does not escape mounts i.e. returns None if this node
1380    /// is the root of a mount.
1381    pub fn parent_within_mount(&self) -> Option<DirEntryHandle> {
1382        if let Ok(_) = self.mount_if_root() {
1383            return None;
1384        }
1385        self.entry.parent()
1386    }
1387
1388    /// Whether this namespace node is a descendant of the given node.
1389    ///
1390    /// Walks up the namespace node tree looking for ancestor. If ancestor is
1391    /// found, returns true. Otherwise, returns false.
1392    pub fn is_descendant_of(&self, ancestor: &NamespaceNode) -> bool {
1393        let ancestor = ancestor.escape_mount();
1394        let mut current = self.escape_mount();
1395        while current != ancestor {
1396            if let Some(parent) = current.parent() {
1397                current = parent.escape_mount();
1398            } else {
1399                return false;
1400            }
1401        }
1402        true
1403    }
1404
    /// If this is a mount point, return the root of the mount. Otherwise return self.
    ///
    /// Iterates because mounts can be stacked: the root of one mount may
    /// itself be the mountpoint of another.
    fn enter_mount(&self) -> NamespaceNode {
        // While the child is a mountpoint, replace child with the mount's root.
        fn enter_one_mount(node: &NamespaceNode) -> Option<NamespaceNode> {
            if let Some(mount) = node.mount.deref() {
                if let Some(submount) =
                    mount.state.read().submounts.get(ArcKey::ref_cast(&node.entry))
                {
                    return Some(submount.mount.root());
                }
            }
            None
        }
        let mut inner = self.clone();
        while let Some(inner_root) = enter_one_mount(&inner) {
            inner = inner_root;
        }
        inner
    }
1424
1425    /// If this is the root of a mount, return the mount point. Otherwise return self.
1426    ///
1427    /// This is not exactly the same as parent(). If parent() is called on a root, it will escape
1428    /// the mount, but then return the parent of the mount point instead of the mount point.
1429    fn escape_mount(&self) -> NamespaceNode {
1430        let mut mountpoint_or_self = self.clone();
1431        while let Some(mountpoint) = mountpoint_or_self.mountpoint() {
1432            mountpoint_or_self = mountpoint;
1433        }
1434        mountpoint_or_self
1435    }
1436
    /// If this node is the root of a mount, return it. Otherwise EINVAL.
    ///
    /// Anonymous nodes (no mount) also fail with EINVAL.
    pub fn mount_if_root(&self) -> Result<&MountHandle, Errno> {
        if let Some(mount) = self.mount.deref() {
            // Root-ness is identity of the DirEntry, not path equality.
            if Arc::ptr_eq(&self.entry, &mount.root) {
                return Ok(mount);
            }
        }
        error!(EINVAL)
    }
1446
    /// Returns the mountpoint at this location in the namespace.
    ///
    /// If this node is mounted in another node, this function returns the node
    /// at which this node is mounted. Otherwise, returns None (including when
    /// this node is not the root of its mount).
    fn mountpoint(&self) -> Option<NamespaceNode> {
        self.mount_if_root().ok()?.read().mountpoint()
    }
1454
1455    /// The path from the filesystem root to this node.
1456    pub fn path(&self, fs: &FsContext) -> FsString {
1457        self.path_from_root(Some(&fs.root())).into_path()
1458    }
1459
    /// The path from the root of the namespace to this node.
    ///
    /// Unlike `path`, this ignores any chroot and walks to the namespace root.
    pub fn path_escaping_chroot(&self) -> FsString {
        self.path_from_root(None).into_path()
    }
1464
    /// Returns the path to this node, accounting for a custom root.
    /// A task may have a custom root set by `chroot`.
    ///
    /// With `root == None` the walk always reaches the namespace root and the
    /// result is `Reachable`. With a custom root, the result is `Unreachable`
    /// if the upward walk hits the namespace root without passing through
    /// `root`. Dead entries get a ` (deleted)` suffix in either case.
    pub fn path_from_root(&self, root: Option<&NamespaceNode>) -> PathWithReachability {
        // Anonymous nodes have no namespace position; report the node's internal name.
        if self.mount.is_none() {
            return PathWithReachability::Reachable(self.entry.node.internal_name());
        }

        let mut path = PathBuilder::new();
        let mut current = self.escape_mount();
        if let Some(root) = root {
            let scope = RcuReadScope::new();
            // The current node is expected to intersect with the custom root as we travel up the tree.
            let root = root.escape_mount();
            while current != root {
                if let Some(parent) = current.parent() {
                    path.prepend_element(current.entry.local_name(&scope));
                    current = parent.escape_mount();
                } else {
                    // This node hasn't intersected with the custom root and has reached the namespace root.
                    let mut absolute_path = path.build_absolute();
                    if self.entry.is_dead() {
                        absolute_path.extend_from_slice(b" (deleted)");
                    }

                    return PathWithReachability::Unreachable(absolute_path);
                }
            }
        } else {
            // No custom root, so travel up the tree to the namespace root.
            let scope = RcuReadScope::new();
            while let Some(parent) = current.parent() {
                path.prepend_element(current.entry.local_name(&scope));
                current = parent.escape_mount();
            }
        }

        let mut absolute_path = path.build_absolute();
        if self.entry.is_dead() {
            absolute_path.extend_from_slice(b" (deleted)");
        }

        PathWithReachability::Reachable(absolute_path)
    }
1508
    /// Mounts `what` at this node, creating a submount of the enclosing mount.
    ///
    /// Only the flags stored on a mount (plus `MS_REC`) are honored; all
    /// others are masked off. If this node is already a mountpoint, the new
    /// mount stacks on top of the existing one.
    pub fn mount(&self, what: WhatToMount, flags: MountFlags) -> Result<(), Errno> {
        let flags = flags & (MountFlags::STORED_ON_MOUNT | MountFlags::REC);
        let mountpoint = self.enter_mount();
        let mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
        mount.create_submount(&mountpoint.entry, what, flags);
        Ok(())
    }
1516
    /// If this is the root of a filesystem, unmount. Otherwise return EINVAL.
    ///
    /// Stacked mounts are entered first, so the topmost mount at this
    /// location is the one unmounted.
    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
        let mount = self.enter_mount().mount_if_root()?.clone();
        mount.unmount(flags)
    }
1522
    /// Renames `old_name` in `old_parent` to `new_name` in `new_parent`,
    /// delegating to `DirEntry::rename` with each parent's mount info.
    pub fn rename<L>(
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        old_parent: &NamespaceNode,
        old_name: &FsStr,
        new_parent: &NamespaceNode,
        new_name: &FsStr,
        flags: RenameFlags,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        DirEntry::rename(
            locked,
            current_task,
            &old_parent.entry,
            &old_parent.mount,
            old_name,
            &new_parent.entry,
            &new_parent.mount,
            new_name,
            flags,
        )
    }
1547
    /// Returns a node for `entry` viewed through the same mount as `self`.
    fn with_new_entry(&self, entry: DirEntryHandle) -> NamespaceNode {
        Self { mount: self.mount.clone(), entry }
    }
1551
    /// Key identifying this node's entry in mount-point hash maps (keyed by
    /// DirEntry pointer identity via `ArcKey`).
    fn mount_hash_key(&self) -> &ArcKey<DirEntry> {
        ArcKey::ref_cast(&self.entry)
    }
1555
    /// Returns the set-user-ID/set-group-ID owners for this node, or the empty
    /// default when the mount is flagged `MS_NOSUID` (which disables
    /// suid/sgid semantics for the whole mount).
    pub fn suid_and_sgid(&self, current_task: &CurrentTask) -> Result<UserAndOrGroupId, Errno> {
        if self.mount.flags().contains(MountFlags::NOSUID) {
            Ok(UserAndOrGroupId::default())
        } else {
            self.entry.node.info().suid_and_sgid(current_task, &self.entry.node)
        }
    }
1563
    /// Updates the node's access time to the current UTC time and marks the
    /// update as pending (to be flushed to the backing store later).
    pub fn update_atime(&self) {
        // Do not update the atime of this node if it is mounted with the NOATIME flag.
        if !self.mount.flags().contains(MountFlags::NOATIME) {
            self.entry.node.update_info(|info| {
                let now = utc::utc_now();
                info.time_access = now;
                info.pending_time_access_update = true;
            });
        }
    }
1574
    /// Reads the symlink target of this node, updating its access time first
    /// (subject to the mount's NOATIME flag).
    pub fn readlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<SymlinkTarget, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.update_atime();
        self.entry.node.readlink(locked, current_task)
    }
1586
    /// Delivers an inotify event for this node. Anonymous nodes (no mount)
    /// generate no notifications.
    pub fn notify(&self, event_mask: InotifyMask) {
        if self.mount.is_some() {
            self.entry.notify(event_mask);
        }
    }
1592
    /// Check whether the node can be accessed in the current context with the specified access
    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
    /// owner or is in the file's group.
    ///
    /// Delegates to `FsNode::check_access`, passing this node's mount so
    /// mount-level flags can participate in the decision.
    pub fn check_access<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        permission_flags: impl Into<security::PermissionFlags>,
        reason: CheckAccessReason,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.entry.node.check_access(
            locked,
            current_task,
            &self.mount,
            permission_flags,
            reason,
            self,
        )
    }
1615
    /// Checks whether the current task is allowed to open this node with
    /// `O_NOATIME`.
    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
        self.entry.node.check_o_noatime_allowed(current_task)
    }
1620
    /// Truncates the underlying node to `length` bytes.
    ///
    /// On success, notifies inotify watchers with `IN_MODIFY` (using the
    /// variant that ignores the excl-unlink suppression).
    pub fn truncate<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        length: u64,
    ) -> Result<(), Errno>
    where
        L: LockBefore<BeforeFsNodeAppend>,
    {
        self.entry.node.truncate(locked, current_task, &self.mount, length)?;
        self.entry.notify_ignoring_excl_unlink(InotifyMask::MODIFY);
        Ok(())
    }
1634}
1635
impl fmt::Debug for NamespaceNode {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Include the resolved path (escaping any chroot) alongside the raw
        // fields so log output is readable without chasing pointers.
        f.debug_struct("NamespaceNode")
            .field("path", &self.path_escaping_chroot())
            .field("mount", &self.mount)
            .field("entry", &self.entry)
            .finish()
    }
}
1645
1646// Eq/Hash impls intended for the MOUNT_POINTS hash
1647impl PartialEq for NamespaceNode {
1648    fn eq(&self, other: &Self) -> bool {
1649        self.mount.as_ref().map(Arc::as_ptr).eq(&other.mount.as_ref().map(Arc::as_ptr))
1650            && Arc::ptr_eq(&self.entry, &other.entry)
1651    }
1652}
1653impl Eq for NamespaceNode {}
impl Hash for NamespaceNode {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // Hash the same pointer identities that `PartialEq` compares, so the
        // Eq/Hash contract (a == b implies hash(a) == hash(b)) holds.
        self.mount.as_ref().map(Arc::as_ptr).hash(state);
        Arc::as_ptr(&self.entry).hash(state);
    }
}
1660
/// A namespace node that keeps the underlying mount busy.
#[derive(Debug, Clone)]
pub struct ActiveNamespaceNode {
    /// The underlying namespace node.
    name: NamespaceNode,

    /// Adds a reference to the mount client marker to prevent the mount from
    /// being removed while the NamespaceNode is active. Is None iff mount is
    /// None.
    _marker: Option<MountClientMarker>,
}
1672
1673impl ActiveNamespaceNode {
1674    pub fn new(name: NamespaceNode) -> Self {
1675        let marker = name.mount.as_ref().map(|mount| mount.active_client_counter.clone());
1676        Self { name, _marker: marker }
1677    }
1678
1679    pub fn to_passive(&self) -> NamespaceNode {
1680        self.deref().clone()
1681    }
1682
1683    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
1684        if let Some(mode) = mode {
1685            self.entry.node.write_guard_state.lock().acquire(mode)?;
1686        }
1687        Ok(Arc::new(FileMapping { name: self, mode }))
1688    }
1689}
1690
// An `ActiveNamespaceNode` transparently exposes the wrapped `NamespaceNode`.
impl Deref for ActiveNamespaceNode {
    type Target = NamespaceNode;

    fn deref(&self) -> &Self::Target {
        &self.name
    }
}
1698
1699impl PartialEq for ActiveNamespaceNode {
1700    fn eq(&self, other: &Self) -> bool {
1701        self.deref().eq(other.deref())
1702    }
1703}
1704impl Eq for ActiveNamespaceNode {}
1705impl Hash for ActiveNamespaceNode {
1706    fn hash<H: Hasher>(&self, state: &mut H) {
1707        self.deref().hash(state)
1708    }
1709}
1710
/// A file mapped into memory. Keeps the backing `NamespaceNode` active (so
/// its mount stays busy) and, optionally, holds a write guard on the node.
///
/// NOTE(review): `Clone` copies `mode` without re-acquiring the write guard,
/// so a directly cloned `FileMapping` would release a guard it never
/// acquired on drop. In practice mappings are shared via the `Arc` returned
/// by `into_mapping` — confirm `Clone` is never invoked on the inner value.
#[derive(Debug, Clone, PartialEq, Eq)]
#[must_use]
pub struct FileMapping {
    // The mapped node, kept active for the lifetime of the mapping.
    pub name: ActiveNamespaceNode,
    // Write-guard mode acquired in `ActiveNamespaceNode::into_mapping`;
    // released in `Drop`. `None` when no guard was requested.
    mode: Option<FileWriteGuardMode>,
}
1717
impl Drop for FileMapping {
    fn drop(&mut self) {
        // Release the write guard acquired in
        // `ActiveNamespaceNode::into_mapping`, if one was taken.
        if let Some(mode) = self.mode {
            self.name.entry.node.write_guard_state.lock().release(mode);
        }
    }
}
1725
/// Tracks all mounts, keyed by mount point.
///
/// The map is keyed weakly by the mount point's `DirEntry`; each entry holds
/// the list of mounts currently stacked on that directory entry.
pub struct Mounts {
    mounts: RcuHashMap<WeakKey<DirEntry>, Vec<ArcKey<Mount>>>,
}
1730
impl Mounts {
    /// Creates an empty mount registry.
    pub fn new() -> Self {
        Mounts { mounts: RcuHashMap::default() }
    }

    /// Registers the mount in the global mounts map.
    ///
    /// Returns a `Submount` RAII guard; dropping it unregisters the mount
    /// again (see `Submount::drop`). The mount point's `DirEntry` is flagged
    /// via `set_has_mounts(true)` when its first mount is registered.
    fn register_mount(&self, dir_entry: &Arc<DirEntry>, mount: MountHandle) -> Submount {
        let mut mounts = self.mounts.lock();
        let key = WeakKey::from(dir_entry);
        // `get` yields an owned copy of the current list (RCU-style map);
        // we mutate the copy and write it back with `insert` below.
        let mut vec = mounts.get(&key).unwrap_or_else(|| {
            // First mount on this entry: mark the DirEntry as a mount point.
            dir_entry.set_has_mounts(true);
            Vec::new()
        });
        vec.push(ArcKey(mount.clone()));
        mounts.insert(key, vec);
        Submount { dir: ArcKey(dir_entry.clone()), mount }
    }

    /// Unregisters the mount.  This is called by `Submount::drop`.
    fn unregister_mount(&self, dir_entry: &Arc<DirEntry>, mount: &MountHandle) {
        let mut mounts = self.mounts.lock();
        let key = WeakKey::from(dir_entry);
        if let Some(mut vec) = mounts.get(&key) {
            // Invariant: every live Submount was registered, so the mount
            // must be present in the list.
            let index = vec.iter().position(|e| e == ArcKey::ref_cast(mount)).unwrap();
            if vec.len() == 1 {
                // Last mount on this entry: drop the map entry entirely and
                // clear the mount-point flag on the DirEntry.
                mounts.remove(&key);
                dir_entry.set_has_mounts(false);
            } else {
                // `swap_remove` reorders the remaining list; the code relies
                // on the order of stacked mounts in this list not mattering.
                vec.swap_remove(index);
                mounts.insert(key, vec);
            }
        }
    }

    /// Unmounts all mounts associated with `dir_entry`.  This is called when `dir_entry` is
    /// unlinked (which would normally result in EBUSY, but not if it isn't mounted in the local
    /// namespace).
    pub fn unmount(&self, dir_entry: &DirEntry) {
        // Lookup is by raw pointer identity (PtrKey) since only a `&DirEntry`
        // is available here, not an `Arc`.
        let mounts = self.mounts.lock().remove(&PtrKey::from(dir_entry as *const _));
        if let Some(mounts) = mounts {
            for mount in mounts {
                // Ignore errors.
                let _ = mount.unmount(UnmountFlags::DETACH);
            }
        }
    }

    /// Drain mounts. For each drained mount, force a FileSystem unmount.
    // TODO(https://fxbug.dev/295073633): Graceful shutdown should try to first unmount the mounts
    // and only force a FileSystem unmount on failure.
    pub fn clear(&self) {
        for (_dir_entry, mounts) in self.mounts.lock().drain() {
            for mount in mounts {
                mount.fs.force_unmount_ops();
            }
        }
    }

    /// Syncs every distinct filesystem that currently backs a mount.
    ///
    /// Filesystems are first collected (deduplicated by pointer identity)
    /// under an RCU read scope, then synced outside the scope so potentially
    /// slow sync operations don't extend the read-side critical section.
    /// Individual sync failures are logged and do not abort the rest.
    pub fn sync_all(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
    ) -> Result<(), Errno> {
        let mut filesystems = Vec::new();
        {
            let scope = RcuReadScope::new();
            let mut seen = HashSet::new();
            for (_dir_entry, m_list) in self.mounts.iter(&scope) {
                for m in m_list {
                    // Multiple mounts may share one FileSystem (bind mounts);
                    // only sync each FileSystem once.
                    if seen.insert(Arc::as_ptr(&m.fs)) {
                        filesystems.push(m.fs.clone());
                    }
                }
            }
        }

        for fs in filesystems {
            if let Err(e) = fs.sync(locked, current_task) {
                log_warn!("sync failed for filesystem {:?}: {:?}", fs.name(), e);
            }
        }
        Ok(())
    }
}
1815
/// A RAII object that unregisters a mount when dropped.
#[derive(Debug)]
struct Submount {
    // The mount point's directory entry (the key in the mounts registry).
    dir: ArcKey<DirEntry>,
    // The mount stacked on `dir`.
    mount: MountHandle,
}
1822
impl Drop for Submount {
    fn drop(&mut self) {
        // NOTE(review): `upgrade().unwrap()` assumes the Kernel always
        // outlives every Submount — confirm shutdown ordering guarantees
        // this, otherwise dropping a Submount during teardown would panic.
        self.mount.fs.kernel.upgrade().unwrap().mounts.unregister_mount(&self.dir, &self.mount)
    }
}
1828
1829/// Submount is stored in a mount's submounts hash set, which is keyed by the mountpoint.
1830impl Eq for Submount {}
1831impl PartialEq<Self> for Submount {
1832    fn eq(&self, other: &Self) -> bool {
1833        self.dir == other.dir
1834    }
1835}
1836impl Hash for Submount {
1837    fn hash<H: Hasher>(&self, state: &mut H) {
1838        self.dir.hash(state)
1839    }
1840}
1841
// Allows looking up a `Submount` in a hash set by its mount-point key alone,
// without constructing a full `Submount`.
impl Borrow<ArcKey<DirEntry>> for Submount {
    fn borrow(&self) -> &ArcKey<DirEntry> {
        &self.dir
    }
}
1847
// Unit tests for Namespace/NamespaceNode: mounting, lookup across mount
// points, path construction, mount shadowing, unlink/rename interactions
// with mounts, and symlink-chain traversal.
#[cfg(test)]
mod test {
    use crate::fs::tmpfs::TmpFs;
    use crate::testing::spawn_kernel_and_run;
    use crate::vfs::namespace::DeviceType;
    use crate::vfs::{
        CallbackSymlinkNode, FsNodeInfo, LookupContext, MountInfo, Namespace, NamespaceNode,
        RenameFlags, SymlinkMode, SymlinkTarget, UnlinkKind, WhatToMount,
    };
    use starnix_uapi::mount_flags::MountFlags;
    use starnix_uapi::{errno, mode};
    use std::sync::Arc;

    // Mounting a filesystem at /dev makes its contents (/dev/pts) resolvable,
    // and parent() walks back across the mount boundary to the mount point.
    #[::fuchsia::test]
    async fn test_namespace() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountFlags::empty())
                .expect("failed to mount dev root node");

            // Re-resolve /dev after mounting so the node refers to the mounted fs.
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            let pts_parent =
                pts.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of pts");
            assert!(Arc::ptr_eq(&pts_parent.entry, &dev.entry));

            // The parent of the mounted /dev is the root of the namespace.
            let dev_parent =
                dev.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of dev");
            assert!(Arc::ptr_eq(&dev_parent.entry, &ns.root().entry));
        })
        .await;
    }

    // A NamespaceNode resolved before a mount keeps pointing at the covered
    // entry; it does not silently "upgrade" to the newly-mounted filesystem.
    #[::fuchsia::test]
    async fn test_mount_does_not_upgrade() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountFlags::empty())
                .expect("failed to mount dev root node");
            let mut context = LookupContext::default();
            let new_dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev again");
            // The pre-mount and post-mount nodes are distinct.
            assert!(!Arc::ptr_eq(&dev.entry, &new_dev.entry));
            assert_ne!(&dev, &new_dev);

            // "pts" exists under the mounted fs but not under the covered entry.
            let mut context = LookupContext::default();
            let _new_pts = new_dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            let mut context = LookupContext::default();
            assert!(dev.lookup_child(locked, &current_task, &mut context, "pts".into()).is_err());
        })
        .await;
    }

    // Paths are reconstructed correctly across mount points.
    #[::fuchsia::test]
    async fn test_path() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountFlags::empty())
                .expect("failed to mount dev root node");

            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");

            assert_eq!("/", ns.root().path_escaping_chroot());
            assert_eq!("/dev", dev.path_escaping_chroot());
            assert_eq!("/dev/pts", pts.path_escaping_chroot());
        })
        .await;
    }

    // Stacked mounts shadow each other per-namespace: a later mount shadows
    // an earlier one, and a cloned namespace keeps seeing the mounts that
    // existed at clone time.
    #[::fuchsia::test]
    async fn test_shadowing() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs1 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs1.clone()), MountFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // Clone the namespace before stacking the second mount.
            let ns_clone = ns.clone_namespace();

            let foofs2 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs2.clone()), MountFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            // In the original namespace the second mount shadows the first.
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs2.root()
            ));

            // The clone still resolves to the first mount.
            assert!(Arc::ptr_eq(
                &ns_clone
                    .root()
                    .lookup_child(
                        locked,
                        &current_task,
                        &mut LookupContext::default(),
                        "foo".into()
                    )
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
        })
        .await;
    }

    // Unlinking a directory that is a mount point fails with EBUSY in the
    // namespace where it is mounted, but succeeds from another namespace.
    #[::fuchsia::test]
    async fn test_unlink_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountFlags::empty()).unwrap();

            // Trying to unlink from ns1 should fail.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(EBUSY),
            );

            // But unlinking from ns2 should succeed.
            ns2.root()
                .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                .expect("unlink failed");

            // And it should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(ENOENT),
            );
        })
        .await;
    }

    // Renaming into or out of a mount-point directory fails with EBUSY in the
    // namespace where it is mounted, but succeeds from another namespace.
    #[::fuchsia::test]
    async fn test_rename_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let _bar_node = root_fs.root().create_dir(locked, &current_task, "bar".into()).unwrap();
            let _baz_node = root_fs.root().create_dir(locked, &current_task, "baz".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountFlags::empty()).unwrap();

            // Trying to rename over foo from ns1 should fail.
            let root = ns1.root();
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "bar".into(),
                    &root,
                    "foo".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );
            // Likewise the other way.
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "foo".into(),
                    &root,
                    "bar".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );

            // But renaming from ns2 should succeed.
            let root = ns2.root();

            // First rename the directory with the mount.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "foo".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // Renaming over a directory with a mount should also work.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "baz".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // "foo" and "baz" should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "baz".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
        })
        .await;
    }

    /// Symlinks which need to be traversed across types (nodes and paths), as well as across
    /// owning directories, can be tricky to get right.
    #[::fuchsia::test]
    async fn test_lookup_with_symlink_chain() {
        spawn_kernel_and_run(async |locked, current_task| {
            // Set up the root filesystem
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _first_subdir_node = root_node
                .create_dir(locked, &current_task, "first_subdir".into())
                .expect("failed to mkdir dev");
            let _second_subdir_node = root_node
                .create_dir(locked, &current_task, "second_subdir".into())
                .expect("failed to mkdir dev");

            // Set up two subdirectories under the root filesystem
            let first_subdir_fs = TmpFs::new_fs(locked, &kernel);
            let second_subdir_fs = TmpFs::new_fs(locked, &kernel);

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let first_subdir = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "first_subdir".into())
                .expect("failed to lookup first_subdir");
            first_subdir
                .mount(WhatToMount::Fs(first_subdir_fs), MountFlags::empty())
                .expect("failed to mount first_subdir fs node");
            let second_subdir = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "second_subdir".into())
                .expect("failed to lookup second_subdir");
            second_subdir
                .mount(WhatToMount::Fs(second_subdir_fs), MountFlags::empty())
                .expect("failed to mount second_subdir fs node");

            // Create the symlink structure. To trigger potential symlink traversal bugs, we're going
            // for the following directory structure:
            // / (root)
            //     + first_subdir/
            //         - real_file
            //         - path_symlink (-> real_file)
            //     + second_subdir/
            //         - node_symlink (-> path_symlink)
            let real_file_node = first_subdir
                .create_node(
                    locked,
                    &current_task,
                    "real_file".into(),
                    mode!(IFREG, 0o777),
                    DeviceType::NONE,
                )
                .expect("failed to create real_file");
            first_subdir
                .create_symlink(locked, &current_task, "path_symlink".into(), "real_file".into())
                .expect("failed to create path_symlink");

            let mut no_follow_lookup_context = LookupContext::new(SymlinkMode::NoFollow);
            let path_symlink_node = first_subdir
                .lookup_child(
                    locked,
                    &current_task,
                    &mut no_follow_lookup_context,
                    "path_symlink".into(),
                )
                .expect("Failed to lookup path_symlink");

            // The second symlink needs to be of type SymlinkTarget::Node in order to trip the sensitive
            // code path. There's no easy method for creating this type of symlink target, so we'll need
            // to construct a node from scratch and insert it into the directory manually.
            let node_symlink_node = second_subdir.entry.node.fs().create_node_and_allocate_node_id(
                CallbackSymlinkNode::new(move || {
                    let node = path_symlink_node.clone();
                    Ok(SymlinkTarget::Node(node))
                }),
                FsNodeInfo::new(mode!(IFLNK, 0o777), current_task.current_fscred()),
            );
            second_subdir
                .entry
                .create_entry(
                    locked,
                    &current_task,
                    &MountInfo::detached(),
                    "node_symlink".into(),
                    move |_locked, _dir, _mount, _name| Ok(node_symlink_node),
                )
                .expect("failed to create node_symlink entry");

            // Finally, exercise the lookup under test.
            let mut follow_lookup_context = LookupContext::new(SymlinkMode::Follow);
            let node_symlink_resolution = second_subdir
                .lookup_child(
                    locked,
                    &current_task,
                    &mut follow_lookup_context,
                    "node_symlink".into(),
                )
                .expect("lookup with symlink chain failed");

            // The lookup resolution should have correctly followed the symlinks to the real_file node.
            assert!(node_symlink_resolution.entry.node.ino == real_file_node.entry.node.ino);
        })
        .await;
    }
}