// starnix_core/vfs/namespace.rs
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use crate::mutable_state::{state_accessor, state_implementation};
use crate::security;
use crate::task::{CurrentTask, EventHandler, Kernel, Task, WaitCanceler, Waiter};
use crate::time::utc;
use crate::vfs::buffers::InputBuffer;
use crate::vfs::fs_registry::FsRegistry;
use crate::vfs::pseudo::dynamic_file::{DynamicFile, DynamicFileBuf, DynamicFileSource};
use crate::vfs::pseudo::simple_file::SimpleFileNode;
use crate::vfs::socket::{SocketAddress, SocketHandle, UnixSocket};
use crate::vfs::{
    CheckAccessReason, DirEntry, DirEntryHandle, FileHandle, FileObject, FileOps, FileSystemHandle,
    FileSystemOptions, FileWriteGuardMode, FsNode, FsNodeHandle, FsNodeOps, FsStr, FsString,
    PathBuilder, RenameFlags, SymlinkTarget, UnlinkKind, fileops_impl_dataless,
    fileops_impl_delegate_read_and_seek, fileops_impl_nonseekable, fileops_impl_noop_sync,
    fs_node_impl_not_dir,
};
use fuchsia_rcu::RcuReadScope;
use macro_rules_attribute::apply;
use ref_cast::RefCast;
use starnix_logging::log_warn;
use starnix_rcu::RcuHashMap;
use starnix_sync::{
    BeforeFsNodeAppend, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, Mutex, RwLock, Unlocked,
};
use starnix_types::ownership::WeakRef;
use starnix_uapi::arc_key::{ArcKey, PtrKey, WeakKey};
use starnix_uapi::auth::UserAndOrGroupId;
use starnix_uapi::device_type::DeviceType;
use starnix_uapi::errors::Errno;
use starnix_uapi::file_mode::{AccessCheck, FileMode};
use starnix_uapi::inotify_mask::InotifyMask;
use starnix_uapi::mount_flags::MountFlags;
use starnix_uapi::open_flags::OpenFlags;
use starnix_uapi::unmount_flags::UnmountFlags;
use starnix_uapi::vfs::{FdEvents, ResolveFlags};
use starnix_uapi::{NAME_MAX, errno, error};
use std::borrow::Borrow;
use std::collections::HashSet;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::ops::{Deref, DerefMut};
use std::sync::{Arc, Weak};
47
48/// A mount namespace.
49///
50/// The namespace records at which entries filesystems are mounted.
51#[derive(Debug)]
52pub struct Namespace {
53    root_mount: MountHandle,
54
55    // Unique ID of this namespace.
56    pub id: u64,
57}
58
59impl Namespace {
60    pub fn new(fs: FileSystemHandle) -> Arc<Namespace> {
61        Self::new_with_flags(fs, MountFlags::empty())
62    }
63
64    pub fn new_with_flags(fs: FileSystemHandle, flags: MountFlags) -> Arc<Namespace> {
65        let kernel = fs.kernel.upgrade().expect("can't create namespace without a kernel");
66        let root_mount = Mount::new(WhatToMount::Fs(fs), flags);
67        Arc::new(Self { root_mount, id: kernel.get_next_namespace_id() })
68    }
69
70    pub fn root(&self) -> NamespaceNode {
71        self.root_mount.root()
72    }
73
74    pub fn clone_namespace(&self) -> Arc<Namespace> {
75        let kernel =
76            self.root_mount.fs.kernel.upgrade().expect("can't clone namespace without a kernel");
77        Arc::new(Self {
78            root_mount: self.root_mount.clone_mount_recursive(),
79            id: kernel.get_next_namespace_id(),
80        })
81    }
82
83    /// Assuming new_ns is a clone of the namespace that node is from, return the equivalent of
84    /// node in new_ns. If this assumption is violated, returns None.
85    pub fn translate_node(mut node: NamespaceNode, new_ns: &Namespace) -> Option<NamespaceNode> {
86        // Collect the list of mountpoints that leads to this node's mount
87        let mut mountpoints = vec![];
88        let mut mount = node.mount;
89        while let Some(mountpoint) = mount.as_ref().and_then(|m| m.read().mountpoint()) {
90            mountpoints.push(mountpoint.entry);
91            mount = mountpoint.mount;
92        }
93
94        // Follow the same path in the new namespace
95        let mut mount = Arc::clone(&new_ns.root_mount);
96        for mountpoint in mountpoints.iter().rev() {
97            let next_mount =
98                mount.read().submounts.get(ArcKey::ref_cast(mountpoint))?.mount.clone();
99            mount = next_mount;
100        }
101        node.mount = Some(mount).into();
102        Some(node)
103    }
104}
105
106impl FsNodeOps for Arc<Namespace> {
107    fs_node_impl_not_dir!();
108
109    fn create_file_ops(
110        &self,
111        _locked: &mut Locked<FileOpsCore>,
112        _node: &FsNode,
113        _current_task: &CurrentTask,
114        _flags: OpenFlags,
115    ) -> Result<Box<dyn FileOps>, Errno> {
116        Ok(Box::new(MountNamespaceFile(self.clone())))
117    }
118}
119
120pub struct MountNamespaceFile(pub Arc<Namespace>);
121
122impl FileOps for MountNamespaceFile {
123    fileops_impl_nonseekable!();
124    fileops_impl_dataless!();
125    fileops_impl_noop_sync!();
126}
127
128/// An empty struct that we use to track the number of active clients for a mount.
129///
130/// Each active client takes a reference to this object. The unmount operation fails
131/// if there are any active clients of the mount.
132type MountClientMarker = Arc<()>;
133
134/// An instance of a filesystem mounted in a namespace.
135///
136/// At a mount, path traversal switches from one filesystem to another.
137/// The client sees a composed directory structure that glues together the
138/// directories from the underlying FsNodes from those filesystems.
139///
140/// The mounts in a namespace form a mount tree, with `mountpoint` pointing to the parent and
141/// `submounts` pointing to the children.
142pub struct Mount {
143    root: DirEntryHandle,
144    flags: Mutex<MountFlags>,
145    fs: FileSystemHandle,
146
147    /// A unique identifier for this mount reported in /proc/pid/mountinfo.
148    id: u64,
149
150    /// A count of the number of active clients.
151    active_client_counter: MountClientMarker,
152
153    // Lock ordering: mount -> submount
154    state: RwLock<MountState>,
155    // Mount used to contain a Weak<Namespace>. It no longer does because since the mount point
156    // hash was moved from Namespace to Mount, nothing actually uses it. Now that
157    // Namespace::clone_namespace() is implemented in terms of Mount::clone_mount_recursive, it
158    // won't be trivial to add it back. I recommend turning the mountpoint field into an enum of
159    // Mountpoint or Namespace, maybe called "parent", and then traverse up to the top of the tree
160    // if you need to find a Mount's Namespace.
161}
162type MountHandle = Arc<Mount>;
163
164/// Public representation of the mount options.
165#[derive(Clone, Debug)]
166pub struct MountInfo {
167    handle: Option<MountHandle>,
168}
169
170impl MountInfo {
171    /// `MountInfo` for a element that is not tied to a given mount. Mount flags will be considered
172    /// empty.
173    pub fn detached() -> Self {
174        None.into()
175    }
176
177    /// The mount flags of the represented mount.
178    pub fn flags(&self) -> MountFlags {
179        if let Some(handle) = &self.handle {
180            handle.flags()
181        } else {
182            // Consider not mounted node have the NOATIME flags.
183            MountFlags::NOATIME
184        }
185    }
186
187    /// Checks whether this `MountInfo` represents a writable file system mount.
188    pub fn check_readonly_filesystem(&self) -> Result<(), Errno> {
189        if self.flags().contains(MountFlags::RDONLY) {
190            return error!(EROFS);
191        }
192        Ok(())
193    }
194
195    /// Checks whether this `MountInfo` represents an executable file system mount.
196    pub fn check_noexec_filesystem(&self) -> Result<(), Errno> {
197        if self.flags().contains(MountFlags::NOEXEC) {
198            return error!(EACCES);
199        }
200        Ok(())
201    }
202}
203
204impl Deref for MountInfo {
205    type Target = Option<MountHandle>;
206
207    fn deref(&self) -> &Self::Target {
208        &self.handle
209    }
210}
211
212impl DerefMut for MountInfo {
213    fn deref_mut(&mut self) -> &mut Self::Target {
214        &mut self.handle
215    }
216}
217
218impl std::cmp::PartialEq for MountInfo {
219    fn eq(&self, other: &Self) -> bool {
220        self.handle.as_ref().map(Arc::as_ptr) == other.handle.as_ref().map(Arc::as_ptr)
221    }
222}
223
224impl std::cmp::Eq for MountInfo {}
225
226impl Into<MountInfo> for Option<MountHandle> {
227    fn into(self) -> MountInfo {
228        MountInfo { handle: self }
229    }
230}
231
232#[derive(Default)]
233pub struct MountState {
234    /// The namespace node that this mount is mounted on. This is a tuple instead of a
235    /// NamespaceNode because the Mount pointer has to be weak because this is the pointer to the
236    /// parent mount, the parent has a pointer to the children too, and making both strong would be
237    /// a cycle.
238    mountpoint: Option<(Weak<Mount>, DirEntryHandle)>,
239
240    // The set is keyed by the mountpoints which are always descendants of this mount's root.
241    // Conceptually, the set is more akin to a map: `DirEntry -> MountHandle`, but we use a set
242    // instead because `Submount` has a drop implementation that needs both the key and value.
243    //
244    // Each directory entry can only have one mount attached. Mount shadowing works by using the
245    // root of the inner mount as a mountpoint. For example, if filesystem A is mounted at /foo,
246    // mounting filesystem B on /foo will create the mount as a child of the A mount, attached to
247    // A's root, instead of the root mount.
248    submounts: HashSet<Submount>,
249
250    /// The membership of this mount in its peer group. Do not access directly. Instead use
251    /// peer_group(), take_from_peer_group(), and set_peer_group().
252    // TODO(tbodt): Refactor the links into, some kind of extra struct or something? This is hard
253    // because setting this field requires the Arc<Mount>.
254    peer_group_: Option<(Arc<PeerGroup>, PtrKey<Mount>)>,
255    /// The membership of this mount in a PeerGroup's downstream. Do not access directly. Instead
256    /// use upstream(), take_from_upstream(), and set_upstream().
257    upstream_: Option<(Weak<PeerGroup>, PtrKey<Mount>)>,
258}
259
260/// A group of mounts. Setting MS_SHARED on a mount puts it in its own peer group. Any bind mounts
261/// of a mount in the group are also added to the group. A mount created in any mount in a peer
262/// group will be automatically propagated (recreated) in every other mount in the group.
263#[derive(Default)]
264struct PeerGroup {
265    id: u64,
266    state: RwLock<PeerGroupState>,
267}
268#[derive(Default)]
269struct PeerGroupState {
270    mounts: HashSet<WeakKey<Mount>>,
271    downstream: HashSet<WeakKey<Mount>>,
272}
273
274pub enum WhatToMount {
275    Fs(FileSystemHandle),
276    Bind(NamespaceNode),
277}
278
279impl Mount {
280    pub fn new(what: WhatToMount, flags: MountFlags) -> MountHandle {
281        match what {
282            WhatToMount::Fs(fs) => Self::new_with_root(fs.root().clone(), flags),
283            WhatToMount::Bind(node) => {
284                let mount = node.mount.as_ref().expect("can't bind mount from an anonymous node");
285                mount.clone_mount(&node.entry, flags)
286            }
287        }
288    }
289
290    fn new_with_root(root: DirEntryHandle, flags: MountFlags) -> MountHandle {
291        let known_flags = MountFlags::STORED_ON_MOUNT;
292        assert!(
293            !flags.intersects(!known_flags),
294            "mount created with extra flags {:?}",
295            flags - known_flags
296        );
297        let fs = root.node.fs();
298        let kernel = fs.kernel.upgrade().expect("can't create mount without kernel");
299        Arc::new(Self {
300            id: kernel.get_next_mount_id(),
301            flags: Mutex::new(flags),
302            root,
303            active_client_counter: Default::default(),
304            fs,
305            state: Default::default(),
306        })
307    }
308
309    /// A namespace node referring to the root of the mount.
310    pub fn root(self: &MountHandle) -> NamespaceNode {
311        NamespaceNode::new(Arc::clone(self), Arc::clone(&self.root))
312    }
313
314    /// Create the specified mount as a child. Also propagate it to the mount's peer group.
315    fn create_submount(
316        self: &MountHandle,
317        dir: &DirEntryHandle,
318        what: WhatToMount,
319        flags: MountFlags,
320    ) {
321        // TODO(tbodt): Making a copy here is necessary for lock ordering, because the peer group
322        // lock nests inside all mount locks (it would be impractical to reverse this because you
323        // need to lock a mount to get its peer group.) But it opens the door to race conditions
324        // where if a peer are concurrently being added, the mount might not get propagated to the
325        // new peer. The only true solution to this is bigger locks, somehow using the same lock
326        // for the peer group and all of the mounts in the group. Since peer groups are fluid and
327        // can have mounts constantly joining and leaving and then joining other groups, the only
328        // sensible locking option is to use a single global lock for all mounts and peer groups.
329        // This is almost impossible to express in rust. Help.
330        //
331        // Update: Also necessary to make a copy to prevent excess replication, see the comment on
332        // the following Mount::new call.
333        let peers = {
334            let state = self.state.read();
335            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
336        };
337
338        // Create the mount after copying the peer groups, because in the case of creating a bind
339        // mount inside itself, the new mount would get added to our peer group during the
340        // Mount::new call, but we don't want to replicate into it already. For an example see
341        // MountTest.QuizBRecursion.
342        let mount = Mount::new(what, flags);
343
344        if self.read().is_shared() {
345            mount.write().make_shared();
346        }
347
348        for peer in peers {
349            if Arc::ptr_eq(self, &peer) {
350                continue;
351            }
352            let clone = mount.clone_mount_recursive();
353            peer.write().add_submount_internal(dir, clone);
354        }
355
356        self.write().add_submount_internal(dir, mount)
357    }
358
359    fn remove_submount(self: &MountHandle, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
360        // create_submount explains why we need to make a copy of peers.
361        let peers = {
362            let state = self.state.read();
363            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
364        };
365
366        for peer in peers {
367            if Arc::ptr_eq(self, &peer) {
368                continue;
369            }
370            // mount_namespaces(7): If B is shared, then all most-recently-mounted mounts at b on
371            // mounts that receive propagation from mount B and do not have submounts under them are
372            // unmounted.
373            let mut peer = peer.write();
374            if let Some(submount) = peer.submounts.get(mount_hash_key) {
375                if !submount.mount.read().submounts.is_empty() {
376                    continue;
377                }
378            }
379            let _ = peer.remove_submount_internal(mount_hash_key);
380        }
381
382        self.write().remove_submount_internal(mount_hash_key)
383    }
384
385    /// Create a new mount with the same filesystem, flags, and peer group. Used to implement bind
386    /// mounts.
387    fn clone_mount(
388        self: &MountHandle,
389        new_root: &DirEntryHandle,
390        flags: MountFlags,
391    ) -> MountHandle {
392        assert!(new_root.is_descendant_of(&self.root));
393        // According to mount(2) on bind mounts, all flags other than MS_REC are ignored when doing
394        // a bind mount.
395        let clone = Self::new_with_root(Arc::clone(new_root), self.flags());
396
397        if flags.contains(MountFlags::REC) {
398            // This is two steps because the alternative (locking clone.state while iterating over
399            // self.state.submounts) trips tracing_mutex. The lock ordering is parent -> child, and
400            // if the clone is eventually made a child of self, this looks like an ordering
401            // violation. I'm not convinced it's a real issue, but I can't convince myself it's not
402            // either.
403            let mut submounts = vec![];
404            for Submount { dir, mount } in &self.state.read().submounts {
405                submounts.push((dir.clone(), mount.clone_mount_recursive()));
406            }
407            let mut clone_state = clone.write();
408            for (dir, submount) in submounts {
409                clone_state.add_submount_internal(&dir, submount);
410            }
411        }
412
413        // Put the clone in the same peer group
414        let peer_group = self.state.read().peer_group().map(Arc::clone);
415        if let Some(peer_group) = peer_group {
416            clone.write().set_peer_group(peer_group);
417        }
418
419        clone
420    }
421
422    /// Do a clone of the full mount hierarchy below this mount. Used for creating mount
423    /// namespaces and creating copies to use for propagation.
424    fn clone_mount_recursive(self: &MountHandle) -> MountHandle {
425        self.clone_mount(&self.root, MountFlags::REC)
426    }
427
428    pub fn change_propagation(self: &MountHandle, flag: MountFlags, recursive: bool) {
429        let mut state = self.write();
430        match flag {
431            MountFlags::SHARED => state.make_shared(),
432            MountFlags::PRIVATE => state.make_private(),
433            MountFlags::DOWNSTREAM => state.make_downstream(),
434            _ => {
435                log_warn!("mount propagation {:?}", flag);
436                return;
437            }
438        }
439
440        if recursive {
441            for submount in &state.submounts {
442                submount.mount.change_propagation(flag, recursive);
443            }
444        }
445    }
446
447    fn flags(&self) -> MountFlags {
448        *self.flags.lock()
449    }
450
451    pub fn update_flags(self: &MountHandle, mut flags: MountFlags) {
452        flags &= MountFlags::STORED_ON_MOUNT;
453        let atime_flags = MountFlags::NOATIME
454            | MountFlags::NODIRATIME
455            | MountFlags::RELATIME
456            | MountFlags::STRICTATIME;
457        let mut stored_flags = self.flags.lock();
458        if !flags.intersects(atime_flags) {
459            // Since Linux 3.17, if none of MS_NOATIME, MS_NODIRATIME,
460            // MS_RELATIME, or MS_STRICTATIME is specified in mountflags, then
461            // the remount operation preserves the existing values of these
462            // flags (rather than defaulting to MS_RELATIME).
463            flags |= *stored_flags & atime_flags;
464        }
465        // The "effect [of MS_STRICTATIME] is to clear the MS_NOATIME and MS_RELATIME flags."
466        flags &= !MountFlags::STRICTATIME;
467        *stored_flags = flags;
468    }
469
470    /// The number of active clients of this mount.
471    ///
472    /// The mount cannot be unmounted if there are any active clients.
473    fn active_clients(&self) -> usize {
474        // We need to subtract one for our own reference. We are not a real client.
475        Arc::strong_count(&self.active_client_counter) - 1
476    }
477
478    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
479        if !flags.contains(UnmountFlags::DETACH) {
480            if self.active_clients() > 0 || !self.state.read().submounts.is_empty() {
481                return error!(EBUSY);
482            }
483        }
484        let mountpoint = self.state.read().mountpoint().ok_or_else(|| errno!(EINVAL))?;
485        let parent_mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
486        parent_mount.remove_submount(mountpoint.mount_hash_key())
487    }
488
489    /// Returns the security state of the fs.
490    pub fn security_state(&self) -> &security::FileSystemState {
491        &self.fs.security_state
492    }
493
494    /// Returns the name of the fs.
495    pub fn fs_name(&self) -> &'static FsStr {
496        self.fs.name()
497    }
498
499    state_accessor!(Mount, state, Arc<Mount>);
500}
501
502impl MountState {
503    /// Returns true if there is a submount on top of `dir_entry`.
504    pub fn has_submount(&self, dir_entry: &DirEntryHandle) -> bool {
505        self.submounts.contains(ArcKey::ref_cast(dir_entry))
506    }
507
508    /// The NamespaceNode on which this Mount is mounted.
509    fn mountpoint(&self) -> Option<NamespaceNode> {
510        let (mount, entry) = self.mountpoint.as_ref()?;
511        Some(NamespaceNode::new(mount.upgrade()?, entry.clone()))
512    }
513
514    /// Return this mount's current peer group.
515    fn peer_group(&self) -> Option<&Arc<PeerGroup>> {
516        let (group, _) = self.peer_group_.as_ref()?;
517        Some(group)
518    }
519
520    /// Remove this mount from its peer group and return the peer group.
521    fn take_from_peer_group(&mut self) -> Option<Arc<PeerGroup>> {
522        let (old_group, old_mount) = self.peer_group_.take()?;
523        old_group.remove(old_mount);
524        if let Some(upstream) = self.take_from_upstream() {
525            let next_mount =
526                old_group.state.read().mounts.iter().next().map(|w| w.0.upgrade().unwrap());
527            if let Some(next_mount) = next_mount {
528                // TODO(https://fxbug.dev/42065259): Fix the lock ordering here. We've locked next_mount
529                // while self is locked, and since the propagation tree and mount tree are
530                // separate, this could violate the mount -> submount order previously established.
531                next_mount.write().set_upstream(upstream);
532            }
533        }
534        Some(old_group)
535    }
536
537    fn upstream(&self) -> Option<Arc<PeerGroup>> {
538        self.upstream_.as_ref().and_then(|g| g.0.upgrade())
539    }
540
541    fn take_from_upstream(&mut self) -> Option<Arc<PeerGroup>> {
542        let (old_upstream, old_mount) = self.upstream_.take()?;
543        // TODO(tbodt): Reason about whether the upgrade() could possibly return None, and what we
544        // should actually do in that case.
545        let old_upstream = old_upstream.upgrade()?;
546        old_upstream.remove_downstream(old_mount);
547        Some(old_upstream)
548    }
549}
550
551#[apply(state_implementation!)]
552impl MountState<Base = Mount, BaseType = Arc<Mount>> {
553    /// Add a child mount *without propagating it to the peer group*. For internal use only.
554    fn add_submount_internal(&mut self, dir: &DirEntryHandle, mount: MountHandle) {
555        if !dir.is_descendant_of(&self.base.root) {
556            return;
557        }
558
559        let submount = mount.fs.kernel.upgrade().unwrap().mounts.register_mount(dir, mount.clone());
560        let old_mountpoint =
561            mount.state.write().mountpoint.replace((Arc::downgrade(self.base), Arc::clone(dir)));
562        assert!(old_mountpoint.is_none(), "add_submount can only take a newly created mount");
563        // Mount shadowing is implemented by mounting onto the root of the first mount, not by
564        // creating two mounts on the same mountpoint.
565        let old_mount = self.submounts.replace(submount);
566
567        // In rare cases, mount propagation might result in a request to mount on a directory where
568        // something is already mounted. MountTest.LotsOfShadowing will trigger this. Linux handles
569        // this by inserting the new mount between the old mount and the current mount.
570        if let Some(mut old_mount) = old_mount {
571            // Previous state: self[dir] = old_mount
572            // New state: self[dir] = new_mount, new_mount[new_mount.root] = old_mount
573            // The new mount has already been inserted into self, now just update the old mount to
574            // be a child of the new mount.
575            old_mount.mount.write().mountpoint = Some((Arc::downgrade(&mount), Arc::clone(dir)));
576            old_mount.dir = ArcKey(mount.root.clone());
577            mount.write().submounts.insert(old_mount);
578        }
579    }
580
581    fn remove_submount_internal(&mut self, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
582        if self.submounts.remove(mount_hash_key) { Ok(()) } else { error!(EINVAL) }
583    }
584
585    /// Set this mount's peer group.
586    fn set_peer_group(&mut self, group: Arc<PeerGroup>) {
587        self.take_from_peer_group();
588        group.add(self.base);
589        self.peer_group_ = Some((group, Arc::as_ptr(self.base).into()));
590    }
591
592    fn set_upstream(&mut self, group: Arc<PeerGroup>) {
593        self.take_from_upstream();
594        group.add_downstream(self.base);
595        self.upstream_ = Some((Arc::downgrade(&group), Arc::as_ptr(self.base).into()));
596    }
597
598    /// Is the mount in a peer group? Corresponds to MS_SHARED.
599    pub fn is_shared(&self) -> bool {
600        self.peer_group().is_some()
601    }
602
603    /// Put the mount in a peer group. Implements MS_SHARED.
604    pub fn make_shared(&mut self) {
605        if self.is_shared() {
606            return;
607        }
608        let kernel =
609            self.base.fs.kernel.upgrade().expect("can't create new peer group without kernel");
610        self.set_peer_group(PeerGroup::new(kernel.get_next_peer_group_id()));
611    }
612
613    /// Take the mount out of its peer group, also remove upstream if any. Implements MS_PRIVATE.
614    pub fn make_private(&mut self) {
615        self.take_from_peer_group();
616        self.take_from_upstream();
617    }
618
619    /// Take the mount out of its peer group and make it downstream instead. Implements
620    /// MountFlags::DOWNSTREAM (MS_SLAVE).
621    pub fn make_downstream(&mut self) {
622        if let Some(peer_group) = self.take_from_peer_group() {
623            self.set_upstream(peer_group);
624        }
625    }
626}
627
628impl PeerGroup {
629    fn new(id: u64) -> Arc<Self> {
630        Arc::new(Self { id, state: Default::default() })
631    }
632
633    fn add(&self, mount: &Arc<Mount>) {
634        self.state.write().mounts.insert(WeakKey::from(mount));
635    }
636
637    fn remove(&self, mount: PtrKey<Mount>) {
638        self.state.write().mounts.remove(&mount);
639    }
640
641    fn add_downstream(&self, mount: &Arc<Mount>) {
642        self.state.write().downstream.insert(WeakKey::from(mount));
643    }
644
645    fn remove_downstream(&self, mount: PtrKey<Mount>) {
646        self.state.write().downstream.remove(&mount);
647    }
648
649    fn copy_propagation_targets(&self) -> Vec<MountHandle> {
650        let mut buf = vec![];
651        self.collect_propagation_targets(&mut buf);
652        buf
653    }
654
655    fn collect_propagation_targets(&self, buf: &mut Vec<MountHandle>) {
656        let downstream_mounts: Vec<_> = {
657            let state = self.state.read();
658            buf.extend(state.mounts.iter().filter_map(|m| m.0.upgrade()));
659            state.downstream.iter().filter_map(|m| m.0.upgrade()).collect()
660        };
661        for mount in downstream_mounts {
662            let peer_group = mount.read().peer_group().map(Arc::clone);
663            match peer_group {
664                Some(group) => group.collect_propagation_targets(buf),
665                None => buf.push(mount),
666            }
667        }
668    }
669}
670
671impl Drop for Mount {
672    fn drop(&mut self) {
673        let state = self.state.get_mut();
674        state.take_from_peer_group();
675        state.take_from_upstream();
676    }
677}
678
679impl fmt::Debug for Mount {
680    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
681        let state = self.state.read();
682        f.debug_struct("Mount")
683            .field("id", &(self as *const Mount))
684            .field("root", &self.root)
685            .field("mountpoint", &state.mountpoint)
686            .field("submounts", &state.submounts)
687            .finish()
688    }
689}
690
691impl Kernel {
692    pub fn get_next_mount_id(&self) -> u64 {
693        self.next_mount_id.next()
694    }
695
696    pub fn get_next_peer_group_id(&self) -> u64 {
697        self.next_peer_group_id.next()
698    }
699
700    pub fn get_next_namespace_id(&self) -> u64 {
701        self.next_namespace_id.next()
702    }
703}
704
705impl CurrentTask {
706    pub fn create_filesystem(
707        &self,
708        locked: &mut Locked<Unlocked>,
709        fs_type: &FsStr,
710        options: FileSystemOptions,
711    ) -> Result<FileSystemHandle, Errno> {
712        // Please register new file systems via //src/starnix/modules/lib.rs, even if the file
713        // system is implemented inside starnix_core.
714        //
715        // Most file systems should be implemented as modules. The VFS provides various traits that
716        // let starnix_core integrate file systems without needing to depend on the file systems
717        // directly.
718        self.kernel()
719            .expando
720            .get::<FsRegistry>()
721            .create(locked, self, fs_type, options)
722            .ok_or_else(|| errno!(ENODEV, fs_type))?
723    }
724}
725
726// Writes to `sink` the mount flags and LSM mount options for the given `mount`.
727fn write_mount_info(task: &Task, sink: &mut DynamicFileBuf, mount: &Mount) -> Result<(), Errno> {
728    write!(sink, "{}", mount.flags())?;
729    security::sb_show_options(&task.kernel(), sink, &mount)
730}
731
732struct ProcMountsFileSource(WeakRef<Task>);
733
734impl DynamicFileSource for ProcMountsFileSource {
735    fn generate(
736        &self,
737        _current_task: &CurrentTask,
738        sink: &mut DynamicFileBuf,
739    ) -> Result<(), Errno> {
740        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
741        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
742        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
743        // extra work to maintain it.
744        let task = Task::from_weak(&self.0)?;
745        let root = task.fs().root();
746        let ns = task.fs().namespace();
747        for_each_mount(&ns.root_mount, &mut |mount| {
748            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
749            if !mountpoint.is_descendant_of(&root) {
750                return Ok(());
751            }
752            write!(
753                sink,
754                "{} {} {} ",
755                mount.fs.options.source_for_display(),
756                mountpoint.path(&task),
757                mount.fs.name(),
758            )?;
759            write_mount_info(&task, sink, mount)?;
760            writeln!(sink, " 0 0")?;
761            Ok(())
762        })?;
763        Ok(())
764    }
765}
766
767pub struct ProcMountsFile {
768    dynamic_file: DynamicFile<ProcMountsFileSource>,
769}
770
771impl ProcMountsFile {
772    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
773        SimpleFileNode::new(move |_, _| {
774            Ok(Self { dynamic_file: DynamicFile::new(ProcMountsFileSource(task.clone())) })
775        })
776    }
777}
778
impl FileOps for ProcMountsFile {
    // Reads and seeks are served from the generated snapshot.
    fileops_impl_delegate_read_and_seek!(self, self.dynamic_file);
    fileops_impl_noop_sync!();

    /// Writing to this file is not supported.
    fn write(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        _offset: usize,
        _data: &mut dyn InputBuffer,
    ) -> Result<usize, Errno> {
        error!(ENOSYS)
    }

    /// Registers a waiter for mount-table changes. Currently a stub that never
    /// fires (see comment below); present so pollers don't error out.
    fn wait_async(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        waiter: &Waiter,
        _events: FdEvents,
        _handler: EventHandler,
    ) -> Option<WaitCanceler> {
        // Polling this file gives notifications when any change to mounts occurs. This is not
        // implemented yet, but stubbed for Android init.
        Some(waiter.fake_wait())
    }

    /// No events are ever ready, consistent with the stubbed `wait_async`.
    fn query_events(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
    ) -> Result<FdEvents, Errno> {
        Ok(FdEvents::empty())
    }
}
817
/// `/proc/<pid>/mountinfo`: the extended mount listing (ids, peer groups, etc.)
/// for the task's mount namespace.
#[derive(Clone)]
pub struct ProcMountinfoFile(WeakRef<Task>);
impl ProcMountinfoFile {
    /// Creates the `FsNodeOps` for a `/proc/<pid>/mountinfo` node backed by `task`.
    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
        DynamicFile::new_node(Self(task))
    }
}
impl DynamicFileSource for ProcMountinfoFile {
    /// Emits one mountinfo-format line per mount reachable from the task's
    /// root: `id parent-id dev root mountpoint flags [shared:N] [master:N]
    /// - fstype source super-flags`.
    fn generate(
        &self,
        _current_task: &CurrentTask,
        sink: &mut DynamicFileBuf,
    ) -> Result<(), Errno> {
        // Returns path to the `dir` from the root of the file system.
        fn path_from_fs_root(dir: &DirEntryHandle) -> FsString {
            let mut path = PathBuilder::new();
            if dir.is_dead() {
                // Return `/foo/dir//deleted` if the dir was deleted.
                path.prepend_element("/deleted".into());
            }
            // RCU scope keeps parent links stable while we walk up the tree.
            let scope = RcuReadScope::new();
            let mut current = dir.deref();
            while let Some(parent) = current.parent_ref(&scope) {
                path.prepend_element(current.local_name(&scope));
                current = parent;
            }
            path.build_absolute()
        }

        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
        // extra work to maintain it.
        let task = Task::from_weak(&self.0)?;
        let root = task.fs().root();
        let ns = task.fs().namespace();
        for_each_mount(&ns.root_mount, &mut |mount| {
            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
            // Skip mounts outside the task's root (e.g. after chroot).
            if !mountpoint.is_descendant_of(&root) {
                return Ok(());
            }
            // Can't fail, mountpoint() and root() can't return a NamespaceNode with no mount
            let parent = mountpoint.mount.as_ref().unwrap();
            write!(
                sink,
                "{} {} {} {} {} ",
                mount.id,
                parent.id,
                mount.root.node.fs().dev_id,
                path_from_fs_root(&mount.root),
                mountpoint.path(&task),
            )?;
            write_mount_info(&task, sink, mount)?;
            // Optional propagation fields: shared peer group and/or slave master.
            if let Some(peer_group) = mount.read().peer_group() {
                write!(sink, " shared:{}", peer_group.id)?;
            }
            if let Some(upstream) = mount.read().upstream() {
                write!(sink, " master:{}", upstream.id)?;
            }
            writeln!(
                sink,
                " - {} {} {}",
                mount.fs.name(),
                mount.fs.options.source_for_display(),
                mount.fs.options.flags,
            )?;
            Ok(())
        })?;
        Ok(())
    }
}
889
890fn for_each_mount<E>(
891    mount: &MountHandle,
892    callback: &mut impl FnMut(&MountHandle) -> Result<(), E>,
893) -> Result<(), E> {
894    callback(mount)?;
895    // Collect list first to avoid self deadlock when ProcMountinfoFile::read_at tries to call
896    // NamespaceNode::path()
897    let submounts: Vec<_> = mount.read().submounts.iter().map(|s| s.mount.clone()).collect();
898    for submount in submounts {
899        for_each_mount(&submount, callback)?;
900    }
901    Ok(())
902}
903
/// The `SymlinkMode` enum encodes how symlinks are followed during path traversal.
#[derive(Default, PartialEq, Eq, Copy, Clone, Debug)]
pub enum SymlinkMode {
    /// Follow a symlink at the end of a path resolution (the default).
    #[default]
    Follow,

    /// Do not follow a symlink at the end of a path resolution.
    NoFollow,
}
914
/// The maximum number of symlink traversals that can be made during path resolution.
/// Matches Linux's MAXSYMLINKS limit of 40.
pub const MAX_SYMLINK_FOLLOWS: u8 = 40;
917
/// The context passed during namespace lookups.
///
/// Namespace lookups need to mutate a shared context in order to correctly
/// count the number of remaining symlink traversals.
pub struct LookupContext {
    /// The SymlinkMode for the lookup.
    ///
    /// As the lookup proceeds, the follow count is decremented each time the
    /// lookup traverses a symlink.
    pub symlink_mode: SymlinkMode,

    /// The number of symlinks remaining to follow.
    ///
    /// Each time path resolution calls readlink, this value is decremented.
    pub remaining_follows: u8,

    /// Whether the result of the lookup must be a directory.
    ///
    /// For example, if the path ends with a `/` or if userspace passes
    /// O_DIRECTORY. This flag can be set to true if the lookup encounters a
    /// symlink that ends with a `/`.
    pub must_be_directory: bool,

    /// Resolve flags passed to `openat2`. Empty if the lookup originated in any other syscall.
    pub resolve_flags: ResolveFlags,

    /// Base directory for the lookup. Set only when either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT`
    /// is passed to `openat2`.
    pub resolve_base: ResolveBase,
}
948
/// Used to specify base directory in `LookupContext` for lookups originating in the `openat2`
/// syscall with either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT` flag.
#[derive(Clone, Eq, PartialEq)]
pub enum ResolveBase {
    /// No restriction: the task's root applies as usual.
    None,

    /// The lookup is not allowed to traverse any node that's not beneath the specified node.
    Beneath(NamespaceNode),

    /// The lookup should be handled as if the specified node were the file-system root.
    InRoot(NamespaceNode),
}
961
962impl LookupContext {
963    pub fn new(symlink_mode: SymlinkMode) -> LookupContext {
964        LookupContext {
965            symlink_mode,
966            remaining_follows: MAX_SYMLINK_FOLLOWS,
967            must_be_directory: false,
968            resolve_flags: ResolveFlags::empty(),
969            resolve_base: ResolveBase::None,
970        }
971    }
972
973    pub fn with(&self, symlink_mode: SymlinkMode) -> LookupContext {
974        LookupContext { symlink_mode, resolve_base: self.resolve_base.clone(), ..*self }
975    }
976
977    pub fn update_for_path(&mut self, path: &FsStr) {
978        if path.last() == Some(&b'/') {
979            // The last path element must resolve to a directory. This is because a trailing slash
980            // was found in the path.
981            self.must_be_directory = true;
982            // If the last path element is a symlink, we should follow it.
983            // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
984            self.symlink_mode = SymlinkMode::Follow;
985        }
986    }
987}
988
impl Default for LookupContext {
    /// Defaults to following trailing symlinks, as most syscalls do.
    fn default() -> Self {
        LookupContext::new(SymlinkMode::Follow)
    }
}
994
/// Whether the path is reachable from the given root.
pub enum PathWithReachability {
    /// The path is reachable from the given root.
    Reachable(FsString),

    /// The path is not reachable from the given root (e.g. the node lies
    /// outside a chroot); the stored path is relative to the namespace root.
    Unreachable(FsString),
}
1003
1004impl PathWithReachability {
1005    pub fn into_path(self) -> FsString {
1006        match self {
1007            PathWithReachability::Reachable(path) => path,
1008            PathWithReachability::Unreachable(path) => path,
1009        }
1010    }
1011}
1012
/// A node in a mount namespace.
///
/// This tree is a composite of the mount tree and the FsNode tree.
///
/// These nodes are used when traversing paths in a namespace in order to
/// present the client the directory structure that includes the mounted
/// filesystems.
#[derive(Clone)]
pub struct NamespaceNode {
    /// The mount where this namespace node is mounted.
    ///
    /// A given FsNode can be mounted in multiple places in a namespace. This
    /// field distinguishes between them. `None` for anonymous nodes that are
    /// not part of any namespace.
    pub mount: MountInfo,

    /// The DirEntry that corresponds to this namespace entry.
    pub entry: DirEntryHandle,
}
1031
impl NamespaceNode {
    /// Creates a node for `entry` as seen through `mount`.
    pub fn new(mount: MountHandle, entry: DirEntryHandle) -> Self {
        Self { mount: Some(mount).into(), entry }
    }

    /// Create a namespace node that is not mounted in a namespace.
    pub fn new_anonymous(entry: DirEntryHandle) -> Self {
        Self { mount: None.into(), entry }
    }

    /// Create a namespace node that is not mounted in a namespace and that refers to a node that
    /// is not rooted in a hierarchy and has no name.
    pub fn new_anonymous_unrooted(current_task: &CurrentTask, node: FsNodeHandle) -> Self {
        let dir_entry = DirEntry::new_unrooted(node);
        // Security labeling is best-effort here; failures are deliberately ignored.
        let _ = security::fs_node_init_with_dentry_no_xattr(current_task, &dir_entry);
        Self::new_anonymous(dir_entry)
    }

    /// Create a FileObject corresponding to this namespace node.
    ///
    /// This function is the primary way of instantiating FileObjects. Each
    /// FileObject records the NamespaceNode that created it in order to
    /// remember its path in the Namespace.
    pub fn open(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        flags: OpenFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        let ops = self.entry.node.open(locked, current_task, self, flags, access_check)?;
        FileObject::new(locked, current_task, ops, self.clone(), flags)
    }

    /// Create or open a node in the file system.
    ///
    /// Works for any type of node other than a symlink.
    ///
    /// Will return an existing node unless `flags` contains `OpenFlags::EXCL`.
    pub fn open_create_node<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        mode: FileMode,
        dev: DeviceType,
        flags: OpenFlags,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        // The task's umask is applied before the node is created.
        let mode = current_task.fs().apply_umask(mode);
        let create_fn =
            |locked: &mut Locked<L>, dir: &FsNodeHandle, mount: &MountInfo, name: &_| {
                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
            };
        // EXCL demands a fresh entry; otherwise an existing one may be returned.
        let entry = if flags.contains(OpenFlags::EXCL) {
            self.entry.create_entry(locked, current_task, &self.mount, name, create_fn)
        } else {
            self.entry.get_or_create_entry(locked, current_task, &self.mount, name, create_fn)
        }?;
        Ok(self.with_new_entry(entry))
    }

    /// Converts this node into its actively-tracked form.
    pub fn into_active(self) -> ActiveNamespaceNode {
        ActiveNamespaceNode::new(self)
    }

    /// Converts this node into a `FileMapping`, optionally taking a write guard.
    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
        self.into_active().into_mapping(mode)
    }
1104
    /// Create a node in the file system.
    ///
    /// Works for any type of node other than a symlink.
    ///
    /// Does not return an existing node.
    pub fn create_node<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        mode: FileMode,
        dev: DeviceType,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        // The task's umask is applied before the node is created.
        let mode = current_task.fs().apply_umask(mode);
        let entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
            },
        )?;
        Ok(self.with_new_entry(entry))
    }

    /// Create a symlink in the file system.
    ///
    /// To create another type of node, use `create_node`.
    pub fn create_symlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        target: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        // Note: no umask is applied to symlinks.
        let entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                dir.create_symlink(locked, current_task, mount, name, target, owner)
            },
        )?;
        Ok(self.with_new_entry(entry))
    }

    /// Creates an anonymous file.
    ///
    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
    ///
    /// Used by O_TMPFILE.
    pub fn create_tmpfile<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        mode: FileMode,
        flags: OpenFlags,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let mode = current_task.fs().apply_umask(mode);
        Ok(self.with_new_entry(self.entry.create_tmpfile(
            locked,
            current_task,
            &self.mount,
            mode,
            owner,
            flags,
        )?))
    }

    /// Creates a hard link named `name` in this directory pointing at `child`.
    pub fn link<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        child: &FsNodeHandle,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let dir_entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| dir.link(locked, current_task, mount, name, child),
        )?;
        Ok(self.with_new_entry(dir_entry))
    }

    /// Creates a filesystem node named `name` and binds `socket` to it
    /// (bind(2) on a pathname AF_UNIX socket). Only Unix sockets are supported.
    pub fn bind_socket<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        socket: SocketHandle,
        socket_address: SocketAddress,
        mode: FileMode,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let dir_entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                let node = dir.create_node(
                    locked,
                    current_task,
                    mount,
                    name,
                    mode,
                    DeviceType::NONE,
                    current_task.current_fscred(),
                )?;
                // Only Unix-domain sockets can be bound into the filesystem.
                if let Some(unix_socket) = socket.downcast_socket::<UnixSocket>() {
                    unix_socket.bind_socket_to_node(&socket, socket_address, &node)?;
                } else {
                    return error!(ENOTSUP);
                }
                Ok(node)
            },
        )?;
        Ok(self.with_new_entry(dir_entry))
    }

    /// Removes the child named `name` from this directory.
    ///
    /// Reserved names (`.` and `..`) are rejected with the errno Linux uses
    /// for each case; in particular removing the root yields EBUSY.
    pub fn unlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        kind: UnlinkKind,
        must_be_directory: bool,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        if DirEntry::is_reserved_name(name) {
            match kind {
                UnlinkKind::Directory => {
                    if name == ".." {
                        error!(ENOTEMPTY)
                    } else if self.parent().is_none() {
                        // The client is attempting to remove the root.
                        error!(EBUSY)
                    } else {
                        error!(EINVAL)
                    }
                }
                UnlinkKind::NonDirectory => error!(ENOTDIR),
            }
        } else {
            self.entry.unlink(locked, current_task, &self.mount, name, kind, must_be_directory)
        }
    }
1275
1276    /// Traverse down a parent-to-child link in the namespace.
1277    pub fn lookup_child<L>(
1278        &self,
1279        locked: &mut Locked<L>,
1280        current_task: &CurrentTask,
1281        context: &mut LookupContext,
1282        basename: &FsStr,
1283    ) -> Result<NamespaceNode, Errno>
1284    where
1285        L: LockEqualOrBefore<FileOpsCore>,
1286    {
1287        if !self.entry.node.is_dir() {
1288            return error!(ENOTDIR);
1289        }
1290
1291        if basename.len() > NAME_MAX as usize {
1292            return error!(ENAMETOOLONG);
1293        }
1294
1295        let child = if basename.is_empty() || basename == "." {
1296            self.clone()
1297        } else if basename == ".." {
1298            let root = match &context.resolve_base {
1299                ResolveBase::None => current_task.fs().root(),
1300                ResolveBase::Beneath(node) => {
1301                    // Do not allow traversal out of the 'node'.
1302                    if *self == *node {
1303                        return error!(EXDEV);
1304                    }
1305                    current_task.fs().root()
1306                }
1307                ResolveBase::InRoot(root) => root.clone(),
1308            };
1309
1310            // Make sure this can't escape a chroot.
1311            if *self == root { root } else { self.parent().unwrap_or_else(|| self.clone()) }
1312        } else {
1313            let mut child = self.with_new_entry(self.entry.component_lookup(
1314                locked,
1315                current_task,
1316                &self.mount,
1317                basename,
1318            )?);
1319            while child.entry.node.is_lnk() {
1320                match context.symlink_mode {
1321                    SymlinkMode::NoFollow => {
1322                        break;
1323                    }
1324                    SymlinkMode::Follow => {
1325                        if context.remaining_follows == 0
1326                            || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
1327                        {
1328                            return error!(ELOOP);
1329                        }
1330                        context.remaining_follows -= 1;
1331                        child = match child.readlink(locked, current_task)? {
1332                            SymlinkTarget::Path(link_target) => {
1333                                let link_directory = if link_target[0] == b'/' {
1334                                    // If the path is absolute, we'll resolve the root directory.
1335                                    match &context.resolve_base {
1336                                        ResolveBase::None => current_task.fs().root(),
1337                                        ResolveBase::Beneath(_) => return error!(EXDEV),
1338                                        ResolveBase::InRoot(root) => root.clone(),
1339                                    }
1340                                } else {
1341                                    // If the path is not absolute, it's a relative directory. Let's
1342                                    // try to get the parent of the current child, or in the case
1343                                    // that the child is the root we can just use that directly.
1344                                    child.parent().unwrap_or(child)
1345                                };
1346                                current_task.lookup_path(
1347                                    locked,
1348                                    context,
1349                                    link_directory,
1350                                    link_target.as_ref(),
1351                                )?
1352                            }
1353                            SymlinkTarget::Node(node) => {
1354                                if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS) {
1355                                    return error!(ELOOP);
1356                                }
1357                                node
1358                            }
1359                        }
1360                    }
1361                };
1362            }
1363
1364            child.enter_mount()
1365        };
1366
1367        if context.resolve_flags.contains(ResolveFlags::NO_XDEV) && child.mount != self.mount {
1368            return error!(EXDEV);
1369        }
1370
1371        if context.must_be_directory && !child.entry.node.is_dir() {
1372            return error!(ENOTDIR);
1373        }
1374
1375        Ok(child)
1376    }
1377
    /// Traverse up a child-to-parent link in the namespace.
    ///
    /// This traversal matches the child-to-parent link in the underlying
    /// FsNode except at mountpoints, where the link switches from one
    /// filesystem to another. Returns `None` at the namespace root.
    pub fn parent(&self) -> Option<NamespaceNode> {
        // First hop from a mount root to its mountpoint, then take the
        // DirEntry parent in the outer filesystem.
        let mountpoint_or_self = self.escape_mount();
        let parent = mountpoint_or_self.entry.parent()?;
        Some(mountpoint_or_self.with_new_entry(parent))
    }
1388
1389    /// Returns the parent, but does not escape mounts i.e. returns None if this node
1390    /// is the root of a mount.
1391    pub fn parent_within_mount(&self) -> Option<DirEntryHandle> {
1392        if let Ok(_) = self.mount_if_root() {
1393            return None;
1394        }
1395        self.entry.parent()
1396    }
1397
1398    /// Whether this namespace node is a descendant of the given node.
1399    ///
1400    /// Walks up the namespace node tree looking for ancestor. If ancestor is
1401    /// found, returns true. Otherwise, returns false.
1402    pub fn is_descendant_of(&self, ancestor: &NamespaceNode) -> bool {
1403        let ancestor = ancestor.escape_mount();
1404        let mut current = self.escape_mount();
1405        while current != ancestor {
1406            if let Some(parent) = current.parent() {
1407                current = parent.escape_mount();
1408            } else {
1409                return false;
1410            }
1411        }
1412        true
1413    }
1414
    /// If this is a mount point, return the root of the mount. Otherwise return self.
    fn enter_mount(&self) -> NamespaceNode {
        // While the child is a mountpoint, replace child with the mount's root.
        fn enter_one_mount(node: &NamespaceNode) -> Option<NamespaceNode> {
            if let Some(mount) = node.mount.deref() {
                if let Some(submount) =
                    mount.state.read().submounts.get(ArcKey::ref_cast(&node.entry))
                {
                    return Some(submount.mount.root());
                }
            }
            None
        }
        // Loop because mounts can be stacked: the root of one mount may itself
        // be the mountpoint of another.
        let mut inner = self.clone();
        while let Some(inner_root) = enter_one_mount(&inner) {
            inner = inner_root;
        }
        inner
    }
1434
1435    /// If this is the root of a mount, return the mount point. Otherwise return self.
1436    ///
1437    /// This is not exactly the same as parent(). If parent() is called on a root, it will escape
1438    /// the mount, but then return the parent of the mount point instead of the mount point.
1439    fn escape_mount(&self) -> NamespaceNode {
1440        let mut mountpoint_or_self = self.clone();
1441        while let Some(mountpoint) = mountpoint_or_self.mountpoint() {
1442            mountpoint_or_self = mountpoint;
1443        }
1444        mountpoint_or_self
1445    }
1446
1447    /// If this node is the root of a mount, return it. Otherwise EINVAL.
1448    pub fn mount_if_root(&self) -> Result<&MountHandle, Errno> {
1449        if let Some(mount) = self.mount.deref() {
1450            if Arc::ptr_eq(&self.entry, &mount.root) {
1451                return Ok(mount);
1452            }
1453        }
1454        error!(EINVAL)
1455    }
1456
    /// Returns the mountpoint at this location in the namespace.
    ///
    /// If this node is mounted in another node, this function returns the node
    /// at which this node is mounted. Otherwise, returns None.
    fn mountpoint(&self) -> Option<NamespaceNode> {
        self.mount_if_root().ok()?.read().mountpoint()
    }

    /// The path from the task's root to this node.
    pub fn path(&self, task: &Task) -> FsString {
        self.path_from_root(Some(&task.fs().root())).into_path()
    }

    /// The path from the root of the namespace to this node.
    pub fn path_escaping_chroot(&self) -> FsString {
        self.path_from_root(None).into_path()
    }

    /// Returns the path to this node, accounting for a custom root.
    /// A task may have a custom root set by `chroot`.
    ///
    /// Dead (unlinked) entries get " (deleted)" appended; anonymous nodes
    /// report the node's internal name.
    pub fn path_from_root(&self, root: Option<&NamespaceNode>) -> PathWithReachability {
        // Anonymous nodes have no place in the namespace tree.
        if self.mount.is_none() {
            return PathWithReachability::Reachable(self.entry.node.internal_name());
        }

        let mut path = PathBuilder::new();
        let mut current = self.escape_mount();
        if let Some(root) = root {
            // RCU scope keeps DirEntry names stable while we walk upward.
            let scope = RcuReadScope::new();
            // The current node is expected to intersect with the custom root as we travel up the tree.
            let root = root.escape_mount();
            while current != root {
                if let Some(parent) = current.parent() {
                    path.prepend_element(current.entry.local_name(&scope));
                    current = parent.escape_mount();
                } else {
                    // This node hasn't intersected with the custom root and has reached the namespace root.
                    let mut absolute_path = path.build_absolute();
                    if self.entry.is_dead() {
                        absolute_path.extend_from_slice(b" (deleted)");
                    }

                    return PathWithReachability::Unreachable(absolute_path);
                }
            }
        } else {
            // No custom root, so travel up the tree to the namespace root.
            let scope = RcuReadScope::new();
            while let Some(parent) = current.parent() {
                path.prepend_element(current.entry.local_name(&scope));
                current = parent.escape_mount();
            }
        }

        let mut absolute_path = path.build_absolute();
        if self.entry.is_dead() {
            absolute_path.extend_from_slice(b" (deleted)");
        }

        PathWithReachability::Reachable(absolute_path)
    }

    /// Mounts `what` at this node, creating a submount of the enclosing mount.
    pub fn mount(&self, what: WhatToMount, flags: MountFlags) -> Result<(), Errno> {
        // Only the flags that are persisted on a mount (plus REC) are kept.
        let flags = flags & (MountFlags::STORED_ON_MOUNT | MountFlags::REC);
        let mountpoint = self.enter_mount();
        let mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
        mount.create_submount(&mountpoint.entry, what, flags);
        Ok(())
    }

    /// If this is the root of a filesystem, unmount. Otherwise return EINVAL.
    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
        let mount = self.enter_mount().mount_if_root()?.clone();
        mount.unmount(flags)
    }
1532
1533    pub fn rename<L>(
1534        locked: &mut Locked<L>,
1535        current_task: &CurrentTask,
1536        old_parent: &NamespaceNode,
1537        old_name: &FsStr,
1538        new_parent: &NamespaceNode,
1539        new_name: &FsStr,
1540        flags: RenameFlags,
1541    ) -> Result<(), Errno>
1542    where
1543        L: LockEqualOrBefore<FileOpsCore>,
1544    {
1545        DirEntry::rename(
1546            locked,
1547            current_task,
1548            &old_parent.entry,
1549            &old_parent.mount,
1550            old_name,
1551            &new_parent.entry,
1552            &new_parent.mount,
1553            new_name,
1554            flags,
1555        )
1556    }
1557
1558    fn with_new_entry(&self, entry: DirEntryHandle) -> NamespaceNode {
1559        Self { mount: self.mount.clone(), entry }
1560    }
1561
1562    fn mount_hash_key(&self) -> &ArcKey<DirEntry> {
1563        ArcKey::ref_cast(&self.entry)
1564    }
1565
1566    pub fn suid_and_sgid(&self, current_task: &CurrentTask) -> Result<UserAndOrGroupId, Errno> {
1567        if self.mount.flags().contains(MountFlags::NOSUID) {
1568            Ok(UserAndOrGroupId::default())
1569        } else {
1570            self.entry.node.info().suid_and_sgid(current_task, &self.entry.node)
1571        }
1572    }
1573
1574    pub fn update_atime(&self) {
1575        // Do not update the atime of this node if it is mounted with the NOATIME flag.
1576        if !self.mount.flags().contains(MountFlags::NOATIME) {
1577            self.entry.node.update_info(|info| {
1578                let now = utc::utc_now();
1579                info.time_access = now;
1580                info.pending_time_access_update = true;
1581            });
1582        }
1583    }
1584
1585    pub fn readlink<L>(
1586        &self,
1587        locked: &mut Locked<L>,
1588        current_task: &CurrentTask,
1589    ) -> Result<SymlinkTarget, Errno>
1590    where
1591        L: LockEqualOrBefore<FileOpsCore>,
1592    {
1593        self.update_atime();
1594        self.entry.node.readlink(locked, current_task)
1595    }
1596
1597    pub fn notify(&self, event_mask: InotifyMask) {
1598        if self.mount.is_some() {
1599            self.entry.notify(event_mask);
1600        }
1601    }
1602
    /// Check whether the node can be accessed in the current context with the specified access
    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
    /// owner or is in the file's group.
    ///
    /// Delegates to `FsNode::check_access`, passing the mount (so mount-level
    /// state can factor into the decision) and `self` (so the check can
    /// identify the node's position in the namespace).
    pub fn check_access<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        permission_flags: impl Into<security::PermissionFlags>,
        reason: CheckAccessReason,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.entry.node.check_access(
            locked,
            current_task,
            &self.mount,
            permission_flags,
            reason,
            self,
        )
    }
1625
    /// Checks whether the current task is allowed to open this node with
    /// `O_NOATIME`.
    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
        self.entry.node.check_o_noatime_allowed(current_task)
    }
1630
    /// Truncates the underlying file to `length` bytes, then emits an inotify
    /// `MODIFY` event for the entry (via `notify_ignoring_excl_unlink`).
    pub fn truncate<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        length: u64,
    ) -> Result<(), Errno>
    where
        L: LockBefore<BeforeFsNodeAppend>,
    {
        self.entry.node.truncate(locked, current_task, &self.mount, length)?;
        self.entry.notify_ignoring_excl_unlink(InotifyMask::MODIFY);
        Ok(())
    }
1644}
1645
1646impl fmt::Debug for NamespaceNode {
1647    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1648        f.debug_struct("NamespaceNode")
1649            .field("path", &self.path_escaping_chroot())
1650            .field("mount", &self.mount)
1651            .field("entry", &self.entry)
1652            .finish()
1653    }
1654}
1655
// Eq/Hash impls intended for the MOUNT_POINTS hash.
//
// Identity is the pair (mount Arc pointer, entry Arc pointer): two
// NamespaceNodes are equal only when they reference the same Mount and the
// same DirEntry by pointer. Hash folds in the same pair, keeping the Eq/Hash
// contract consistent (equal values hash identically).
impl PartialEq for NamespaceNode {
    fn eq(&self, other: &Self) -> bool {
        self.mount.as_ref().map(Arc::as_ptr).eq(&other.mount.as_ref().map(Arc::as_ptr))
            && Arc::ptr_eq(&self.entry, &other.entry)
    }
}
impl Eq for NamespaceNode {}
impl Hash for NamespaceNode {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.mount.as_ref().map(Arc::as_ptr).hash(state);
        Arc::as_ptr(&self.entry).hash(state);
    }
}
1670
/// A namespace node that keeps the underlying mount busy.
#[derive(Debug, Clone)]
pub struct ActiveNamespaceNode {
    /// The underlying namespace node.
    name: NamespaceNode,

    /// Adds a reference to the mount client marker to prevent the mount from
    /// being removed while the NamespaceNode is active. Is None iff mount is
    /// None.
    _marker: Option<MountClientMarker>,
}
1682
1683impl ActiveNamespaceNode {
1684    pub fn new(name: NamespaceNode) -> Self {
1685        let marker = name.mount.as_ref().map(|mount| mount.active_client_counter.clone());
1686        Self { name, _marker: marker }
1687    }
1688
1689    pub fn to_passive(&self) -> NamespaceNode {
1690        self.deref().clone()
1691    }
1692
1693    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
1694        if let Some(mode) = mode {
1695            self.entry.node.write_guard_state.lock().acquire(mode)?;
1696        }
1697        Ok(Arc::new(FileMapping { name: self, mode }))
1698    }
1699}
1700
/// `ActiveNamespaceNode` transparently dereferences to the wrapped
/// `NamespaceNode`, making all of its methods available directly.
impl Deref for ActiveNamespaceNode {
    type Target = NamespaceNode;

    fn deref(&self) -> &Self::Target {
        &self.name
    }
}
1708
1709impl PartialEq for ActiveNamespaceNode {
1710    fn eq(&self, other: &Self) -> bool {
1711        self.deref().eq(other.deref())
1712    }
1713}
1714impl Eq for ActiveNamespaceNode {}
1715impl Hash for ActiveNamespaceNode {
1716    fn hash<H: Hasher>(&self, state: &mut H) {
1717        self.deref().hash(state)
1718    }
1719}
1720
/// A file-backed mapping that keeps its `NamespaceNode` active (holding the
/// mount busy) and optionally holds a file write guard for its lifetime.
///
/// NOTE(review): this derives `Clone` while `Drop` releases the write guard;
/// cloning a mapping with `mode = Some(..)` would release more than was
/// acquired. Mappings are produced as `Arc<FileMapping>` — confirm `Clone` is
/// never invoked on the inner value directly.
#[derive(Debug, Clone, PartialEq, Eq)]
#[must_use]
pub struct FileMapping {
    /// The mapped node, kept active for the lifetime of the mapping.
    pub name: ActiveNamespaceNode,
    /// Write-guard mode acquired in `ActiveNamespaceNode::into_mapping`;
    /// `None` when no guard was requested.
    mode: Option<FileWriteGuardMode>,
}
1727
impl Drop for FileMapping {
    /// Releases the write guard, if one was acquired when the mapping was
    /// created via `into_mapping`.
    fn drop(&mut self) {
        if let Some(mode) = self.mode {
            self.name.entry.node.write_guard_state.lock().release(mode);
        }
    }
}
1735
/// Tracks all mounts, keyed by mount point.
///
/// The map is keyed by a weak reference to the mountpoint `DirEntry`; the
/// value holds every mount stacked on that directory.
pub struct Mounts {
    mounts: RcuHashMap<WeakKey<DirEntry>, Vec<ArcKey<Mount>>>,
}
1740
impl Mounts {
    /// Creates an empty mounts registry.
    pub fn new() -> Self {
        Mounts { mounts: RcuHashMap::default() }
    }

    /// Registers the mount in the global mounts map.
    ///
    /// Returns a `Submount` RAII object that unregisters the mount on drop.
    /// The first mount registered against `dir_entry` turns on the entry's
    /// `has_mounts` flag.
    fn register_mount(&self, dir_entry: &Arc<DirEntry>, mount: MountHandle) -> Submount {
        let mut mounts = self.mounts.lock();
        let key = WeakKey::from(dir_entry);
        // NOTE(review): `get` appears to hand back a copy of the stored Vec
        // (RCU-style copy-on-write); the updated Vec is written back with
        // `insert` below — confirm against RcuHashMap's contract.
        let mut vec = mounts.get(&key).unwrap_or_else(|| {
            dir_entry.set_has_mounts(true);
            Vec::new()
        });
        vec.push(ArcKey(mount.clone()));
        mounts.insert(key, vec);
        Submount { dir: ArcKey(dir_entry.clone()), mount }
    }

    /// Unregisters the mount.  This is called by `Submount::drop`.
    fn unregister_mount(&self, dir_entry: &Arc<DirEntry>, mount: &MountHandle) {
        let mut mounts = self.mounts.lock();
        let key = WeakKey::from(dir_entry);
        if let Some(mut vec) = mounts.get(&key) {
            // The mount must be present: registration/unregistration are
            // paired through the Submount RAII object, so a miss is a bug.
            let index = vec.iter().position(|e| e == ArcKey::ref_cast(mount)).unwrap();
            if vec.len() == 1 {
                // Last mount on this entry: drop the map entry and clear the
                // entry's has-mounts flag.
                mounts.remove(&key);
                dir_entry.set_has_mounts(false);
            } else {
                // swap_remove does not preserve order, which is acceptable
                // here since the Vec is only scanned, never iterated in order.
                vec.swap_remove(index);
                mounts.insert(key, vec);
            }
        }
    }

    /// Unmounts all mounts associated with `dir_entry`.  This is called when `dir_entry` is
    /// unlinked (which would normally result in EBUSY, but not if it isn't mounted in the local
    /// namespace).
    pub fn unmount(&self, dir_entry: &DirEntry) {
        let mounts = self.mounts.lock().remove(&PtrKey::from(dir_entry as *const _));
        if let Some(mounts) = mounts {
            for mount in mounts {
                // Ignore errors.
                let _ = mount.unmount(UnmountFlags::DETACH);
            }
        }
    }

    /// Drain mounts. For each drained mount, force a FileSystem unmount.
    // TODO(https://fxbug.dev/295073633): Graceful shutdown should try to first unmount the mounts
    // and only force a FileSystem unmount on failure.
    pub fn clear(&self) {
        for (_dir_entry, mounts) in self.mounts.lock().drain() {
            for mount in mounts {
                mount.fs.force_unmount_ops();
            }
        }
    }
}
1799
/// A RAII object that unregisters a mount when dropped.
#[derive(Debug)]
struct Submount {
    /// The mountpoint directory entry the mount is attached to.
    dir: ArcKey<DirEntry>,
    /// The mount itself.
    mount: MountHandle,
}
1806
impl Drop for Submount {
    /// Removes this mount from the kernel-wide mounts registry.
    fn drop(&mut self) {
        // NOTE(review): `upgrade().unwrap()` assumes the Kernel outlives every
        // Submount; if a Submount were dropped during kernel teardown this
        // would panic — confirm teardown ordering guarantees this.
        self.mount.fs.kernel.upgrade().unwrap().mounts.unregister_mount(&self.dir, &self.mount)
    }
}
1812
/// Submount is stored in a mount's submounts hash set, which is keyed by the mountpoint.
///
/// Equality and hashing deliberately consider only `dir` (the mountpoint), not
/// the mount itself, so the set holds at most one Submount per mountpoint key.
impl Eq for Submount {}
impl PartialEq<Self> for Submount {
    fn eq(&self, other: &Self) -> bool {
        self.dir == other.dir
    }
}
impl Hash for Submount {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.dir.hash(state)
    }
}
1825
/// Lets a hash set of `Submount`s be queried by the mountpoint key alone.
impl Borrow<ArcKey<DirEntry>> for Submount {
    fn borrow(&self) -> &ArcKey<DirEntry> {
        &self.dir
    }
}
1831
1832#[cfg(test)]
1833mod test {
1834    use crate::fs::tmpfs::TmpFs;
1835    use crate::testing::spawn_kernel_and_run;
1836    use crate::vfs::namespace::DeviceType;
1837    use crate::vfs::{
1838        CallbackSymlinkNode, FsNodeInfo, LookupContext, MountInfo, Namespace, NamespaceNode,
1839        RenameFlags, SymlinkMode, SymlinkTarget, UnlinkKind, WhatToMount,
1840    };
1841    use starnix_uapi::mount_flags::MountFlags;
1842    use starnix_uapi::{errno, mode};
1843    use std::sync::Arc;
1844
    /// Verifies basic traversal across a mount boundary: the parent of a node
    /// inside a mounted filesystem resolves to the mountpoint's entry, and the
    /// parent of the mountpoint resolves to the namespace root.
    #[::fuchsia::test]
    async fn test_namespace() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountFlags::empty())
                .expect("failed to mount dev root node");

            // Re-lookup so `dev` points into the freshly mounted filesystem.
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            let pts_parent =
                pts.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of pts");
            assert!(Arc::ptr_eq(&pts_parent.entry, &dev.entry));

            let dev_parent =
                dev.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of dev");
            assert!(Arc::ptr_eq(&dev_parent.entry, &ns.root().entry));
        })
        .await;
    }
1888
    /// Verifies that a `NamespaceNode` obtained before a mount keeps pointing
    /// at the pre-mount entry: only a fresh lookup sees the mounted
    /// filesystem, and lookups through the stale node fail.
    #[::fuchsia::test]
    async fn test_mount_does_not_upgrade() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountFlags::empty())
                .expect("failed to mount dev root node");
            let mut context = LookupContext::default();
            let new_dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev again");
            // The stale node and the fresh node must not be the same entry.
            assert!(!Arc::ptr_eq(&dev.entry, &new_dev.entry));
            assert_ne!(&dev, &new_dev);

            let mut context = LookupContext::default();
            let _new_pts = new_dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            // "pts" only exists inside the mounted fs, so the stale node
            // (still on the original tmpfs) cannot see it.
            let mut context = LookupContext::default();
            assert!(dev.lookup_child(locked, &current_task, &mut context, "pts".into()).is_err());
        })
        .await;
    }
1929
    /// Verifies `path_escaping_chroot` renders full paths that cross mount
    /// boundaries ("/", "/dev", "/dev/pts").
    #[::fuchsia::test]
    async fn test_path() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountFlags::empty())
                .expect("failed to mount dev root node");

            // Re-lookup so the nodes are inside the mounted filesystem.
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");

            assert_eq!("/", ns.root().path_escaping_chroot());
            assert_eq!("/dev", dev.path_escaping_chroot());
            assert_eq!("/dev/pts", pts.path_escaping_chroot());
        })
        .await;
    }
1970
    /// Verifies mount shadowing: stacking a second mount on the same
    /// mountpoint shadows the first in the current namespace, while a
    /// namespace cloned before the second mount still sees the first.
    #[::fuchsia::test]
    async fn test_shadowing() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs1 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs1.clone()), MountFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // Clone before the second mount; the clone must keep seeing foofs1.
            let ns_clone = ns.clone_namespace();

            let foofs2 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs2.clone()), MountFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs2.root()
            ));

            assert!(Arc::ptr_eq(
                &ns_clone
                    .root()
                    .lookup_child(
                        locked,
                        &current_task,
                        &mut LookupContext::default(),
                        "foo".into()
                    )
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
        })
        .await;
    }
2024
    /// Verifies that a directory mounted in one namespace (EBUSY there) can
    /// still be unlinked through another namespace sharing the same fs, after
    /// which the entry disappears from both.
    #[::fuchsia::test]
    async fn test_unlink_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountFlags::empty()).unwrap();

            // Trying to unlink from ns1 should fail.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(EBUSY),
            );

            // But unlinking from ns2 should succeed.
            ns2.root()
                .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                .expect("unlink failed");

            // And it should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(ENOENT),
            );
        })
        .await;
    }
2063
    /// Verifies rename semantics around mountpoints: renaming a locally
    /// mounted directory (either direction) fails with EBUSY, while renaming
    /// through a different namespace succeeds, including renaming *over* a
    /// directory carrying a mount.
    #[::fuchsia::test]
    async fn test_rename_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let _bar_node = root_fs.root().create_dir(locked, &current_task, "bar".into()).unwrap();
            let _baz_node = root_fs.root().create_dir(locked, &current_task, "baz".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountFlags::empty()).unwrap();

            // Trying to rename over foo from ns1 should fail.
            let root = ns1.root();
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "bar".into(),
                    &root,
                    "foo".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );
            // Likewise the other way.
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "foo".into(),
                    &root,
                    "bar".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );

            // But renaming from ns2 should succeed.
            let root = ns2.root();

            // First rename the directory with the mount.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "foo".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // Renaming over a directory with a mount should also work.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "baz".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // "foo" and "baz" should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "baz".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
        })
        .await;
    }
2154
    /// Symlinks which need to be traversed across types (nodes and paths), as well as across
    /// owning directories, can be tricky to get right.
    ///
    /// Builds a chain node_symlink (SymlinkTarget::Node) -> path_symlink
    /// (SymlinkTarget::Path) -> real_file across two separate mounts and
    /// verifies a following lookup resolves to the real file's inode.
    #[::fuchsia::test]
    async fn test_lookup_with_symlink_chain() {
        spawn_kernel_and_run(async |locked, current_task| {
            // Set up the root filesystem
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _first_subdir_node = root_node
                .create_dir(locked, &current_task, "first_subdir".into())
                .expect("failed to mkdir dev");
            let _second_subdir_node = root_node
                .create_dir(locked, &current_task, "second_subdir".into())
                .expect("failed to mkdir dev");

            // Set up two subdirectories under the root filesystem
            let first_subdir_fs = TmpFs::new_fs(locked, &kernel);
            let second_subdir_fs = TmpFs::new_fs(locked, &kernel);

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let first_subdir = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "first_subdir".into())
                .expect("failed to lookup first_subdir");
            first_subdir
                .mount(WhatToMount::Fs(first_subdir_fs), MountFlags::empty())
                .expect("failed to mount first_subdir fs node");
            let second_subdir = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "second_subdir".into())
                .expect("failed to lookup second_subdir");
            second_subdir
                .mount(WhatToMount::Fs(second_subdir_fs), MountFlags::empty())
                .expect("failed to mount second_subdir fs node");

            // Create the symlink structure. To trigger potential symlink traversal bugs, we're going
            // for the following directory structure:
            // / (root)
            //     + first_subdir/
            //         - real_file
            //         - path_symlink (-> real_file)
            //     + second_subdir/
            //         - node_symlink (-> path_symlink)
            let real_file_node = first_subdir
                .create_node(
                    locked,
                    &current_task,
                    "real_file".into(),
                    mode!(IFREG, 0o777),
                    DeviceType::NONE,
                )
                .expect("failed to create real_file");
            first_subdir
                .create_symlink(locked, &current_task, "path_symlink".into(), "real_file".into())
                .expect("failed to create path_symlink");

            let mut no_follow_lookup_context = LookupContext::new(SymlinkMode::NoFollow);
            let path_symlink_node = first_subdir
                .lookup_child(
                    locked,
                    &current_task,
                    &mut no_follow_lookup_context,
                    "path_symlink".into(),
                )
                .expect("Failed to lookup path_symlink");

            // The second symlink needs to be of type SymlinkTarget::Node in order to trip the sensitive
            // code path. There's no easy method for creating this type of symlink target, so we'll need
            // to construct a node from scratch and insert it into the directory manually.
            let node_symlink_node = second_subdir.entry.node.fs().create_node_and_allocate_node_id(
                CallbackSymlinkNode::new(move || {
                    let node = path_symlink_node.clone();
                    Ok(SymlinkTarget::Node(node))
                }),
                FsNodeInfo::new(mode!(IFLNK, 0o777), current_task.current_fscred()),
            );
            second_subdir
                .entry
                .create_entry(
                    locked,
                    &current_task,
                    &MountInfo::detached(),
                    "node_symlink".into(),
                    move |_locked, _dir, _mount, _name| Ok(node_symlink_node),
                )
                .expect("failed to create node_symlink entry");

            // Finally, exercise the lookup under test.
            let mut follow_lookup_context = LookupContext::new(SymlinkMode::Follow);
            let node_symlink_resolution = second_subdir
                .lookup_child(
                    locked,
                    &current_task,
                    &mut follow_lookup_context,
                    "node_symlink".into(),
                )
                .expect("lookup with symlink chain failed");

            // The lookup resolution should have correctly followed the symlinks to the real_file node.
            assert!(node_symlink_resolution.entry.node.ino == real_file_node.entry.node.ino);
        })
        .await;
    }
2260}