starnix_core/vfs/
namespace.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mutable_state::{state_accessor, state_implementation};
6use crate::security;
7use crate::task::{CurrentTask, EventHandler, Kernel, Task, WaitCanceler, Waiter};
8use crate::time::utc;
9use crate::vfs::buffers::InputBuffer;
10use crate::vfs::fs_registry::FsRegistry;
11use crate::vfs::pseudo::dynamic_file::{DynamicFile, DynamicFileBuf, DynamicFileSource};
12use crate::vfs::pseudo::simple_file::SimpleFileNode;
13use crate::vfs::socket::{SocketAddress, SocketHandle, UnixSocket};
14use crate::vfs::{
15    CheckAccessReason, DirEntry, DirEntryHandle, FileHandle, FileObject, FileOps, FileSystemHandle,
16    FileSystemOptions, FileWriteGuardMode, FsNode, FsNodeHandle, FsNodeOps, FsStr, FsString,
17    PathBuilder, RenameFlags, SymlinkTarget, UnlinkKind, fileops_impl_dataless,
18    fileops_impl_delegate_read_and_seek, fileops_impl_nonseekable, fileops_impl_noop_sync,
19    fs_node_impl_not_dir,
20};
21use macro_rules_attribute::apply;
22use ref_cast::RefCast;
23use starnix_logging::log_warn;
24use starnix_sync::{
25    BeforeFsNodeAppend, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, Mutex, RwLock, Unlocked,
26};
27use starnix_types::ownership::WeakRef;
28use starnix_uapi::arc_key::{ArcKey, PtrKey, WeakKey};
29use starnix_uapi::auth::UserAndOrGroupId;
30use starnix_uapi::device_type::DeviceType;
31use starnix_uapi::errors::Errno;
32use starnix_uapi::file_mode::{AccessCheck, FileMode};
33use starnix_uapi::inotify_mask::InotifyMask;
34use starnix_uapi::mount_flags::MountFlags;
35use starnix_uapi::open_flags::OpenFlags;
36use starnix_uapi::unmount_flags::UnmountFlags;
37use starnix_uapi::vfs::{FdEvents, ResolveFlags};
38use starnix_uapi::{NAME_MAX, errno, error};
39use std::borrow::Borrow;
40use std::collections::hash_map::Entry;
41use std::collections::{HashMap, HashSet};
42use std::fmt;
43use std::hash::{Hash, Hasher};
44use std::ops::{Deref, DerefMut};
45use std::sync::{Arc, Weak};
46
/// A mount namespace.
///
/// The namespace records at which entries filesystems are mounted. It holds the mount at the top
/// of a tree of [`Mount`]s; the other mounts are reached through the `submounts` of their parents.
#[derive(Debug)]
pub struct Namespace {
    /// The mount at the top of this namespace's mount tree.
    root_mount: MountHandle,

    /// Unique ID of this namespace, obtained from `Kernel::get_next_namespace_id` when the
    /// namespace is created or cloned.
    pub id: u64,
}
57
58impl Namespace {
59    pub fn new(fs: FileSystemHandle) -> Arc<Namespace> {
60        Self::new_with_flags(fs, MountFlags::empty())
61    }
62
63    pub fn new_with_flags(fs: FileSystemHandle, flags: MountFlags) -> Arc<Namespace> {
64        let kernel = fs.kernel.upgrade().expect("can't create namespace without a kernel");
65        let root_mount = Mount::new(WhatToMount::Fs(fs), flags);
66        Arc::new(Self { root_mount, id: kernel.get_next_namespace_id() })
67    }
68
69    pub fn root(&self) -> NamespaceNode {
70        self.root_mount.root()
71    }
72
73    pub fn clone_namespace(&self) -> Arc<Namespace> {
74        let kernel =
75            self.root_mount.fs.kernel.upgrade().expect("can't clone namespace without a kernel");
76        Arc::new(Self {
77            root_mount: self.root_mount.clone_mount_recursive(),
78            id: kernel.get_next_namespace_id(),
79        })
80    }
81
82    /// Assuming new_ns is a clone of the namespace that node is from, return the equivalent of
83    /// node in new_ns. If this assumption is violated, returns None.
84    pub fn translate_node(mut node: NamespaceNode, new_ns: &Namespace) -> Option<NamespaceNode> {
85        // Collect the list of mountpoints that leads to this node's mount
86        let mut mountpoints = vec![];
87        let mut mount = node.mount;
88        while let Some(mountpoint) = mount.as_ref().and_then(|m| m.mountpoint()) {
89            mountpoints.push(mountpoint.entry);
90            mount = mountpoint.mount;
91        }
92
93        // Follow the same path in the new namespace
94        let mut mount = Arc::clone(&new_ns.root_mount);
95        for mountpoint in mountpoints.iter().rev() {
96            let next_mount =
97                mount.read().submounts.get(ArcKey::ref_cast(mountpoint))?.mount.clone();
98            mount = next_mount;
99        }
100        node.mount = Some(mount).into();
101        Some(node)
102    }
103}
104
105impl FsNodeOps for Arc<Namespace> {
106    fs_node_impl_not_dir!();
107
108    fn create_file_ops(
109        &self,
110        _locked: &mut Locked<FileOpsCore>,
111        _node: &FsNode,
112        _current_task: &CurrentTask,
113        _flags: OpenFlags,
114    ) -> Result<Box<dyn FileOps>, Errno> {
115        Ok(Box::new(MountNamespaceFile(self.clone())))
116    }
117}
118
/// The file object produced when a mount namespace node is opened.
///
/// Holds a strong reference to the [`Namespace`] it was opened from.
pub struct MountNamespaceFile(pub Arc<Namespace>);

// All file operations come from the `fileops_impl_*` helper macros; judging by their names the
// file is non-seekable, carries no data and has a no-op sync (NOTE(review): confirm against the
// macro definitions).
impl FileOps for MountNamespaceFile {
    fileops_impl_nonseekable!();
    fileops_impl_dataless!();
    fileops_impl_noop_sync!();
}
126
/// An empty struct that we use to track the number of active clients for a mount.
///
/// Each active client takes a reference to this object. The unmount operation fails
/// if there are any active clients of the mount (unless a detaching unmount is requested).
///
/// `Mount::active_clients` derives the client count from the strong count of this `Arc`, minus the
/// one reference held by the mount itself.
type MountClientMarker = Arc<()>;
132
/// An instance of a filesystem mounted in a namespace.
///
/// At a mount, path traversal switches from one filesystem to another.
/// The client sees a composed directory structure that glues together the
/// directories from the underlying FsNodes from those filesystems.
///
/// The mounts in a namespace form a mount tree, with `mountpoint` pointing to the parent and
/// `submounts` pointing to the children.
pub struct Mount {
    /// The directory entry at the top of this mount.
    root: DirEntryHandle,
    /// The flags of this mount. Mounts are created with a subset of
    /// `MountFlags::STORED_ON_MOUNT`, and `update_flags` masks new values to that set.
    flags: Mutex<MountFlags>,
    /// The filesystem that `root` belongs to.
    fs: FileSystemHandle,

    /// A unique identifier for this mount reported in /proc/pid/mountinfo.
    id: u64,

    /// A count of the number of active clients.
    active_client_counter: MountClientMarker,

    // Lock ordering: mount -> submount
    /// The mutable part of the mount: its place in the mount tree and in the propagation graph.
    state: RwLock<MountState>,
    // Mount used to contain a Weak<Namespace>. It no longer does because since the mount point
    // hash was moved from Namespace to Mount, nothing actually uses it. Now that
    // Namespace::clone_namespace() is implemented in terms of Mount::clone_mount_recursive, it
    // won't be trivial to add it back. I recommend turning the mountpoint field into an enum of
    // Mountpoint or Namespace, maybe called "parent", and then traverse up to the top of the tree
    // if you need to find a Mount's Namespace.
}
/// A shared, reference-counted handle to a [`Mount`].
type MountHandle = Arc<Mount>;
162
/// Public representation of the mount options.
///
/// Wraps an optional [`MountHandle`]; `None` represents an element that is not tied to any mount.
#[derive(Clone, Debug)]
pub struct MountInfo {
    /// The mount being described, or `None` when detached.
    handle: Option<MountHandle>,
}
168
169impl MountInfo {
170    /// `MountInfo` for a element that is not tied to a given mount. Mount flags will be considered
171    /// empty.
172    pub fn detached() -> Self {
173        None.into()
174    }
175
176    /// The mount flags of the represented mount.
177    pub fn flags(&self) -> MountFlags {
178        if let Some(handle) = &self.handle {
179            handle.flags()
180        } else {
181            // Consider not mounted node have the NOATIME flags.
182            MountFlags::NOATIME
183        }
184    }
185
186    /// Checks whether this `MountInfo` represents a writable file system mount.
187    pub fn check_readonly_filesystem(&self) -> Result<(), Errno> {
188        if self.flags().contains(MountFlags::RDONLY) {
189            return error!(EROFS);
190        }
191        Ok(())
192    }
193
194    /// Checks whether this `MountInfo` represents an executable file system mount.
195    pub fn check_noexec_filesystem(&self) -> Result<(), Errno> {
196        if self.flags().contains(MountFlags::NOEXEC) {
197            return error!(EACCES);
198        }
199        Ok(())
200    }
201}
202
/// Lets a `MountInfo` be used wherever an `Option<MountHandle>` is expected.
impl Deref for MountInfo {
    type Target = Option<MountHandle>;

    /// Borrows the wrapped optional mount handle.
    fn deref(&self) -> &Self::Target {
        &self.handle
    }
}

/// Allows the wrapped optional mount handle to be replaced or taken in place.
impl DerefMut for MountInfo {
    /// Mutably borrows the wrapped optional mount handle.
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.handle
    }
}
216
217impl std::cmp::PartialEq for MountInfo {
218    fn eq(&self, other: &Self) -> bool {
219        self.handle.as_ref().map(Arc::as_ptr) == other.handle.as_ref().map(Arc::as_ptr)
220    }
221}
222
223impl std::cmp::Eq for MountInfo {}
224
225impl Into<MountInfo> for Option<MountHandle> {
226    fn into(self) -> MountInfo {
227        MountInfo { handle: self }
228    }
229}
230
/// The mutable state of a [`Mount`]: its position in the mount tree (parent and children) and its
/// membership in the propagation graph (peer group and upstream).
#[derive(Default)]
pub struct MountState {
    /// The namespace node that this mount is mounted on. This is a tuple instead of a
    /// NamespaceNode because the Mount pointer has to be weak because this is the pointer to the
    /// parent mount, the parent has a pointer to the children too, and making both strong would be
    /// a cycle.
    mountpoint: Option<(Weak<Mount>, DirEntryHandle)>,

    // The set is keyed by the mountpoints which are always descendants of this mount's root.
    // Conceptually, the set is more akin to a map: `DirEntry -> MountHandle`, but we use a set
    // instead because `Submount` has a drop implementation that needs both the key and value.
    //
    // Each directory entry can only have one mount attached. Mount shadowing works by using the
    // root of the inner mount as a mountpoint. For example, if filesystem A is mounted at /foo,
    // mounting filesystem B on /foo will create the mount as a child of the A mount, attached to
    // A's root, instead of the root mount.
    /// The mounts attached directly below this one, keyed by the directory entry they cover.
    submounts: HashSet<Submount>,

    /// The membership of this mount in its peer group. Do not access directly. Instead use
    /// peer_group(), take_from_peer_group(), and set_peer_group().
    // TODO(tbodt): Refactor the links into some kind of extra struct or something? This is hard
    // because setting this field requires the Arc<Mount>.
    peer_group_: Option<(Arc<PeerGroup>, PtrKey<Mount>)>,
    /// The membership of this mount in a PeerGroup's downstream. Do not access directly. Instead
    /// use upstream(), take_from_upstream(), and set_upstream().
    upstream_: Option<(Weak<PeerGroup>, PtrKey<Mount>)>,
}
258
/// A group of mounts. Setting MS_SHARED on a mount puts it in its own peer group. Any bind mounts
/// of a mount in the group are also added to the group. A mount created in any mount in a peer
/// group will be automatically propagated (recreated) in every other mount in the group.
#[derive(Default)]
struct PeerGroup {
    /// Identifier of the group, obtained from `Kernel::get_next_peer_group_id` when a mount is
    /// made shared.
    id: u64,
    /// The members and downstream mounts of the group.
    state: RwLock<PeerGroupState>,
}
/// The mutable part of a [`PeerGroup`]. Both sets hold weak references, so the group does not
/// keep any mount alive.
#[derive(Default)]
struct PeerGroupState {
    /// The mounts that are members of the group.
    mounts: HashSet<WeakKey<Mount>>,
    /// The mounts that have this group as their upstream: they are propagation targets of the
    /// group without being members of it.
    downstream: HashSet<WeakKey<Mount>>,
}
272
/// Describes what a new [`Mount`] should expose.
pub enum WhatToMount {
    /// Mount the root of the given filesystem.
    Fs(FileSystemHandle),
    /// Bind mount: re-expose the given node, which must belong to a mount (`Mount::new` panics
    /// otherwise).
    Bind(NamespaceNode),
}
277
278impl Mount {
279    pub fn new(what: WhatToMount, flags: MountFlags) -> MountHandle {
280        match what {
281            WhatToMount::Fs(fs) => Self::new_with_root(fs.root().clone(), flags),
282            WhatToMount::Bind(node) => {
283                let mount = node.mount.as_ref().expect("can't bind mount from an anonymous node");
284                mount.clone_mount(&node.entry, flags)
285            }
286        }
287    }
288
289    fn new_with_root(root: DirEntryHandle, flags: MountFlags) -> MountHandle {
290        let known_flags = MountFlags::STORED_ON_MOUNT;
291        assert!(
292            !flags.intersects(!known_flags),
293            "mount created with extra flags {:?}",
294            flags - known_flags
295        );
296        let fs = root.node.fs();
297        let kernel = fs.kernel.upgrade().expect("can't create mount without kernel");
298        Arc::new(Self {
299            id: kernel.get_next_mount_id(),
300            flags: Mutex::new(flags),
301            root,
302            active_client_counter: Default::default(),
303            fs,
304            state: Default::default(),
305        })
306    }
307
308    /// A namespace node referring to the root of the mount.
309    pub fn root(self: &MountHandle) -> NamespaceNode {
310        NamespaceNode::new(Arc::clone(self), Arc::clone(&self.root))
311    }
312
313    /// Returns true if there is a submount on top of `dir_entry`.
314    pub fn has_submount(&self, dir_entry: &DirEntryHandle) -> bool {
315        self.state.read().submounts.contains(ArcKey::ref_cast(dir_entry))
316    }
317
318    /// The NamespaceNode on which this Mount is mounted.
319    fn mountpoint(&self) -> Option<NamespaceNode> {
320        let state = self.state.read();
321        let (mount, entry) = state.mountpoint.as_ref()?;
322        Some(NamespaceNode::new(mount.upgrade()?, entry.clone()))
323    }
324
325    /// Create the specified mount as a child. Also propagate it to the mount's peer group.
326    fn create_submount(
327        self: &MountHandle,
328        dir: &DirEntryHandle,
329        what: WhatToMount,
330        flags: MountFlags,
331    ) {
332        // TODO(tbodt): Making a copy here is necessary for lock ordering, because the peer group
333        // lock nests inside all mount locks (it would be impractical to reverse this because you
334        // need to lock a mount to get its peer group.) But it opens the door to race conditions
335        // where if a peer are concurrently being added, the mount might not get propagated to the
336        // new peer. The only true solution to this is bigger locks, somehow using the same lock
337        // for the peer group and all of the mounts in the group. Since peer groups are fluid and
338        // can have mounts constantly joining and leaving and then joining other groups, the only
339        // sensible locking option is to use a single global lock for all mounts and peer groups.
340        // This is almost impossible to express in rust. Help.
341        //
342        // Update: Also necessary to make a copy to prevent excess replication, see the comment on
343        // the following Mount::new call.
344        let peers = {
345            let state = self.state.read();
346            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
347        };
348
349        // Create the mount after copying the peer groups, because in the case of creating a bind
350        // mount inside itself, the new mount would get added to our peer group during the
351        // Mount::new call, but we don't want to replicate into it already. For an example see
352        // MountTest.QuizBRecursion.
353        let mount = Mount::new(what, flags);
354
355        if self.read().is_shared() {
356            mount.write().make_shared();
357        }
358
359        for peer in peers {
360            if Arc::ptr_eq(self, &peer) {
361                continue;
362            }
363            let clone = mount.clone_mount_recursive();
364            peer.write().add_submount_internal(dir, clone);
365        }
366
367        self.write().add_submount_internal(dir, mount)
368    }
369
370    fn remove_submount(
371        self: &MountHandle,
372        mount_hash_key: &ArcKey<DirEntry>,
373        propagate: bool,
374    ) -> Result<(), Errno> {
375        if propagate {
376            // create_submount explains why we need to make a copy of peers.
377            let peers = {
378                let state = self.state.read();
379                state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
380            };
381
382            for peer in peers {
383                if Arc::ptr_eq(self, &peer) {
384                    continue;
385                }
386                let _ = peer.write().remove_submount_internal(mount_hash_key);
387            }
388        }
389
390        self.write().remove_submount_internal(mount_hash_key)
391    }
392
393    /// Create a new mount with the same filesystem, flags, and peer group. Used to implement bind
394    /// mounts.
395    fn clone_mount(
396        self: &MountHandle,
397        new_root: &DirEntryHandle,
398        flags: MountFlags,
399    ) -> MountHandle {
400        assert!(new_root.is_descendant_of(&self.root));
401        // According to mount(2) on bind mounts, all flags other than MS_REC are ignored when doing
402        // a bind mount.
403        let clone = Self::new_with_root(Arc::clone(new_root), self.flags());
404
405        if flags.contains(MountFlags::REC) {
406            // This is two steps because the alternative (locking clone.state while iterating over
407            // self.state.submounts) trips tracing_mutex. The lock ordering is parent -> child, and
408            // if the clone is eventually made a child of self, this looks like an ordering
409            // violation. I'm not convinced it's a real issue, but I can't convince myself it's not
410            // either.
411            let mut submounts = vec![];
412            for Submount { dir, mount } in &self.state.read().submounts {
413                submounts.push((dir.clone(), mount.clone_mount_recursive()));
414            }
415            let mut clone_state = clone.write();
416            for (dir, submount) in submounts {
417                clone_state.add_submount_internal(&dir, submount);
418            }
419        }
420
421        // Put the clone in the same peer group
422        let peer_group = self.state.read().peer_group().map(Arc::clone);
423        if let Some(peer_group) = peer_group {
424            clone.write().set_peer_group(peer_group);
425        }
426
427        clone
428    }
429
430    /// Do a clone of the full mount hierarchy below this mount. Used for creating mount
431    /// namespaces and creating copies to use for propagation.
432    fn clone_mount_recursive(self: &MountHandle) -> MountHandle {
433        self.clone_mount(&self.root, MountFlags::REC)
434    }
435
436    pub fn change_propagation(self: &MountHandle, flag: MountFlags, recursive: bool) {
437        let mut state = self.write();
438        match flag {
439            MountFlags::SHARED => state.make_shared(),
440            MountFlags::PRIVATE => state.make_private(),
441            MountFlags::DOWNSTREAM => state.make_downstream(),
442            _ => {
443                log_warn!("mount propagation {:?}", flag);
444                return;
445            }
446        }
447
448        if recursive {
449            for submount in &state.submounts {
450                submount.mount.change_propagation(flag, recursive);
451            }
452        }
453    }
454
455    fn flags(&self) -> MountFlags {
456        *self.flags.lock()
457    }
458
459    pub fn update_flags(self: &MountHandle, mut flags: MountFlags) {
460        flags &= MountFlags::STORED_ON_MOUNT;
461        let atime_flags = MountFlags::NOATIME
462            | MountFlags::NODIRATIME
463            | MountFlags::RELATIME
464            | MountFlags::STRICTATIME;
465        let mut stored_flags = self.flags.lock();
466        if !flags.intersects(atime_flags) {
467            // Since Linux 3.17, if none of MS_NOATIME, MS_NODIRATIME,
468            // MS_RELATIME, or MS_STRICTATIME is specified in mountflags, then
469            // the remount operation preserves the existing values of these
470            // flags (rather than defaulting to MS_RELATIME).
471            flags |= *stored_flags & atime_flags;
472        }
473        // The "effect [of MS_STRICTATIME] is to clear the MS_NOATIME and MS_RELATIME flags."
474        flags &= !MountFlags::STRICTATIME;
475        *stored_flags = flags;
476    }
477
478    /// The number of active clients of this mount.
479    ///
480    /// The mount cannot be unmounted if there are any active clients.
481    fn active_clients(&self) -> usize {
482        // We need to subtract one for our own reference. We are not a real client.
483        Arc::strong_count(&self.active_client_counter) - 1
484    }
485
486    pub fn unmount(&self, flags: UnmountFlags, propagate: bool) -> Result<(), Errno> {
487        if !flags.contains(UnmountFlags::DETACH) {
488            if self.active_clients() > 0 || !self.state.read().submounts.is_empty() {
489                return error!(EBUSY);
490            }
491        }
492        let mountpoint = self.mountpoint().ok_or_else(|| errno!(EINVAL))?;
493        let parent_mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
494        parent_mount.remove_submount(mountpoint.mount_hash_key(), propagate)
495    }
496
497    /// Returns the security state of the fs.
498    pub fn security_state(&self) -> &security::FileSystemState {
499        &self.fs.security_state
500    }
501
502    /// Returns the name of the fs.
503    pub fn fs_name(&self) -> &'static FsStr {
504        self.fs.name()
505    }
506
507    state_accessor!(Mount, state, Arc<Mount>);
508}
509
510impl MountState {
511    /// Return this mount's current peer group.
512    fn peer_group(&self) -> Option<&Arc<PeerGroup>> {
513        let (group, _) = self.peer_group_.as_ref()?;
514        Some(group)
515    }
516
517    /// Remove this mount from its peer group and return the peer group.
518    fn take_from_peer_group(&mut self) -> Option<Arc<PeerGroup>> {
519        let (old_group, old_mount) = self.peer_group_.take()?;
520        old_group.remove(old_mount);
521        if let Some(upstream) = self.take_from_upstream() {
522            let next_mount =
523                old_group.state.read().mounts.iter().next().map(|w| w.0.upgrade().unwrap());
524            if let Some(next_mount) = next_mount {
525                // TODO(https://fxbug.dev/42065259): Fix the lock ordering here. We've locked next_mount
526                // while self is locked, and since the propagation tree and mount tree are
527                // separate, this could violate the mount -> submount order previously established.
528                next_mount.write().set_upstream(upstream);
529            }
530        }
531        Some(old_group)
532    }
533
534    fn upstream(&self) -> Option<Arc<PeerGroup>> {
535        self.upstream_.as_ref().and_then(|g| g.0.upgrade())
536    }
537
538    fn take_from_upstream(&mut self) -> Option<Arc<PeerGroup>> {
539        let (old_upstream, old_mount) = self.upstream_.take()?;
540        // TODO(tbodt): Reason about whether the upgrade() could possibly return None, and what we
541        // should actually do in that case.
542        let old_upstream = old_upstream.upgrade()?;
543        old_upstream.remove_downstream(old_mount);
544        Some(old_upstream)
545    }
546}
547
548#[apply(state_implementation!)]
549impl MountState<Base = Mount, BaseType = Arc<Mount>> {
550    /// Add a child mount *without propagating it to the peer group*. For internal use only.
551    fn add_submount_internal(&mut self, dir: &DirEntryHandle, mount: MountHandle) {
552        if !dir.is_descendant_of(&self.base.root) {
553            return;
554        }
555
556        let submount = mount.fs.kernel.upgrade().unwrap().mounts.register_mount(dir, mount.clone());
557        let old_mountpoint =
558            mount.state.write().mountpoint.replace((Arc::downgrade(self.base), Arc::clone(dir)));
559        assert!(old_mountpoint.is_none(), "add_submount can only take a newly created mount");
560        // Mount shadowing is implemented by mounting onto the root of the first mount, not by
561        // creating two mounts on the same mountpoint.
562        let old_mount = self.submounts.replace(submount);
563
564        // In rare cases, mount propagation might result in a request to mount on a directory where
565        // something is already mounted. MountTest.LotsOfShadowing will trigger this. Linux handles
566        // this by inserting the new mount between the old mount and the current mount.
567        if let Some(mut old_mount) = old_mount {
568            // Previous state: self[dir] = old_mount
569            // New state: self[dir] = new_mount, new_mount[new_mount.root] = old_mount
570            // The new mount has already been inserted into self, now just update the old mount to
571            // be a child of the new mount.
572            old_mount.mount.write().mountpoint = Some((Arc::downgrade(&mount), Arc::clone(dir)));
573            old_mount.dir = ArcKey(mount.root.clone());
574            mount.write().submounts.insert(old_mount);
575        }
576    }
577
578    fn remove_submount_internal(&mut self, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
579        if self.submounts.remove(mount_hash_key) { Ok(()) } else { error!(EINVAL) }
580    }
581
582    /// Set this mount's peer group.
583    fn set_peer_group(&mut self, group: Arc<PeerGroup>) {
584        self.take_from_peer_group();
585        group.add(self.base);
586        self.peer_group_ = Some((group, Arc::as_ptr(self.base).into()));
587    }
588
589    fn set_upstream(&mut self, group: Arc<PeerGroup>) {
590        self.take_from_upstream();
591        group.add_downstream(self.base);
592        self.upstream_ = Some((Arc::downgrade(&group), Arc::as_ptr(self.base).into()));
593    }
594
595    /// Is the mount in a peer group? Corresponds to MS_SHARED.
596    pub fn is_shared(&self) -> bool {
597        self.peer_group().is_some()
598    }
599
600    /// Put the mount in a peer group. Implements MS_SHARED.
601    pub fn make_shared(&mut self) {
602        if self.is_shared() {
603            return;
604        }
605        let kernel =
606            self.base.fs.kernel.upgrade().expect("can't create new peer group without kernel");
607        self.set_peer_group(PeerGroup::new(kernel.get_next_peer_group_id()));
608    }
609
610    /// Take the mount out of its peer group, also remove upstream if any. Implements MS_PRIVATE.
611    pub fn make_private(&mut self) {
612        self.take_from_peer_group();
613        self.take_from_upstream();
614    }
615
616    /// Take the mount out of its peer group and make it downstream instead. Implements
617    /// MountFlags::DOWNSTREAM (MS_SLAVE).
618    pub fn make_downstream(&mut self) {
619        if let Some(peer_group) = self.take_from_peer_group() {
620            self.set_upstream(peer_group);
621        }
622    }
623}
624
625impl PeerGroup {
626    fn new(id: u64) -> Arc<Self> {
627        Arc::new(Self { id, state: Default::default() })
628    }
629
630    fn add(&self, mount: &Arc<Mount>) {
631        self.state.write().mounts.insert(WeakKey::from(mount));
632    }
633
634    fn remove(&self, mount: PtrKey<Mount>) {
635        self.state.write().mounts.remove(&mount);
636    }
637
638    fn add_downstream(&self, mount: &Arc<Mount>) {
639        self.state.write().downstream.insert(WeakKey::from(mount));
640    }
641
642    fn remove_downstream(&self, mount: PtrKey<Mount>) {
643        self.state.write().downstream.remove(&mount);
644    }
645
646    fn copy_propagation_targets(&self) -> Vec<MountHandle> {
647        let mut buf = vec![];
648        self.collect_propagation_targets(&mut buf);
649        buf
650    }
651
652    fn collect_propagation_targets(&self, buf: &mut Vec<MountHandle>) {
653        let downstream_mounts: Vec<_> = {
654            let state = self.state.read();
655            buf.extend(state.mounts.iter().filter_map(|m| m.0.upgrade()));
656            state.downstream.iter().filter_map(|m| m.0.upgrade()).collect()
657        };
658        for mount in downstream_mounts {
659            let peer_group = mount.read().peer_group().map(Arc::clone);
660            match peer_group {
661                Some(group) => group.collect_propagation_targets(buf),
662                None => buf.push(mount),
663            }
664        }
665    }
666}
667
/// Unlinks a dying mount from the propagation graph, so that its peer group and its upstream do
/// not keep entries for it.
impl Drop for Mount {
    fn drop(&mut self) {
        // `&mut self` gives exclusive access to the state, so `get_mut` is used instead of the
        // lock.
        let state = self.state.get_mut();
        // Leaving the peer group already removes the upstream link when the mount was in a group;
        // the explicit call below covers a mount that had an upstream but no peer group.
        state.take_from_peer_group();
        state.take_from_upstream();
    }
}
675
676impl fmt::Debug for Mount {
677    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
678        let state = self.state.read();
679        f.debug_struct("Mount")
680            .field("id", &(self as *const Mount))
681            .field("root", &self.root)
682            .field("mountpoint", &state.mountpoint)
683            .field("submounts", &state.submounts)
684            .finish()
685    }
686}
687
/// Identifier allocation for the objects defined in this file.
impl Kernel {
    /// Returns the next value of the kernel's `next_mount_id` counter; used as the id of a new
    /// [`Mount`].
    pub fn get_next_mount_id(&self) -> u64 {
        self.next_mount_id.next()
    }

    /// Returns the next value of the kernel's `next_peer_group_id` counter; used as the id of a
    /// new peer group.
    pub fn get_next_peer_group_id(&self) -> u64 {
        self.next_peer_group_id.next()
    }

    /// Returns the next value of the kernel's `next_namespace_id` counter; used as the id of a
    /// new [`Namespace`].
    pub fn get_next_namespace_id(&self) -> u64 {
        self.next_namespace_id.next()
    }
}
701
702impl CurrentTask {
703    pub fn create_filesystem(
704        &self,
705        locked: &mut Locked<Unlocked>,
706        fs_type: &FsStr,
707        options: FileSystemOptions,
708    ) -> Result<FileSystemHandle, Errno> {
709        // Please register new file systems via //src/starnix/modules/lib.rs, even if the file
710        // system is implemented inside starnix_core.
711        //
712        // Most file systems should be implemented as modules. The VFS provides various traits that
713        // let starnix_core integrate file systems without needing to depend on the file systems
714        // directly.
715        self.kernel()
716            .expando
717            .get::<FsRegistry>()
718            .create(locked, self, fs_type, options)
719            .ok_or_else(|| errno!(ENODEV, fs_type))?
720    }
721}
722
723// Writes to `sink` the mount flags and LSM mount options for the given `mount`.
724fn write_mount_info(task: &Task, sink: &mut DynamicFileBuf, mount: &Mount) -> Result<(), Errno> {
725    write!(sink, "{}", mount.flags())?;
726    security::sb_show_options(&task.kernel(), sink, &mount)
727}
728
729struct ProcMountsFileSource(WeakRef<Task>);
730
731impl DynamicFileSource for ProcMountsFileSource {
732    fn generate(
733        &self,
734        _current_task: &CurrentTask,
735        sink: &mut DynamicFileBuf,
736    ) -> Result<(), Errno> {
737        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
738        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
739        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
740        // extra work to maintain it.
741        let task = Task::from_weak(&self.0)?;
742        let root = task.fs().root();
743        let ns = task.fs().namespace();
744        for_each_mount(&ns.root_mount, &mut |mount| {
745            let mountpoint = mount.mountpoint().unwrap_or_else(|| mount.root());
746            if !mountpoint.is_descendant_of(&root) {
747                return Ok(());
748            }
749            write!(
750                sink,
751                "{} {} {} ",
752                mount.fs.options.source_for_display(),
753                mountpoint.path(&task),
754                mount.fs.name(),
755            )?;
756            write_mount_info(&task, sink, mount)?;
757            writeln!(sink, " 0 0")?;
758            Ok(())
759        })?;
760        Ok(())
761    }
762}
763
764pub struct ProcMountsFile {
765    dynamic_file: DynamicFile<ProcMountsFileSource>,
766}
767
768impl ProcMountsFile {
769    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
770        SimpleFileNode::new(move || {
771            Ok(Self { dynamic_file: DynamicFile::new(ProcMountsFileSource(task.clone())) })
772        })
773    }
774}
775
/// Reads and seeks are delegated to the inner dynamic file; everything else is stubbed.
impl FileOps for ProcMountsFile {
    fileops_impl_delegate_read_and_seek!(self, self.dynamic_file);
    fileops_impl_noop_sync!();

    /// Writing is not supported: always fails with `ENOSYS`.
    fn write(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        _offset: usize,
        _data: &mut dyn InputBuffer,
    ) -> Result<usize, Errno> {
        error!(ENOSYS)
    }

    /// Registers a placeholder wait on `waiter`; the requested events and the handler are
    /// ignored.
    fn wait_async(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        waiter: &Waiter,
        _events: FdEvents,
        _handler: EventHandler,
    ) -> Option<WaitCanceler> {
        // Polling this file gives notifications when any change to mounts occurs. This is not
        // implemented yet, but stubbed for Android init.
        Some(waiter.fake_wait())
    }

    /// Always reports an empty event set.
    fn query_events(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
    ) -> Result<FdEvents, Errno> {
        Ok(FdEvents::empty())
    }
}
814
815#[derive(Clone)]
816pub struct ProcMountinfoFile(WeakRef<Task>);
817impl ProcMountinfoFile {
818    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
819        DynamicFile::new_node(Self(task))
820    }
821}
822impl DynamicFileSource for ProcMountinfoFile {
823    fn generate(
824        &self,
825        _current_task: &CurrentTask,
826        sink: &mut DynamicFileBuf,
827    ) -> Result<(), Errno> {
828        // Returns path to the `dir` from the root of the file system.
829        fn path_from_fs_root(dir: &DirEntryHandle) -> FsString {
830            let mut path = PathBuilder::new();
831            if dir.read().is_dead() {
832                // Return `/foo/dir//deleted` if the dir was deleted.
833                path.prepend_element("/deleted".into());
834            }
835            let mut current = dir.clone();
836            loop {
837                let parent = {
838                    let state = current.read();
839                    if state.parent().is_some() {
840                        path.prepend_element(state.local_name());
841                    }
842                    state.parent().clone()
843                };
844                if let Some(next) = parent {
845                    current = next
846                } else {
847                    break;
848                }
849            }
850            path.build_absolute()
851        }
852
853        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
854        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
855        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
856        // extra work to maintain it.
857        let task = Task::from_weak(&self.0)?;
858        let root = task.fs().root();
859        let ns = task.fs().namespace();
860        for_each_mount(&ns.root_mount, &mut |mount| {
861            let mountpoint = mount.mountpoint().unwrap_or_else(|| mount.root());
862            if !mountpoint.is_descendant_of(&root) {
863                return Ok(());
864            }
865            // Can't fail, mountpoint() and root() can't return a NamespaceNode with no mount
866            let parent = mountpoint.mount.as_ref().unwrap();
867            write!(
868                sink,
869                "{} {} {} {} {} ",
870                mount.id,
871                parent.id,
872                mount.root.node.fs().dev_id,
873                path_from_fs_root(&mount.root),
874                mountpoint.path(&task),
875            )?;
876            write_mount_info(&task, sink, mount)?;
877            if let Some(peer_group) = mount.read().peer_group() {
878                write!(sink, " shared:{}", peer_group.id)?;
879            }
880            if let Some(upstream) = mount.read().upstream() {
881                write!(sink, " master:{}", upstream.id)?;
882            }
883            writeln!(
884                sink,
885                " - {} {} {}",
886                mount.fs.name(),
887                mount.fs.options.source_for_display(),
888                mount.fs.options.flags,
889            )?;
890            Ok(())
891        })?;
892        Ok(())
893    }
894}
895
896fn for_each_mount<E>(
897    mount: &MountHandle,
898    callback: &mut impl FnMut(&MountHandle) -> Result<(), E>,
899) -> Result<(), E> {
900    callback(mount)?;
901    // Collect list first to avoid self deadlock when ProcMountinfoFile::read_at tries to call
902    // NamespaceNode::path()
903    let submounts: Vec<_> = mount.read().submounts.iter().map(|s| s.mount.clone()).collect();
904    for submount in submounts {
905        for_each_mount(&submount, callback)?;
906    }
907    Ok(())
908}
909
/// The `SymlinkMode` enum encodes how symlinks are followed during path traversal.
///
/// `Follow` is the default variant.
#[derive(Default, PartialEq, Eq, Copy, Clone, Debug)]
pub enum SymlinkMode {
    /// Follow a symlink at the end of a path resolution.
    #[default]
    Follow,

    /// Do not follow a symlink at the end of a path resolution.
    NoFollow,
}
920
/// The maximum number of symlink traversals that can be made during path resolution.
///
/// `LookupContext::new` uses this value as the initial `remaining_follows` budget.
pub const MAX_SYMLINK_FOLLOWS: u8 = 40;
923
/// The context passed during namespace lookups.
///
/// Namespace lookups need to mutate a shared context in order to correctly
/// count the number of remaining symlink traversals.
pub struct LookupContext {
    /// The SymlinkMode for the lookup.
    ///
    /// Controls whether a symlink found at the end of the path is followed.
    pub symlink_mode: SymlinkMode,

    /// The number of symlinks remaining to follow.
    ///
    /// Each time path resolution calls readlink, this value is decremented.
    pub remaining_follows: u8,

    /// Whether the result of the lookup must be a directory.
    ///
    /// For example, if the path ends with a `/` or if userspace passes
    /// O_DIRECTORY. This flag can be set to true if the lookup encounters a
    /// symlink that ends with a `/`.
    pub must_be_directory: bool,

    /// Resolve flags passed to `openat2`. Empty if the lookup originated in any other syscall.
    pub resolve_flags: ResolveFlags,

    /// Base directory for the lookup. Set only when either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT`
    /// is passed to `openat2`.
    pub resolve_base: ResolveBase,
}
954
/// Used to specify base directory in `LookupContext` for lookups originating in the `openat2`
/// syscall with either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT` flag.
#[derive(Clone, Eq, PartialEq)]
pub enum ResolveBase {
    /// No base directory restriction applies to the lookup.
    None,

    /// The lookup is not allowed to traverse any node that's not beneath the specified node.
    Beneath(NamespaceNode),

    /// The lookup should be handled as if the specified node is the file-system root.
    InRoot(NamespaceNode),
}
967
968impl LookupContext {
969    pub fn new(symlink_mode: SymlinkMode) -> LookupContext {
970        LookupContext {
971            symlink_mode,
972            remaining_follows: MAX_SYMLINK_FOLLOWS,
973            must_be_directory: false,
974            resolve_flags: ResolveFlags::empty(),
975            resolve_base: ResolveBase::None,
976        }
977    }
978
979    pub fn with(&self, symlink_mode: SymlinkMode) -> LookupContext {
980        LookupContext { symlink_mode, resolve_base: self.resolve_base.clone(), ..*self }
981    }
982
983    pub fn update_for_path(&mut self, path: &FsStr) {
984        if path.last() == Some(&b'/') {
985            // The last path element must resolve to a directory. This is because a trailing slash
986            // was found in the path.
987            self.must_be_directory = true;
988            // If the last path element is a symlink, we should follow it.
989            // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
990            self.symlink_mode = SymlinkMode::Follow;
991        }
992    }
993}
994
995impl Default for LookupContext {
996    fn default() -> Self {
997        LookupContext::new(SymlinkMode::Follow)
998    }
999}
1000
1001/// Whether the path is reachable from the given root.
1002pub enum PathWithReachability {
1003    /// The path is reachable from the given root.
1004    Reachable(FsString),
1005
1006    /// The path is not reachable from the given root.
1007    Unreachable(FsString),
1008}
1009
1010impl PathWithReachability {
1011    pub fn into_path(self) -> FsString {
1012        match self {
1013            PathWithReachability::Reachable(path) => path,
1014            PathWithReachability::Unreachable(path) => path,
1015        }
1016    }
1017}
1018
/// A node in a mount namespace.
///
/// This tree is a composite of the mount tree and the FsNode tree.
///
/// These nodes are used when traversing paths in a namespace in order to
/// present the client the directory structure that includes the mounted
/// filesystems.
#[derive(Clone)]
pub struct NamespaceNode {
    /// The mount where this namespace node is mounted.
    ///
    /// A given FsNode can be mounted in multiple places in a namespace. This
    /// field distinguishes between them. Nodes created with `new_anonymous` carry no mount.
    pub mount: MountInfo,

    /// The directory entry (and, through it, the FsNode) that corresponds to this namespace
    /// entry.
    pub entry: DirEntryHandle,
}
1037
1038impl NamespaceNode {
1039    pub fn new(mount: MountHandle, entry: DirEntryHandle) -> Self {
1040        Self { mount: Some(mount).into(), entry }
1041    }
1042
1043    /// Create a namespace node that is not mounted in a namespace.
1044    pub fn new_anonymous(entry: DirEntryHandle) -> Self {
1045        Self { mount: None.into(), entry }
1046    }
1047
    /// Create a namespace node that is not mounted in a namespace and that refers to a node that
    /// is not rooted in a hierarchy and has no name.
    ///
    /// The node is wrapped in a fresh unrooted `DirEntry`, which is then passed to the security
    /// initialization hook before being turned into an anonymous namespace node.
    pub fn new_anonymous_unrooted(current_task: &CurrentTask, node: FsNodeHandle) -> Self {
        let dir_entry = DirEntry::new_unrooted(node);
        // The hook's result is deliberately discarded: construction proceeds whatever it returns.
        let _ = security::fs_node_init_with_dentry_no_xattr(current_task, &dir_entry);
        Self::new_anonymous(dir_entry)
    }
1055
1056    /// Create a FileObject corresponding to this namespace node.
1057    ///
1058    /// This function is the primary way of instantiating FileObjects. Each
1059    /// FileObject records the NamespaceNode that created it in order to
1060    /// remember its path in the Namespace.
1061    pub fn open(
1062        &self,
1063        locked: &mut Locked<Unlocked>,
1064        current_task: &CurrentTask,
1065        flags: OpenFlags,
1066        access_check: AccessCheck,
1067    ) -> Result<FileHandle, Errno> {
1068        let ops = self.entry.node.open(locked, current_task, self, flags, access_check)?;
1069        FileObject::new(locked, current_task, ops, self.clone(), flags)
1070    }
1071
1072    /// Create or open a node in the file system.
1073    ///
1074    /// Works for any type of node other than a symlink.
1075    ///
1076    /// Will return an existing node unless `flags` contains `OpenFlags::EXCL`.
1077    pub fn open_create_node<L>(
1078        &self,
1079        locked: &mut Locked<L>,
1080        current_task: &CurrentTask,
1081        name: &FsStr,
1082        mode: FileMode,
1083        dev: DeviceType,
1084        flags: OpenFlags,
1085    ) -> Result<NamespaceNode, Errno>
1086    where
1087        L: LockEqualOrBefore<FileOpsCore>,
1088    {
1089        let owner = current_task.current_fscred();
1090        let mode = current_task.fs().apply_umask(mode);
1091        let create_fn =
1092            |locked: &mut Locked<L>, dir: &FsNodeHandle, mount: &MountInfo, name: &_| {
1093                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
1094            };
1095        let entry = if flags.contains(OpenFlags::EXCL) {
1096            self.entry.create_entry(locked, current_task, &self.mount, name, create_fn)
1097        } else {
1098            self.entry.get_or_create_entry(locked, current_task, &self.mount, name, create_fn)
1099        }?;
1100        Ok(self.with_new_entry(entry))
1101    }
1102
    /// Consumes this node and wraps it in an `ActiveNamespaceNode`.
    pub fn into_active(self) -> ActiveNamespaceNode {
        ActiveNamespaceNode::new(self)
    }
1106
1107    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
1108        self.into_active().into_mapping(mode)
1109    }
1110
1111    /// Create a node in the file system.
1112    ///
1113    /// Works for any type of node other than a symlink.
1114    ///
1115    /// Does not return an existing node.
1116    pub fn create_node<L>(
1117        &self,
1118        locked: &mut Locked<L>,
1119        current_task: &CurrentTask,
1120        name: &FsStr,
1121        mode: FileMode,
1122        dev: DeviceType,
1123    ) -> Result<NamespaceNode, Errno>
1124    where
1125        L: LockEqualOrBefore<FileOpsCore>,
1126    {
1127        let owner = current_task.current_fscred();
1128        let mode = current_task.fs().apply_umask(mode);
1129        let entry = self.entry.create_entry(
1130            locked,
1131            current_task,
1132            &self.mount,
1133            name,
1134            |locked, dir, mount, name| {
1135                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
1136            },
1137        )?;
1138        Ok(self.with_new_entry(entry))
1139    }
1140
1141    /// Create a symlink in the file system.
1142    ///
1143    /// To create another type of node, use `create_node`.
1144    pub fn create_symlink<L>(
1145        &self,
1146        locked: &mut Locked<L>,
1147        current_task: &CurrentTask,
1148        name: &FsStr,
1149        target: &FsStr,
1150    ) -> Result<NamespaceNode, Errno>
1151    where
1152        L: LockEqualOrBefore<FileOpsCore>,
1153    {
1154        let owner = current_task.current_fscred();
1155        let entry = self.entry.create_entry(
1156            locked,
1157            current_task,
1158            &self.mount,
1159            name,
1160            |locked, dir, mount, name| {
1161                dir.create_symlink(locked, current_task, mount, name, target, owner)
1162            },
1163        )?;
1164        Ok(self.with_new_entry(entry))
1165    }
1166
1167    /// Creates an anonymous file.
1168    ///
1169    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
1170    ///
1171    /// Used by O_TMPFILE.
1172    pub fn create_tmpfile<L>(
1173        &self,
1174        locked: &mut Locked<L>,
1175        current_task: &CurrentTask,
1176        mode: FileMode,
1177        flags: OpenFlags,
1178    ) -> Result<NamespaceNode, Errno>
1179    where
1180        L: LockEqualOrBefore<FileOpsCore>,
1181    {
1182        let owner = current_task.current_fscred();
1183        let mode = current_task.fs().apply_umask(mode);
1184        Ok(self.with_new_entry(self.entry.create_tmpfile(
1185            locked,
1186            current_task,
1187            &self.mount,
1188            mode,
1189            owner,
1190            flags,
1191        )?))
1192    }
1193
1194    pub fn link<L>(
1195        &self,
1196        locked: &mut Locked<L>,
1197        current_task: &CurrentTask,
1198        name: &FsStr,
1199        child: &FsNodeHandle,
1200    ) -> Result<NamespaceNode, Errno>
1201    where
1202        L: LockEqualOrBefore<FileOpsCore>,
1203    {
1204        let dir_entry = self.entry.create_entry(
1205            locked,
1206            current_task,
1207            &self.mount,
1208            name,
1209            |locked, dir, mount, name| dir.link(locked, current_task, mount, name, child),
1210        )?;
1211        Ok(self.with_new_entry(dir_entry))
1212    }
1213
1214    pub fn bind_socket<L>(
1215        &self,
1216        locked: &mut Locked<L>,
1217        current_task: &CurrentTask,
1218        name: &FsStr,
1219        socket: SocketHandle,
1220        socket_address: SocketAddress,
1221        mode: FileMode,
1222    ) -> Result<NamespaceNode, Errno>
1223    where
1224        L: LockEqualOrBefore<FileOpsCore>,
1225    {
1226        let dir_entry = self.entry.create_entry(
1227            locked,
1228            current_task,
1229            &self.mount,
1230            name,
1231            |locked, dir, mount, name| {
1232                let node = dir.create_node(
1233                    locked,
1234                    current_task,
1235                    mount,
1236                    name,
1237                    mode,
1238                    DeviceType::NONE,
1239                    current_task.current_fscred(),
1240                )?;
1241                if let Some(unix_socket) = socket.downcast_socket::<UnixSocket>() {
1242                    unix_socket.bind_socket_to_node(&socket, socket_address, &node)?;
1243                } else {
1244                    return error!(ENOTSUP);
1245                }
1246                Ok(node)
1247            },
1248        )?;
1249        Ok(self.with_new_entry(dir_entry))
1250    }
1251
1252    pub fn unlink<L>(
1253        &self,
1254        locked: &mut Locked<L>,
1255        current_task: &CurrentTask,
1256        name: &FsStr,
1257        kind: UnlinkKind,
1258        must_be_directory: bool,
1259    ) -> Result<(), Errno>
1260    where
1261        L: LockEqualOrBefore<FileOpsCore>,
1262    {
1263        if DirEntry::is_reserved_name(name) {
1264            match kind {
1265                UnlinkKind::Directory => {
1266                    if name == ".." {
1267                        error!(ENOTEMPTY)
1268                    } else if self.parent().is_none() {
1269                        // The client is attempting to remove the root.
1270                        error!(EBUSY)
1271                    } else {
1272                        error!(EINVAL)
1273                    }
1274                }
1275                UnlinkKind::NonDirectory => error!(ENOTDIR),
1276            }
1277        } else {
1278            self.entry.unlink(locked, current_task, &self.mount, name, kind, must_be_directory)
1279        }
1280    }
1281
1282    /// Traverse down a parent-to-child link in the namespace.
1283    pub fn lookup_child<L>(
1284        &self,
1285        locked: &mut Locked<L>,
1286        current_task: &CurrentTask,
1287        context: &mut LookupContext,
1288        basename: &FsStr,
1289    ) -> Result<NamespaceNode, Errno>
1290    where
1291        L: LockEqualOrBefore<FileOpsCore>,
1292    {
1293        if !self.entry.node.is_dir() {
1294            return error!(ENOTDIR);
1295        }
1296
1297        if basename.len() > NAME_MAX as usize {
1298            return error!(ENAMETOOLONG);
1299        }
1300
1301        let child = if basename.is_empty() || basename == "." {
1302            self.clone()
1303        } else if basename == ".." {
1304            let root = match &context.resolve_base {
1305                ResolveBase::None => current_task.fs().root(),
1306                ResolveBase::Beneath(node) => {
1307                    // Do not allow traversal out of the 'node'.
1308                    if *self == *node {
1309                        return error!(EXDEV);
1310                    }
1311                    current_task.fs().root()
1312                }
1313                ResolveBase::InRoot(root) => root.clone(),
1314            };
1315
1316            // Make sure this can't escape a chroot.
1317            if *self == root { root } else { self.parent().unwrap_or_else(|| self.clone()) }
1318        } else {
1319            let mut child = self.with_new_entry(self.entry.component_lookup(
1320                locked,
1321                current_task,
1322                &self.mount,
1323                basename,
1324            )?);
1325            while child.entry.node.is_lnk() {
1326                match context.symlink_mode {
1327                    SymlinkMode::NoFollow => {
1328                        break;
1329                    }
1330                    SymlinkMode::Follow => {
1331                        if context.remaining_follows == 0
1332                            || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
1333                        {
1334                            return error!(ELOOP);
1335                        }
1336                        context.remaining_follows -= 1;
1337                        child = match child.readlink(locked, current_task)? {
1338                            SymlinkTarget::Path(link_target) => {
1339                                let link_directory = if link_target[0] == b'/' {
1340                                    // If the path is absolute, we'll resolve the root directory.
1341                                    match &context.resolve_base {
1342                                        ResolveBase::None => current_task.fs().root(),
1343                                        ResolveBase::Beneath(_) => return error!(EXDEV),
1344                                        ResolveBase::InRoot(root) => root.clone(),
1345                                    }
1346                                } else {
1347                                    // If the path is not absolute, it's a relative directory. Let's
1348                                    // try to get the parent of the current child, or in the case
1349                                    // that the child is the root we can just use that directly.
1350                                    child.parent().unwrap_or(child)
1351                                };
1352                                current_task.lookup_path(
1353                                    locked,
1354                                    context,
1355                                    link_directory,
1356                                    link_target.as_ref(),
1357                                )?
1358                            }
1359                            SymlinkTarget::Node(node) => {
1360                                if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS) {
1361                                    return error!(ELOOP);
1362                                }
1363                                node
1364                            }
1365                        }
1366                    }
1367                };
1368            }
1369
1370            child.enter_mount()
1371        };
1372
1373        if context.resolve_flags.contains(ResolveFlags::NO_XDEV) && child.mount != self.mount {
1374            return error!(EXDEV);
1375        }
1376
1377        if context.must_be_directory && !child.entry.node.is_dir() {
1378            return error!(ENOTDIR);
1379        }
1380
1381        Ok(child)
1382    }
1383
1384    /// Traverse up a child-to-parent link in the namespace.
1385    ///
1386    /// This traversal matches the child-to-parent link in the underlying
1387    /// FsNode except at mountpoints, where the link switches from one
1388    /// filesystem to another.
1389    pub fn parent(&self) -> Option<NamespaceNode> {
1390        let mountpoint_or_self = self.escape_mount();
1391        let parent = mountpoint_or_self.entry.read().parent().clone()?;
1392        Some(mountpoint_or_self.with_new_entry(parent))
1393    }
1394
1395    /// Returns the parent, but does not escape mounts i.e. returns None if this node
1396    /// is the root of a mount.
1397    pub fn parent_within_mount(&self) -> Option<DirEntryHandle> {
1398        if let Ok(_) = self.mount_if_root() {
1399            return None;
1400        }
1401        self.entry.read().parent().clone()
1402    }
1403
1404    /// Whether this namespace node is a descendant of the given node.
1405    ///
1406    /// Walks up the namespace node tree looking for ancestor. If ancestor is
1407    /// found, returns true. Otherwise, returns false.
1408    pub fn is_descendant_of(&self, ancestor: &NamespaceNode) -> bool {
1409        let ancestor = ancestor.escape_mount();
1410        let mut current = self.escape_mount();
1411        while current != ancestor {
1412            if let Some(parent) = current.parent() {
1413                current = parent.escape_mount();
1414            } else {
1415                return false;
1416            }
1417        }
1418        true
1419    }
1420
1421    /// If this is a mount point, return the root of the mount. Otherwise return self.
1422    fn enter_mount(&self) -> NamespaceNode {
1423        // While the child is a mountpoint, replace child with the mount's root.
1424        fn enter_one_mount(node: &NamespaceNode) -> Option<NamespaceNode> {
1425            if let Some(mount) = node.mount.deref() {
1426                if let Some(submount) =
1427                    mount.state.read().submounts.get(ArcKey::ref_cast(&node.entry))
1428                {
1429                    return Some(submount.mount.root());
1430                }
1431            }
1432            None
1433        }
1434        let mut inner = self.clone();
1435        while let Some(inner_root) = enter_one_mount(&inner) {
1436            inner = inner_root;
1437        }
1438        inner
1439    }
1440
1441    /// If this is the root of a mount, return the mount point. Otherwise return self.
1442    ///
1443    /// This is not exactly the same as parent(). If parent() is called on a root, it will escape
1444    /// the mount, but then return the parent of the mount point instead of the mount point.
1445    fn escape_mount(&self) -> NamespaceNode {
1446        let mut mountpoint_or_self = self.clone();
1447        while let Some(mountpoint) = mountpoint_or_self.mountpoint() {
1448            mountpoint_or_self = mountpoint;
1449        }
1450        mountpoint_or_self
1451    }
1452
1453    /// If this node is the root of a mount, return it. Otherwise EINVAL.
1454    pub fn mount_if_root(&self) -> Result<&MountHandle, Errno> {
1455        if let Some(mount) = self.mount.deref() {
1456            if Arc::ptr_eq(&self.entry, &mount.root) {
1457                return Ok(mount);
1458            }
1459        }
1460        error!(EINVAL)
1461    }
1462
1463    /// Returns the mountpoint at this location in the namespace.
1464    ///
1465    /// If this node is mounted in another node, this function returns the node
1466    /// at which this node is mounted. Otherwise, returns None.
1467    fn mountpoint(&self) -> Option<NamespaceNode> {
1468        self.mount_if_root().ok()?.mountpoint()
1469    }
1470
1471    /// The path from the task's root to this node.
1472    pub fn path(&self, task: &Task) -> FsString {
1473        self.path_from_root(Some(&task.fs().root())).into_path()
1474    }
1475
1476    /// The path from the root of the namespace to this node.
1477    pub fn path_escaping_chroot(&self) -> FsString {
1478        self.path_from_root(None).into_path()
1479    }
1480
1481    /// Returns the path to this node, accounting for a custom root.
1482    /// A task may have a custom root set by `chroot`.
1483    pub fn path_from_root(&self, root: Option<&NamespaceNode>) -> PathWithReachability {
1484        if self.mount.is_none() {
1485            return PathWithReachability::Reachable(self.entry.node.internal_name());
1486        }
1487
1488        let mut path = PathBuilder::new();
1489        let mut current = self.escape_mount();
1490        if let Some(root) = root {
1491            // The current node is expected to intersect with the custom root as we travel up the tree.
1492            let root = root.escape_mount();
1493            while current != root {
1494                if let Some(parent) = current.parent() {
1495                    path.prepend_element(current.entry.read().local_name());
1496                    current = parent.escape_mount();
1497                } else {
1498                    // This node hasn't intersected with the custom root and has reached the namespace root.
1499                    let mut absolute_path = path.build_absolute();
1500                    if self.entry.read().is_dead() {
1501                        absolute_path.extend_from_slice(b" (deleted)");
1502                    }
1503
1504                    return PathWithReachability::Unreachable(absolute_path);
1505                }
1506            }
1507        } else {
1508            // No custom root, so travel up the tree to the namespace root.
1509            while let Some(parent) = current.parent() {
1510                path.prepend_element(current.entry.read().local_name());
1511                current = parent.escape_mount();
1512            }
1513        }
1514
1515        let mut absolute_path = path.build_absolute();
1516        if self.entry.read().is_dead() {
1517            absolute_path.extend_from_slice(b" (deleted)");
1518        }
1519
1520        PathWithReachability::Reachable(absolute_path)
1521    }
1522
1523    pub fn mount(&self, what: WhatToMount, flags: MountFlags) -> Result<(), Errno> {
1524        let flags = flags & (MountFlags::STORED_ON_MOUNT | MountFlags::REC);
1525        let mountpoint = self.enter_mount();
1526        let mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
1527        mount.create_submount(&mountpoint.entry, what, flags);
1528        Ok(())
1529    }
1530
1531    /// If this is the root of a filesystem, unmount. Otherwise return EINVAL.
1532    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
1533        let propagate = self.mount_if_root().map_or(false, |mount| mount.read().is_shared());
1534        let mount = self.enter_mount().mount_if_root()?.clone();
1535        mount.unmount(flags, propagate)
1536    }
1537
    /// Renames `old_name` in `old_parent` to `new_name` in `new_parent`.
    ///
    /// This is a thin forwarder to `DirEntry::rename`, passing along the directory entry and
    /// the mount of each parent together with `flags`.
    pub fn rename<L>(
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        old_parent: &NamespaceNode,
        old_name: &FsStr,
        new_parent: &NamespaceNode,
        new_name: &FsStr,
        flags: RenameFlags,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        DirEntry::rename(
            locked,
            current_task,
            &old_parent.entry,
            &old_parent.mount,
            old_name,
            &new_parent.entry,
            &new_parent.mount,
            new_name,
            flags,
        )
    }
1562
1563    fn with_new_entry(&self, entry: DirEntryHandle) -> NamespaceNode {
1564        Self { mount: self.mount.clone(), entry }
1565    }
1566
    /// Returns this node's directory entry viewed as an `ArcKey`, the key type used to look up
    /// submounts.
    fn mount_hash_key(&self) -> &ArcKey<DirEntry> {
        ArcKey::ref_cast(&self.entry)
    }
1570
1571    pub fn suid_and_sgid(&self, current_task: &CurrentTask) -> Result<UserAndOrGroupId, Errno> {
1572        if self.mount.flags().contains(MountFlags::NOSUID) {
1573            Ok(UserAndOrGroupId::default())
1574        } else {
1575            self.entry.node.info().suid_and_sgid(current_task, &self.entry.node)
1576        }
1577    }
1578
1579    pub fn update_atime(&self) {
1580        // Do not update the atime of this node if it is mounted with the NOATIME flag.
1581        if !self.mount.flags().contains(MountFlags::NOATIME) {
1582            self.entry.node.update_info(|info| {
1583                let now = utc::utc_now();
1584                info.time_access = now;
1585                info.pending_time_access_update = true;
1586            });
1587        }
1588    }
1589
    /// Reads the target of this node as a symlink.
    ///
    /// The access time is updated first (subject to the mount's NOATIME flag), then the read
    /// is forwarded to the underlying node.
    pub fn readlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<SymlinkTarget, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.update_atime();
        self.entry.node.readlink(locked, current_task)
    }
1601
1602    pub fn notify(&self, event_mask: InotifyMask) {
1603        if self.mount.is_some() {
1604            self.entry.notify(event_mask);
1605        }
1606    }
1607
    /// Check whether the node can be accessed in the current context with the specified access
    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
    /// owner or is in the file's group.
    ///
    /// The check itself is performed by the underlying node, which receives this node's mount
    /// and this namespace node in addition to the arguments.
    pub fn check_access<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        permission_flags: impl Into<security::PermissionFlags>,
        reason: CheckAccessReason,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.entry.node.check_access(
            locked,
            current_task,
            &self.mount,
            permission_flags,
            reason,
            self,
        )
    }
1630
1631    /// Checks if O_NOATIME is allowed,
1632    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
1633        self.entry.node.check_o_noatime_allowed(current_task)
1634    }
1635
1636    pub fn truncate<L>(
1637        &self,
1638        locked: &mut Locked<L>,
1639        current_task: &CurrentTask,
1640        length: u64,
1641    ) -> Result<(), Errno>
1642    where
1643        L: LockBefore<BeforeFsNodeAppend>,
1644    {
1645        self.entry.node.truncate(locked, current_task, &self.mount, length)?;
1646        self.entry.notify_ignoring_excl_unlink(InotifyMask::MODIFY);
1647        Ok(())
1648    }
1649}
1650
1651impl fmt::Debug for NamespaceNode {
1652    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1653        f.debug_struct("NamespaceNode")
1654            .field("path", &self.path_escaping_chroot())
1655            .field("mount", &self.mount)
1656            .field("entry", &self.entry)
1657            .finish()
1658    }
1659}
1660
1661// Eq/Hash impls intended for the MOUNT_POINTS hash
1662impl PartialEq for NamespaceNode {
1663    fn eq(&self, other: &Self) -> bool {
1664        self.mount.as_ref().map(Arc::as_ptr).eq(&other.mount.as_ref().map(Arc::as_ptr))
1665            && Arc::ptr_eq(&self.entry, &other.entry)
1666    }
1667}
1668impl Eq for NamespaceNode {}
1669impl Hash for NamespaceNode {
1670    fn hash<H: Hasher>(&self, state: &mut H) {
1671        self.mount.as_ref().map(Arc::as_ptr).hash(state);
1672        Arc::as_ptr(&self.entry).hash(state);
1673    }
1674}
1675
/// A namespace node that keeps the underlying mount busy.
///
/// Dereferences to the wrapped [`NamespaceNode`]; equality and hashing are those of the wrapped
/// node.
#[derive(Debug, Clone)]
pub struct ActiveNamespaceNode {
    /// The underlying namespace node.
    name: NamespaceNode,

    /// Adds a reference to the mount client marker to prevent the mount from
    /// being removed while the NamespaceNode is active. Is None iff mount is
    /// None.
    _marker: Option<MountClientMarker>,
}
1687
1688impl ActiveNamespaceNode {
1689    pub fn new(name: NamespaceNode) -> Self {
1690        let marker = name.mount.as_ref().map(|mount| mount.active_client_counter.clone());
1691        Self { name, _marker: marker }
1692    }
1693
1694    pub fn to_passive(&self) -> NamespaceNode {
1695        self.deref().clone()
1696    }
1697
1698    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
1699        if let Some(mode) = mode {
1700            self.entry.node.write_guard_state.lock().acquire(mode)?;
1701        }
1702        Ok(Arc::new(FileMapping { name: self, mode }))
1703    }
1704}
1705
1706impl Deref for ActiveNamespaceNode {
1707    type Target = NamespaceNode;
1708
1709    fn deref(&self) -> &Self::Target {
1710        &self.name
1711    }
1712}
1713
1714impl PartialEq for ActiveNamespaceNode {
1715    fn eq(&self, other: &Self) -> bool {
1716        self.deref().eq(other.deref())
1717    }
1718}
1719impl Eq for ActiveNamespaceNode {}
1720impl Hash for ActiveNamespaceNode {
1721    fn hash<H: Hasher>(&self, state: &mut H) {
1722        self.deref().hash(state)
1723    }
1724}
1725
/// An active namespace node together with the write-guard mode (if any) that was acquired on its
/// node when the mapping was created by `ActiveNamespaceNode::into_mapping`.
///
/// Dropping the mapping releases the guard in the recorded mode.
// NOTE(review): `Clone` is derived while `Drop` releases the write guard once per instance, and
// the guard is only acquired once in `into_mapping`. Cloning the struct itself (as opposed to the
// `Arc<FileMapping>` handed out by `into_mapping`) would therefore release more often than it
// acquires -- confirm no caller does that.
#[derive(Debug, Clone, PartialEq, Eq)]
#[must_use]
pub struct FileMapping {
    /// The node being mapped.
    pub name: ActiveNamespaceNode,
    /// The write-guard mode acquired for this mapping; `None` when no guard was taken.
    mode: Option<FileWriteGuardMode>,
}
1732
1733impl Drop for FileMapping {
1734    fn drop(&mut self) {
1735        if let Some(mode) = self.mode {
1736            self.name.entry.node.write_guard_state.lock().release(mode);
1737        }
1738    }
1739}
1740
/// Tracks all mounts, keyed by mount point.
pub struct Mounts {
    /// For each mount-point directory entry (stored as a `WeakKey<DirEntry>`), the list of
    /// mounts registered on it. An entry is removed once its last mount is unregistered.
    mounts: Mutex<HashMap<WeakKey<DirEntry>, Vec<ArcKey<Mount>>>>,
}
1745
1746impl Mounts {
1747    pub fn new() -> Self {
1748        Mounts { mounts: Mutex::default() }
1749    }
1750
1751    /// Registers the mount in the global mounts map.
1752    fn register_mount(&self, dir_entry: &Arc<DirEntry>, mount: MountHandle) -> Submount {
1753        let mut mounts = self.mounts.lock();
1754        mounts
1755            .entry(WeakKey::from(dir_entry))
1756            .or_insert_with(|| {
1757                dir_entry.set_has_mounts(true);
1758                Vec::new()
1759            })
1760            .push(ArcKey(mount.clone()));
1761        Submount { dir: ArcKey(dir_entry.clone()), mount }
1762    }
1763
1764    /// Unregisters the mount.  This is called by `Submount::drop`.
1765    fn unregister_mount(&self, dir_entry: &Arc<DirEntry>, mount: &MountHandle) {
1766        let mut mounts = self.mounts.lock();
1767        let Entry::Occupied(mut o) = mounts.entry(WeakKey::from(dir_entry)) else {
1768            // This can happen if called from `unmount` below.
1769            return;
1770        };
1771        // This is O(N), but directory entries with large numbers of mounts should be rare.
1772        let index = o.get().iter().position(|e| e == ArcKey::ref_cast(mount)).unwrap();
1773        if o.get().len() == 1 {
1774            o.remove_entry();
1775            dir_entry.set_has_mounts(false);
1776        } else {
1777            o.get_mut().swap_remove(index);
1778        }
1779    }
1780
1781    /// Unmounts all mounts associated with `dir_entry`.  This is called when `dir_entry` is
1782    /// unlinked (which would normally result in EBUSY, but not if it isn't mounted in the local
1783    /// namespace).
1784    pub fn unmount(&self, dir_entry: &DirEntry) {
1785        let mounts = self.mounts.lock().remove(&PtrKey::from(dir_entry as *const _));
1786        if let Some(mounts) = mounts {
1787            for mount in mounts {
1788                // Ignore errors.
1789                let _ = mount.unmount(UnmountFlags::default(), false);
1790            }
1791        }
1792    }
1793
1794    /// Drain mounts. For each drained mount, force a FileSystem unmount.
1795    // TODO(https://fxbug.dev/295073633): Graceful shutdown should try to first unmount the mounts
1796    // and only force a FileSystem unmount on failure.
1797    pub fn clear(&self) {
1798        for (_dir_entry, mounts) in self.mounts.lock().drain() {
1799            for mount in mounts {
1800                mount.fs.force_unmount_ops();
1801            }
1802        }
1803    }
1804}
1805
/// A RAII object that unregisters a mount when dropped.
///
/// Equality, hashing and `Borrow` all go through `dir` alone, so a set of `Submount`s can be
/// looked up by mount point.
#[derive(Debug)]
struct Submount {
    /// The directory entry the mount is attached to (the mount point).
    dir: ArcKey<DirEntry>,
    /// The mount attached at `dir`.
    mount: MountHandle,
}
1812
1813impl Drop for Submount {
1814    fn drop(&mut self) {
1815        self.mount.fs.kernel.upgrade().unwrap().mounts.unregister_mount(&self.dir, &self.mount)
1816    }
1817}
1818
1819/// Submount is stored in a mount's submounts hash set, which is keyed by the mountpoint.
1820impl Eq for Submount {}
1821impl PartialEq<Self> for Submount {
1822    fn eq(&self, other: &Self) -> bool {
1823        self.dir == other.dir
1824    }
1825}
1826impl Hash for Submount {
1827    fn hash<H: Hasher>(&self, state: &mut H) {
1828        self.dir.hash(state)
1829    }
1830}
1831
1832impl Borrow<ArcKey<DirEntry>> for Submount {
1833    fn borrow(&self) -> &ArcKey<DirEntry> {
1834        &self.dir
1835    }
1836}
1837
1838#[cfg(test)]
1839mod test {
1840    use crate::fs::tmpfs::TmpFs;
1841    use crate::testing::spawn_kernel_and_run;
1842    use crate::vfs::namespace::DeviceType;
1843    use crate::vfs::{
1844        CallbackSymlinkNode, FsNodeInfo, LookupContext, MountInfo, Namespace, NamespaceNode,
1845        RenameFlags, SymlinkMode, SymlinkTarget, UnlinkKind, WhatToMount,
1846    };
1847    use starnix_uapi::mount_flags::MountFlags;
1848    use starnix_uapi::{errno, mode};
1849    use std::sync::Arc;
1850
    /// Mounts a second tmpfs on `/dev` and checks that `parent()` walks correctly both inside
    /// the mounted filesystem and across the mount point back to the namespace root.
    #[::fuchsia::test]
    async fn test_namespace() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            // Root filesystem containing a `dev` directory to mount on.
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            // Second filesystem, containing `pts`, that will be mounted on `dev`.
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountFlags::empty())
                .expect("failed to mount dev root node");

            // Look `dev` up again now that the mount is in place, then descend into `pts`.
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            // The parent of `pts` must be the entry returned by the post-mount lookup of `dev`.
            let pts_parent =
                pts.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of pts");
            assert!(Arc::ptr_eq(&pts_parent.entry, &dev.entry));

            // The parent of the post-mount `dev` must be the namespace root.
            let dev_parent =
                dev.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of dev");
            assert!(Arc::ptr_eq(&dev_parent.entry, &ns.root().entry));
        })
        .await;
    }
1894
    /// Checks that a `NamespaceNode` obtained before a mount keeps referring to the covered
    /// directory: a fresh lookup yields a different entry, and only the fresh node can see the
    /// contents of the mounted filesystem.
    #[::fuchsia::test]
    async fn test_mount_does_not_upgrade() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            // Root filesystem containing a `dev` directory to mount on.
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            // Second filesystem, containing `pts`, that will be mounted on `dev`.
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            // `dev` is looked up before the mount and kept around afterwards.
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountFlags::empty())
                .expect("failed to mount dev root node");
            let mut context = LookupContext::default();
            let new_dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev again");
            // The post-mount lookup must not resolve to the same entry or node as the old one.
            assert!(!Arc::ptr_eq(&dev.entry, &new_dev.entry));
            assert_ne!(&dev, &new_dev);

            // `pts` is reachable through the new node...
            let mut context = LookupContext::default();
            let _new_pts = new_dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            // ...but not through the node that was obtained before the mount.
            let mut context = LookupContext::default();
            assert!(dev.lookup_child(locked, &current_task, &mut context, "pts".into()).is_err());
        })
        .await;
    }
1935
    /// Checks that `path_escaping_chroot` reports the expected absolute paths for the root, for
    /// a mount point, and for a directory inside the mounted filesystem.
    #[::fuchsia::test]
    async fn test_path() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            // Root filesystem containing a `dev` directory to mount on.
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            // Second filesystem, containing `pts`, that will be mounted on `dev`.
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountFlags::empty())
                .expect("failed to mount dev root node");

            // Resolve `dev` and `dev/pts` again with the mount in place.
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");

            assert_eq!("/", ns.root().path_escaping_chroot());
            assert_eq!("/dev", dev.path_escaping_chroot());
            assert_eq!("/dev/pts", pts.path_escaping_chroot());
        })
        .await;
    }
1976
    /// Checks that a mount stacked on top of an existing mount shadows it in the namespace where
    /// it was made, while a namespace cloned before the second mount still sees the first one.
    #[::fuchsia::test]
    async fn test_shadowing() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // First mount: looking up `foo` must now land on the root of `foofs1`.
            let foofs1 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs1.clone()), MountFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // Snapshot the namespace while only the first mount exists.
            let ns_clone = ns.clone_namespace();

            // Second mount, made on the node that resolved to `foofs1`'s root.
            let foofs2 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs2.clone()), MountFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs2.root()
            ));

            // The clone was taken before the second mount and must still resolve to `foofs1`.
            assert!(Arc::ptr_eq(
                &ns_clone
                    .root()
                    .lookup_child(
                        locked,
                        &current_task,
                        &mut LookupContext::default(),
                        "foo".into()
                    )
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
        })
        .await;
    }
2030
    /// Checks unlinking of a directory that is a mount point in one namespace (`ns1`) but not in
    /// another namespace (`ns2`) built over the same root filesystem.
    #[::fuchsia::test]
    async fn test_unlink_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            // Two independent namespaces sharing one root filesystem.
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // Only `ns1` gets a mount on `foo`.
            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountFlags::empty()).unwrap();

            // Trying to unlink from ns1 should fail.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(EBUSY),
            );

            // But unlinking from ns2 should succeed.
            ns2.root()
                .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                .expect("unlink failed");

            // And it should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(ENOENT),
            );
        })
        .await;
    }
2069
    /// Checks renaming of, and renaming over, a directory that is a mount point in one namespace
    /// (`ns1`) but not in another namespace (`ns2`) built over the same root filesystem.
    #[::fuchsia::test]
    async fn test_rename_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            // Two independent namespaces sharing one root filesystem.
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let _bar_node = root_fs.root().create_dir(locked, &current_task, "bar".into()).unwrap();
            let _baz_node = root_fs.root().create_dir(locked, &current_task, "baz".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // Only `ns1` gets a mount on `foo`.
            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountFlags::empty()).unwrap();

            // Trying to rename over foo from ns1 should fail.
            let root = ns1.root();
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "bar".into(),
                    &root,
                    "foo".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );
            // Likewise the other way.
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "foo".into(),
                    &root,
                    "bar".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );

            // But renaming from ns2 should succeed.
            let root = ns2.root();

            // First rename the directory with the mount.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "foo".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // Renaming over a directory with a mount should also work.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "baz".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // "foo" and "baz" should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "baz".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
        })
        .await;
    }
2160
2161    /// Symlinks which need to be traversed across types (nodes and paths), as well as across
2162    /// owning directories, can be tricky to get right.
2163    #[::fuchsia::test]
2164    async fn test_lookup_with_symlink_chain() {
2165        spawn_kernel_and_run(async |locked, current_task| {
2166            // Set up the root filesystem
2167            let kernel = current_task.kernel();
2168            let root_fs = TmpFs::new_fs(locked, &kernel);
2169            let root_node = Arc::clone(root_fs.root());
2170            let _first_subdir_node = root_node
2171                .create_dir(locked, &current_task, "first_subdir".into())
2172                .expect("failed to mkdir dev");
2173            let _second_subdir_node = root_node
2174                .create_dir(locked, &current_task, "second_subdir".into())
2175                .expect("failed to mkdir dev");
2176
2177            // Set up two subdirectories under the root filesystem
2178            let first_subdir_fs = TmpFs::new_fs(locked, &kernel);
2179            let second_subdir_fs = TmpFs::new_fs(locked, &kernel);
2180
2181            let ns = Namespace::new(root_fs);
2182            let mut context = LookupContext::default();
2183            let first_subdir = ns
2184                .root()
2185                .lookup_child(locked, &current_task, &mut context, "first_subdir".into())
2186                .expect("failed to lookup first_subdir");
2187            first_subdir
2188                .mount(WhatToMount::Fs(first_subdir_fs), MountFlags::empty())
2189                .expect("failed to mount first_subdir fs node");
2190            let second_subdir = ns
2191                .root()
2192                .lookup_child(locked, &current_task, &mut context, "second_subdir".into())
2193                .expect("failed to lookup second_subdir");
2194            second_subdir
2195                .mount(WhatToMount::Fs(second_subdir_fs), MountFlags::empty())
2196                .expect("failed to mount second_subdir fs node");
2197
2198            // Create the symlink structure. To trigger potential symlink traversal bugs, we're going
2199            // for the following directory structure:
2200            // / (root)
2201            //     + first_subdir/
2202            //         - real_file
2203            //         - path_symlink (-> real_file)
2204            //     + second_subdir/
2205            //         - node_symlink (-> path_symlink)
2206            let real_file_node = first_subdir
2207                .create_node(
2208                    locked,
2209                    &current_task,
2210                    "real_file".into(),
2211                    mode!(IFREG, 0o777),
2212                    DeviceType::NONE,
2213                )
2214                .expect("failed to create real_file");
2215            first_subdir
2216                .create_symlink(locked, &current_task, "path_symlink".into(), "real_file".into())
2217                .expect("failed to create path_symlink");
2218
2219            let mut no_follow_lookup_context = LookupContext::new(SymlinkMode::NoFollow);
2220            let path_symlink_node = first_subdir
2221                .lookup_child(
2222                    locked,
2223                    &current_task,
2224                    &mut no_follow_lookup_context,
2225                    "path_symlink".into(),
2226                )
2227                .expect("Failed to lookup path_symlink");
2228
2229            // The second symlink needs to be of type SymlinkTarget::Node in order to trip the sensitive
2230            // code path. There's no easy method for creating this type of symlink target, so we'll need
2231            // to construct a node from scratch and insert it into the directory manually.
2232            let node_symlink_node = second_subdir.entry.node.fs().create_node_and_allocate_node_id(
2233                CallbackSymlinkNode::new(move || {
2234                    let node = path_symlink_node.clone();
2235                    Ok(SymlinkTarget::Node(node))
2236                }),
2237                FsNodeInfo::new(mode!(IFLNK, 0o777), current_task.current_fscred()),
2238            );
2239            second_subdir
2240                .entry
2241                .create_entry(
2242                    locked,
2243                    &current_task,
2244                    &MountInfo::detached(),
2245                    "node_symlink".into(),
2246                    move |_locked, _dir, _mount, _name| Ok(node_symlink_node),
2247                )
2248                .expect("failed to create node_symlink entry");
2249
2250            // Finally, exercise the lookup under test.
2251            let mut follow_lookup_context = LookupContext::new(SymlinkMode::Follow);
2252            let node_symlink_resolution = second_subdir
2253                .lookup_child(
2254                    locked,
2255                    &current_task,
2256                    &mut follow_lookup_context,
2257                    "node_symlink".into(),
2258                )
2259                .expect("lookup with symlink chain failed");
2260
2261            // The lookup resolution should have correctly followed the symlinks to the real_file node.
2262            assert!(node_symlink_resolution.entry.node.ino == real_file_node.entry.node.ino);
2263        })
2264        .await;
2265    }
2266}