// starnix_core/vfs/namespace.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mutable_state::{state_accessor, state_implementation};
6use crate::security;
7use crate::task::{CurrentTask, EventHandler, Kernel, Task, WaitCanceler, Waiter};
8use crate::time::utc;
9use crate::vfs::fs_registry::FsRegistry;
10use crate::vfs::pseudo::dynamic_file::{DynamicFile, DynamicFileBuf, DynamicFileSource};
11use crate::vfs::pseudo::simple_file::SimpleFileNode;
12use crate::vfs::socket::{SocketAddress, SocketHandle, UnixSocket};
13use crate::vfs::{
14    CheckAccessReason, DirEntry, DirEntryHandle, FileHandle, FileObject, FileOps, FileSystemHandle,
15    FileSystemOptions, FileWriteGuardMode, FsContext, FsNode, FsNodeHandle, FsNodeOps, FsStr,
16    FsString, PathBuilder, RenameFlags, SymlinkTarget, UnlinkKind, fileops_impl_dataless,
17    fileops_impl_delegate_read_write_and_seek, fileops_impl_nonseekable, fileops_impl_noop_sync,
18    fs_node_impl_not_dir,
19};
20use fuchsia_rcu::RcuReadScope;
21use macro_rules_attribute::apply;
22use ref_cast::RefCast;
23use starnix_logging::log_warn;
24use starnix_rcu::RcuHashMap;
25use starnix_sync::{
26    BeforeFsNodeAppend, FileOpsCore, LockEqualOrBefore, Locked, Mutex, RwLock, Unlocked,
27};
28use starnix_types::ownership::WeakRef;
29use starnix_uapi::arc_key::{ArcKey, PtrKey, WeakKey};
30use starnix_uapi::auth::UserAndOrGroupId;
31use starnix_uapi::device_id::DeviceId;
32use starnix_uapi::errors::Errno;
33use starnix_uapi::file_mode::{AccessCheck, FileMode};
34use starnix_uapi::inotify_mask::InotifyMask;
35use starnix_uapi::mount_flags::{
36    AtomicMountpointFlags, FileSystemFlags, MountFlags, MountpointFlags,
37};
38use starnix_uapi::open_flags::OpenFlags;
39use starnix_uapi::unmount_flags::UnmountFlags;
40use starnix_uapi::vfs::{FdEvents, ResolveFlags};
41use starnix_uapi::{NAME_MAX, errno, error};
42use std::borrow::Borrow;
43use std::collections::HashSet;
44use std::fmt;
45use std::hash::{Hash, Hasher};
46use std::ops::{Deref, DerefMut};
47use std::sync::atomic::Ordering;
48use std::sync::{Arc, Weak};
49
/// A mount namespace.
///
/// The namespace records at which entries filesystems are mounted.
#[derive(Debug)]
pub struct Namespace {
    /// The mount at the root of this namespace's mount tree.
    root_mount: MountHandle,

    // Unique ID of this namespace, allocated from the kernel's namespace-id counter.
    pub id: u64,
}
60
61impl Namespace {
62    pub fn new(fs: FileSystemHandle) -> Arc<Namespace> {
63        Self::new_with_flags(fs, MountpointFlags::empty())
64    }
65
66    pub fn new_with_flags(fs: FileSystemHandle, flags: MountpointFlags) -> Arc<Namespace> {
67        let kernel = fs.kernel.upgrade().expect("can't create namespace without a kernel");
68        let root_mount = Mount::new(WhatToMount::Fs(fs), flags);
69        Arc::new(Self { root_mount, id: kernel.get_next_namespace_id() })
70    }
71
72    pub fn root(&self) -> NamespaceNode {
73        self.root_mount.root()
74    }
75
76    pub fn clone_namespace(&self) -> Arc<Namespace> {
77        let kernel =
78            self.root_mount.fs.kernel.upgrade().expect("can't clone namespace without a kernel");
79        Arc::new(Self {
80            root_mount: self.root_mount.clone_mount_recursive(),
81            id: kernel.get_next_namespace_id(),
82        })
83    }
84
85    /// Assuming new_ns is a clone of the namespace that node is from, return the equivalent of
86    /// node in new_ns. If this assumption is violated, returns None.
87    pub fn translate_node(mut node: NamespaceNode, new_ns: &Namespace) -> Option<NamespaceNode> {
88        // Collect the list of mountpoints that leads to this node's mount
89        let mut mountpoints = vec![];
90        let mut mount = node.mount;
91        while let Some(mountpoint) = mount.as_ref().and_then(|m| m.read().mountpoint()) {
92            mountpoints.push(mountpoint.entry);
93            mount = mountpoint.mount;
94        }
95
96        // Follow the same path in the new namespace
97        let mut mount = Arc::clone(&new_ns.root_mount);
98        for mountpoint in mountpoints.iter().rev() {
99            let next_mount =
100                mount.read().submounts.get(ArcKey::ref_cast(mountpoint))?.mount.clone();
101            mount = next_mount;
102        }
103        node.mount = Some(mount).into();
104        Some(node)
105    }
106}
107
108impl FsNodeOps for Arc<Namespace> {
109    fs_node_impl_not_dir!();
110
111    fn create_file_ops(
112        &self,
113        _locked: &mut Locked<FileOpsCore>,
114        _node: &FsNode,
115        _current_task: &CurrentTask,
116        _flags: OpenFlags,
117    ) -> Result<Box<dyn FileOps>, Errno> {
118        Ok(Box::new(MountNamespaceFile(self.clone())))
119    }
120}
121
122pub struct MountNamespaceFile(pub Arc<Namespace>);
123
impl FileOps for MountNamespaceFile {
    // The file has no position, no data, and nothing to sync; all operations are
    // provided by these stub macros.
    fileops_impl_nonseekable!();
    fileops_impl_dataless!();
    fileops_impl_noop_sync!();
}
129
/// An empty struct that we use to track the number of active clients for a mount.
///
/// Each active client takes a reference to this object. The unmount operation fails
/// if there are any active clients of the mount (see `Mount::active_clients`, which
/// derives the count from this Arc's strong count).
type MountClientMarker = Arc<()>;
135
/// An instance of a filesystem mounted in a namespace.
///
/// At a mount, path traversal switches from one filesystem to another.
/// The client sees a composed directory structure that glues together the
/// directories from the underlying FsNodes from those filesystems.
///
/// The mounts in a namespace form a mount tree, with `mountpoint` pointing to the parent and
/// `submounts` pointing to the children.
pub struct Mount {
    /// The directory entry at the root of the mounted filesystem subtree.
    root: DirEntryHandle,
    /// The filesystem instance backing this mount.
    fs: FileSystemHandle,

    /// Holds the flags specific to this mount of the underlying filesystem.
    flags: AtomicMountpointFlags,

    /// Lock used to serialize updates of `flags` to ensure consistency during remount operations.
    flags_lock: Mutex<()>,

    /// A unique identifier for this mount reported in /proc/pid/mountinfo.
    id: u64,

    /// A count of the number of active clients.
    active_client_counter: MountClientMarker,

    // Lock ordering: mount -> submount
    state: RwLock<MountState>,
    // Mount used to contain a Weak<Namespace>. It no longer does because since the mount point
    // hash was moved from Namespace to Mount, nothing actually uses it. Now that
    // Namespace::clone_namespace() is implemented in terms of Mount::clone_mount_recursive, it
    // won't be trivial to add it back. I recommend turning the mountpoint field into an enum of
    // Mountpoint or Namespace, maybe called "parent", and then traverse up to the top of the tree
    // if you need to find a Mount's Namespace.
}
/// Shared, reference-counted handle to a `Mount`.
type MountHandle = Arc<Mount>;
170
/// Public representation of the mount options.
///
/// Wraps an optional [`MountHandle`]; `None` represents a node that is not
/// attached to any mount (see [`MountInfo::detached`]).
#[derive(Clone, Debug)]
pub struct MountInfo {
    handle: Option<MountHandle>,
}
176
177impl MountInfo {
178    /// `MountInfo` for a element that is not tied to a given mount. Mount flags will be considered
179    /// empty.
180    pub fn detached() -> Self {
181        None.into()
182    }
183
184    /// The mount flags of the represented mount.
185    pub fn flags(&self) -> MountFlags {
186        if let Some(handle) = &self.handle {
187            handle.flags()
188        } else {
189            // Consider not mounted node have the NOATIME flags.
190            MountFlags::NOATIME
191        }
192    }
193
194    /// Checks whether this `MountInfo` represents a writable file system mount.
195    pub fn check_readonly_filesystem(&self) -> Result<(), Errno> {
196        if self.flags().contains(MountFlags::RDONLY) {
197            return error!(EROFS);
198        }
199        Ok(())
200    }
201
202    /// Checks whether this `MountInfo` represents an executable file system mount.
203    pub fn check_noexec_filesystem(&self) -> Result<(), Errno> {
204        if self.flags().contains(MountFlags::NOEXEC) {
205            return error!(EACCES);
206        }
207        Ok(())
208    }
209}
210
// Dereference directly to the optional mount handle so callers can use
// `Option` combinators on a `MountInfo`.
impl Deref for MountInfo {
    type Target = Option<MountHandle>;

    fn deref(&self) -> &Self::Target {
        &self.handle
    }
}
218
// Mutable counterpart of the `Deref` impl above.
impl DerefMut for MountInfo {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.handle
    }
}
224
225impl std::cmp::PartialEq for MountInfo {
226    fn eq(&self, other: &Self) -> bool {
227        self.handle.as_ref().map(Arc::as_ptr) == other.handle.as_ref().map(Arc::as_ptr)
228    }
229}
230
231impl std::cmp::Eq for MountInfo {}
232
233impl Into<MountInfo> for Option<MountHandle> {
234    fn into(self) -> MountInfo {
235        MountInfo { handle: self }
236    }
237}
238
/// Mutable state of a [`Mount`], guarded by `Mount::state`.
#[derive(Default)]
pub struct MountState {
    /// The namespace node that this mount is mounted on. This is a tuple instead of a
    /// NamespaceNode because the Mount pointer has to be weak because this is the pointer to the
    /// parent mount, the parent has a pointer to the children too, and making both strong would be
    /// a cycle.
    mountpoint: Option<(Weak<Mount>, DirEntryHandle)>,

    // The set is keyed by the mountpoints which are always descendants of this mount's root.
    // Conceptually, the set is more akin to a map: `DirEntry -> MountHandle`, but we use a set
    // instead because `Submount` has a drop implementation that needs both the key and value.
    //
    // Each directory entry can only have one mount attached. Mount shadowing works by using the
    // root of the inner mount as a mountpoint. For example, if filesystem A is mounted at /foo,
    // mounting filesystem B on /foo will create the mount as a child of the A mount, attached to
    // A's root, instead of the root mount.
    submounts: HashSet<Submount>,

    /// The membership of this mount in its peer group. Do not access directly. Instead use
    /// peer_group(), take_from_peer_group(), and set_peer_group().
    // TODO(tbodt): Refactor the links into, some kind of extra struct or something? This is hard
    // because setting this field requires the Arc<Mount>.
    peer_group_: Option<(Arc<PeerGroup>, PtrKey<Mount>)>,
    /// The membership of this mount in a PeerGroup's downstream. Do not access directly. Instead
    /// use upstream(), take_from_upstream(), and set_upstream().
    upstream_: Option<(Weak<PeerGroup>, PtrKey<Mount>)>,
}
266
/// A group of mounts. Setting MS_SHARED on a mount puts it in its own peer group. Any bind mounts
/// of a mount in the group are also added to the group. A mount created in any mount in a peer
/// group will be automatically propagated (recreated) in every other mount in the group.
#[derive(Default)]
struct PeerGroup {
    /// Unique peer-group id, allocated from the kernel's peer-group counter.
    id: u64,
    state: RwLock<PeerGroupState>,
}
/// Mutable membership state of a [`PeerGroup`]; weak keys avoid reference cycles
/// between groups and their member mounts.
#[derive(Default)]
struct PeerGroupState {
    /// Mounts that are members of this peer group.
    mounts: HashSet<WeakKey<Mount>>,
    /// Mounts that receive propagation from this group (MS_SLAVE-style downstream).
    downstream: HashSet<WeakKey<Mount>>,
}
280
/// The source of a new mount: either the root of a filesystem, or a bind mount of an
/// existing namespace node.
pub enum WhatToMount {
    Fs(FileSystemHandle),
    Bind(NamespaceNode),
}
285
impl Mount {
    /// Creates a mount from `what`: a fresh mount of a filesystem's root, or a clone of an
    /// existing mount for a bind mount.
    pub fn new(what: WhatToMount, mut flags: MountpointFlags) -> MountHandle {
        match what {
            WhatToMount::Fs(fs) => {
                // If `flags` does not explicitly specify an access-time flag then default to `RELATIME`.
                flags.default_atime_from(MountpointFlags::RELATIME);
                Self::new_with_root(fs.root().clone(), flags)
            }
            WhatToMount::Bind(node) => {
                let mount = node.mount.as_ref().expect("can't bind mount from an anonymous node");
                mount.clone_mount(&node.entry, flags.into())
            }
        }
    }

    /// Creates a mount rooted at `root`, keeping only the flags that are stored per-mount.
    fn new_with_root(root: DirEntryHandle, flags: MountpointFlags) -> MountHandle {
        let fs = root.node.fs();
        let kernel = fs.kernel.upgrade().expect("can't create mount without kernel");
        Arc::new(Self {
            id: kernel.get_next_mount_id(),
            flags: (flags & MountpointFlags::STORED_ON_MOUNT).into(),
            flags_lock: Mutex::new(()),
            root,
            active_client_counter: Default::default(),
            fs,
            state: Default::default(),
        })
    }

    /// A namespace node referring to the root of the mount.
    pub fn root(self: &MountHandle) -> NamespaceNode {
        NamespaceNode::new(Arc::clone(self), Arc::clone(&self.root))
    }

    /// Create the specified mount as a child. Also propagate it to the mount's peer group.
    fn create_submount(
        self: &MountHandle,
        dir: &DirEntryHandle,
        what: WhatToMount,
        flags: MountpointFlags,
    ) {
        // TODO(tbodt): Making a copy here is necessary for lock ordering, because the peer group
        // lock nests inside all mount locks (it would be impractical to reverse this because you
        // need to lock a mount to get its peer group.) But it opens the door to race conditions
        // where if a peer are concurrently being added, the mount might not get propagated to the
        // new peer. The only true solution to this is bigger locks, somehow using the same lock
        // for the peer group and all of the mounts in the group. Since peer groups are fluid and
        // can have mounts constantly joining and leaving and then joining other groups, the only
        // sensible locking option is to use a single global lock for all mounts and peer groups.
        // This is almost impossible to express in rust. Help.
        //
        // Update: Also necessary to make a copy to prevent excess replication, see the comment on
        // the following Mount::new call.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        // Create the mount after copying the peer groups, because in the case of creating a bind
        // mount inside itself, the new mount would get added to our peer group during the
        // Mount::new call, but we don't want to replicate into it already. For an example see
        // MountTest.QuizBRecursion.
        let mount = Mount::new(what, flags);

        // A child mount created inside a shared mount is itself shared.
        if self.read().is_shared() {
            mount.write().make_shared();
        }

        // Replicate the new mount into every propagation target except ourselves.
        for peer in peers {
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            let clone = mount.clone_mount_recursive();
            peer.write().add_submount_internal(dir, clone);
        }

        self.write().add_submount_internal(dir, mount)
    }

    /// Removes the submount at `mount_hash_key` from this mount and, per shared-subtree
    /// semantics, from the peers that received it through propagation.
    fn remove_submount(self: &MountHandle, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
        // create_submount explains why we need to make a copy of peers.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        for peer in peers {
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            // mount_namespaces(7): If B is shared, then all most-recently-mounted mounts at b on
            // mounts that receive propagation from mount B and do not have submounts under them are
            // unmounted.
            let mut peer = peer.write();
            if let Some(submount) = peer.submounts.get(mount_hash_key) {
                if !submount.mount.read().submounts.is_empty() {
                    // The peer's copy has submounts of its own, so it stays mounted.
                    continue;
                }
            }
            let _ = peer.remove_submount_internal(mount_hash_key);
        }

        self.write().remove_submount_internal(mount_hash_key)
    }

    /// Create a new mount with the same filesystem, flags, and peer group. Used to implement bind
    /// mounts.
    fn clone_mount(
        self: &MountHandle,
        new_root: &DirEntryHandle,
        flags: MountFlags,
    ) -> MountHandle {
        assert!(new_root.is_descendant_of(&self.root));
        // According to mount(2) on bind mounts, all flags other than MS_REC are ignored when doing
        // a bind mount.
        let clone = Self::new_with_root(Arc::clone(new_root), self.mount_flags());

        if flags.contains(MountFlags::REC) {
            // This is two steps because the alternative (locking clone.state while iterating over
            // self.state.submounts) trips tracing_mutex. The lock ordering is parent -> child, and
            // if the clone is eventually made a child of self, this looks like an ordering
            // violation. I'm not convinced it's a real issue, but I can't convince myself it's not
            // either.
            let mut submounts = vec![];
            for Submount { dir, mount } in &self.state.read().submounts {
                submounts.push((dir.clone(), mount.clone_mount_recursive()));
            }
            let mut clone_state = clone.write();
            for (dir, submount) in submounts {
                clone_state.add_submount_internal(&dir, submount);
            }
        }

        // Put the clone in the same peer group
        let peer_group = self.state.read().peer_group().map(Arc::clone);
        if let Some(peer_group) = peer_group {
            clone.write().set_peer_group(peer_group);
        }

        clone
    }

    /// Do a clone of the full mount hierarchy below this mount. Used for creating mount
    /// namespaces and creating copies to use for propagation.
    fn clone_mount_recursive(self: &MountHandle) -> MountHandle {
        self.clone_mount(&self.root, MountFlags::REC)
    }

    /// Changes the propagation type of this mount (and, if `recursive`, of all submounts):
    /// MS_SHARED, MS_PRIVATE, or MS_SLAVE (DOWNSTREAM). Other flags are logged and ignored.
    pub fn change_propagation(self: &MountHandle, flag: MountFlags, recursive: bool) {
        let mut state = self.write();
        match flag {
            MountFlags::SHARED => state.make_shared(),
            MountFlags::PRIVATE => state.make_private(),
            MountFlags::DOWNSTREAM => state.make_downstream(),
            _ => {
                log_warn!("mount propagation {:?}", flag);
                return;
            }
        }

        if recursive {
            for submount in &state.submounts {
                submount.mount.change_propagation(flag, recursive);
            }
        }
    }

    /// Returns the effective flags for the `Mount`, calculated as the union of the mount flags
    /// associated with the `FileSystem`, and with the `Mount` itself.
    fn flags(&self) -> MountFlags {
        MountFlags::from(self.mount_flags()) | self.fs_flags().into()
    }

    /// Returns the mount flags stored unique to this `Mount`.
    fn mount_flags(&self) -> MountpointFlags {
        self.flags.load(Ordering::Relaxed)
    }

    /// Returns the mount flags for the `FileSystem` of this `Mount`.
    fn fs_flags(&self) -> FileSystemFlags {
        self.fs.options.flags
    }

    /// Updates the `Mount` with the per-mount flags specified in `flags`, while preserving the
    /// existing access-time flag if no access-time flag is set in `flags`.
    pub fn update_flags(self: &MountHandle, mut flags: MountpointFlags) {
        // `flags_lock` serializes concurrent remounts; the atomic alone can't make the
        // read-modify-write below atomic.
        let _lock = self.flags_lock.lock();
        // Since Linux 3.17, if none of MS_NOATIME, MS_NODIRATIME,
        // MS_RELATIME, or MS_STRICTATIME is specified in mountflags, then
        // the remount operation preserves the existing values of these
        // flags (rather than defaulting to MS_RELATIME).
        flags.default_atime_from(self.flags.load(Ordering::Relaxed));
        flags &= MountpointFlags::STORED_ON_MOUNT;
        self.flags.store(flags, Ordering::Relaxed);
    }

    /// The number of active clients of this mount.
    ///
    /// The mount cannot be unmounted if there are any active clients.
    fn active_clients(&self) -> usize {
        // We need to subtract one for our own reference. We are not a real client.
        Arc::strong_count(&self.active_client_counter) - 1
    }

    /// Detaches this mount from its parent. Without `UnmountFlags::DETACH`, fails with
    /// `EBUSY` if the mount has active clients or submounts; fails with `EINVAL` if the
    /// mount has no mountpoint (e.g. a namespace root).
    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
        if !flags.contains(UnmountFlags::DETACH) {
            if self.active_clients() > 0 || !self.state.read().submounts.is_empty() {
                return error!(EBUSY);
            }
        }
        let mountpoint = self.state.read().mountpoint().ok_or_else(|| errno!(EINVAL))?;
        let parent_mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
        parent_mount.remove_submount(mountpoint.mount_hash_key())
    }

    /// Returns the security state of the fs.
    pub fn security_state(&self) -> &security::FileSystemState {
        &self.fs.security_state
    }

    /// Returns the name of the fs.
    pub fn fs_name(&self) -> &'static FsStr {
        self.fs.name()
    }

    // Generates `read()` / `write()` accessors for the `state` lock.
    state_accessor!(Mount, state, Arc<Mount>);
}
513
impl MountState {
    /// Returns true if there is a submount on top of `dir_entry`.
    pub fn has_submount(&self, dir_entry: &DirEntryHandle) -> bool {
        self.submounts.contains(ArcKey::ref_cast(dir_entry))
    }

    /// The NamespaceNode on which this Mount is mounted.
    ///
    /// Returns `None` when the mount has no parent, or when the parent has already been dropped
    /// (the parent link is weak).
    fn mountpoint(&self) -> Option<NamespaceNode> {
        let (mount, entry) = self.mountpoint.as_ref()?;
        Some(NamespaceNode::new(mount.upgrade()?, entry.clone()))
    }

    /// Return this mount's current peer group.
    fn peer_group(&self) -> Option<&Arc<PeerGroup>> {
        let (group, _) = self.peer_group_.as_ref()?;
        Some(group)
    }

    /// Remove this mount from its peer group and return the peer group.
    ///
    /// If this mount also had an upstream, the upstream link is handed off to an arbitrary
    /// remaining member of the old group so propagation is not severed.
    fn take_from_peer_group(&mut self) -> Option<Arc<PeerGroup>> {
        let (old_group, old_mount) = self.peer_group_.take()?;
        old_group.remove(old_mount);
        if let Some(upstream) = self.take_from_upstream() {
            let next_mount =
                old_group.state.read().mounts.iter().next().map(|w| w.0.upgrade().unwrap());
            if let Some(next_mount) = next_mount {
                // TODO(https://fxbug.dev/42065259): Fix the lock ordering here. We've locked next_mount
                // while self is locked, and since the propagation tree and mount tree are
                // separate, this could violate the mount -> submount order previously established.
                next_mount.write().set_upstream(upstream);
            }
        }
        Some(old_group)
    }

    /// Returns the peer group this mount receives propagation from, if it is still alive.
    fn upstream(&self) -> Option<Arc<PeerGroup>> {
        self.upstream_.as_ref().and_then(|g| g.0.upgrade())
    }

    /// Remove this mount from its upstream peer group's downstream set and return the group.
    fn take_from_upstream(&mut self) -> Option<Arc<PeerGroup>> {
        let (old_upstream, old_mount) = self.upstream_.take()?;
        // TODO(tbodt): Reason about whether the upgrade() could possibly return None, and what we
        // should actually do in that case.
        let old_upstream = old_upstream.upgrade()?;
        old_upstream.remove_downstream(old_mount);
        Some(old_upstream)
    }
}
562
// Methods in this block are generated with access to `self.base`, the `Arc<Mount>` that owns
// this state (via the `state_implementation!` macro).
#[apply(state_implementation!)]
impl MountState<Base = Mount, BaseType = Arc<Mount>> {
    /// Add a child mount *without propagating it to the peer group*. For internal use only.
    fn add_submount_internal(&mut self, dir: &DirEntryHandle, mount: MountHandle) {
        if !dir.is_descendant_of(&self.base.root) {
            // Ignore a mountpoint that is not under this mount's root.
            return;
        }

        // Register the new submount in the kernel-wide mount registry.
        let submount = mount.fs.kernel.upgrade().unwrap().mounts.register_mount(dir, mount.clone());
        let old_mountpoint =
            mount.state.write().mountpoint.replace((Arc::downgrade(self.base), Arc::clone(dir)));
        assert!(old_mountpoint.is_none(), "add_submount can only take a newly created mount");
        // Mount shadowing is implemented by mounting onto the root of the first mount, not by
        // creating two mounts on the same mountpoint.
        let old_mount = self.submounts.replace(submount);

        // In rare cases, mount propagation might result in a request to mount on a directory where
        // something is already mounted. MountTest.LotsOfShadowing will trigger this. Linux handles
        // this by inserting the new mount between the old mount and the current mount.
        if let Some(mut old_mount) = old_mount {
            // Previous state: self[dir] = old_mount
            // New state: self[dir] = new_mount, new_mount[new_mount.root] = old_mount
            // The new mount has already been inserted into self, now just update the old mount to
            // be a child of the new mount.
            old_mount.mount.write().mountpoint = Some((Arc::downgrade(&mount), Arc::clone(dir)));
            old_mount.dir = ArcKey(mount.root.clone());
            mount.write().submounts.insert(old_mount);
        }
    }

    /// Remove the submount keyed by `mount_hash_key`; `EINVAL` if nothing is mounted there.
    fn remove_submount_internal(&mut self, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
        if self.submounts.remove(mount_hash_key) { Ok(()) } else { error!(EINVAL) }
    }

    /// Set this mount's peer group.
    fn set_peer_group(&mut self, group: Arc<PeerGroup>) {
        // Leave any previous group first so membership stays consistent.
        self.take_from_peer_group();
        group.add(self.base);
        self.peer_group_ = Some((group, Arc::as_ptr(self.base).into()));
    }

    /// Set the peer group this mount receives propagation from.
    fn set_upstream(&mut self, group: Arc<PeerGroup>) {
        self.take_from_upstream();
        group.add_downstream(self.base);
        self.upstream_ = Some((Arc::downgrade(&group), Arc::as_ptr(self.base).into()));
    }

    /// Is the mount in a peer group? Corresponds to MS_SHARED.
    pub fn is_shared(&self) -> bool {
        self.peer_group().is_some()
    }

    /// Put the mount in a peer group. Implements MS_SHARED.
    pub fn make_shared(&mut self) {
        if self.is_shared() {
            // Already shared; MS_SHARED is idempotent.
            return;
        }
        let kernel =
            self.base.fs.kernel.upgrade().expect("can't create new peer group without kernel");
        self.set_peer_group(PeerGroup::new(kernel.get_next_peer_group_id()));
    }

    /// Take the mount out of its peer group, also remove upstream if any. Implements MS_PRIVATE.
    pub fn make_private(&mut self) {
        self.take_from_peer_group();
        self.take_from_upstream();
    }

    /// Take the mount out of its peer group and make it downstream instead. Implements
    /// MountFlags::DOWNSTREAM (MS_SLAVE).
    pub fn make_downstream(&mut self) {
        if let Some(peer_group) = self.take_from_peer_group() {
            self.set_upstream(peer_group);
        }
    }
}
639
640impl PeerGroup {
641    fn new(id: u64) -> Arc<Self> {
642        Arc::new(Self { id, state: Default::default() })
643    }
644
645    fn add(&self, mount: &Arc<Mount>) {
646        self.state.write().mounts.insert(WeakKey::from(mount));
647    }
648
649    fn remove(&self, mount: PtrKey<Mount>) {
650        self.state.write().mounts.remove(&mount);
651    }
652
653    fn add_downstream(&self, mount: &Arc<Mount>) {
654        self.state.write().downstream.insert(WeakKey::from(mount));
655    }
656
657    fn remove_downstream(&self, mount: PtrKey<Mount>) {
658        self.state.write().downstream.remove(&mount);
659    }
660
661    fn copy_propagation_targets(&self) -> Vec<MountHandle> {
662        let mut buf = vec![];
663        self.collect_propagation_targets(&mut buf);
664        buf
665    }
666
667    fn collect_propagation_targets(&self, buf: &mut Vec<MountHandle>) {
668        let downstream_mounts: Vec<_> = {
669            let state = self.state.read();
670            buf.extend(state.mounts.iter().filter_map(|m| m.0.upgrade()));
671            state.downstream.iter().filter_map(|m| m.0.upgrade()).collect()
672        };
673        for mount in downstream_mounts {
674            let peer_group = mount.read().peer_group().map(Arc::clone);
675            match peer_group {
676                Some(group) => group.collect_propagation_targets(buf),
677                None => buf.push(mount),
678            }
679        }
680    }
681}
682
impl Drop for Mount {
    fn drop(&mut self) {
        // Detach from the propagation topology so peer groups don't retain stale
        // entries for this mount.
        let state = self.state.get_mut();
        state.take_from_peer_group();
        state.take_from_upstream();
    }
}
690
691impl fmt::Debug for Mount {
692    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
693        let state = self.state.read();
694        f.debug_struct("Mount")
695            .field("id", &(self as *const Mount))
696            .field("root", &self.root)
697            .field("mountpoint", &state.mountpoint)
698            .field("submounts", &state.submounts)
699            .finish()
700    }
701}
702
// Id allocators for mounts, peer groups, and namespaces; each delegates to a
// dedicated monotonic counter on the kernel.
impl Kernel {
    /// Returns the next unique mount id.
    pub fn get_next_mount_id(&self) -> u64 {
        self.next_mount_id.next()
    }

    /// Returns the next unique peer-group id.
    pub fn get_next_peer_group_id(&self) -> u64 {
        self.next_peer_group_id.next()
    }

    /// Returns the next unique namespace id.
    pub fn get_next_namespace_id(&self) -> u64 {
        self.next_namespace_id.next()
    }
}
716
717impl CurrentTask {
718    pub fn create_filesystem(
719        &self,
720        locked: &mut Locked<Unlocked>,
721        fs_type: &FsStr,
722        options: FileSystemOptions,
723    ) -> Result<FileSystemHandle, Errno> {
724        // Please register new file systems via //src/starnix/modules/lib.rs, even if the file
725        // system is implemented inside starnix_core.
726        //
727        // Most file systems should be implemented as modules. The VFS provides various traits that
728        // let starnix_core integrate file systems without needing to depend on the file systems
729        // directly.
730        self.kernel()
731            .expando
732            .get::<FsRegistry>()
733            .create(locked, self, fs_type, options)
734            .ok_or_else(|| errno!(ENODEV, fs_type))?
735    }
736}
737
738struct ProcMountsFileSource(WeakRef<Task>);
739
impl DynamicFileSource for ProcMountsFileSource {
    /// Writes one line per mount reachable from the task's root, in the
    /// `/proc/<pid>/mounts` format: "source mountpoint fstype options 0 0".
    fn generate(
        &self,
        _current_task: &CurrentTask,
        sink: &mut DynamicFileBuf,
    ) -> Result<(), Errno> {
        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
        // extra work to maintain it.
        let task = Task::from_weak(&self.0)?;
        let task_fs = task.live()?.fs.read();
        let root = task_fs.root();
        let ns = task_fs.namespace();
        for_each_mount(&ns.root_mount, &mut |mount| {
            // A mount with no mountpoint (e.g. the namespace root) is reported at its own root.
            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
            if !mountpoint.is_descendant_of(&root) {
                // Skip mounts that are not visible from the task's root.
                return Ok(());
            }
            write!(
                sink,
                "{} {} {} {}{}",
                mount.fs.options.source_for_display(),
                mountpoint.path(&task_fs),
                mount.fs.name(),
                // Report the union of the FileSystem and Mount flags, as well as any FileSystem-
                // or LSM-specific options.
                mount.flags(),
                security::sb_show_options(&task.kernel(), &mount.fs)?,
            )?;
            writeln!(sink, " 0 0")?;
            Ok(())
        })?;
        Ok(())
    }
}
776
/// File object backing the procfs `mounts` file; reads are delegated to the
/// inner [`DynamicFile`], which regenerates its contents from
/// [`ProcMountsFileSource`].
pub struct ProcMountsFile {
    // Generates and buffers the mounts listing on demand.
    dynamic_file: DynamicFile<ProcMountsFileSource>,
}
780
781impl ProcMountsFile {
782    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
783        SimpleFileNode::new(move |_, _| {
784            Ok(Self { dynamic_file: DynamicFile::new(ProcMountsFileSource(task.clone())) })
785        })
786    }
787}
788
impl FileOps for ProcMountsFile {
    // Reads, writes, and seeks are serviced by the inner DynamicFile.
    fileops_impl_delegate_read_write_and_seek!(self, self.dynamic_file);
    fileops_impl_noop_sync!();

    fn wait_async(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        waiter: &Waiter,
        _events: FdEvents,
        _handler: EventHandler,
    ) -> Option<WaitCanceler> {
        // Polling this file gives notifications when any change to mounts occurs. This is not
        // implemented yet, but stubbed for Android init.
        Some(waiter.fake_wait())
    }

    fn query_events(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
    ) -> Result<FdEvents, Errno> {
        // No events are ever signalled, consistent with the stubbed wait above.
        Ok(FdEvents::empty())
    }
}
816
/// Data source for the procfs `mountinfo` file of the referenced task.
#[derive(Clone)]
pub struct ProcMountinfoFile(WeakRef<Task>);
819impl ProcMountinfoFile {
820    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
821        DynamicFile::new_node(Self(task))
822    }
823}
impl DynamicFileSource for ProcMountinfoFile {
    /// Writes one `mountinfo`-style line per mount visible from the task's root:
    /// `<id> <parent-id> <dev> <fs-root> <mountpoint> <flags> [shared:N] [master:N]
    /// - <fstype> <source> <sb-flags><lsm-options>`.
    fn generate(
        &self,
        _current_task: &CurrentTask,
        sink: &mut DynamicFileBuf,
    ) -> Result<(), Errno> {
        // Returns path to the `dir` from the root of the file system.
        fn path_from_fs_root(dir: &DirEntryHandle) -> FsString {
            let mut path = PathBuilder::new();
            if dir.is_dead() {
                // Return `/foo/dir//deleted` if the dir was deleted.
                path.prepend_element("/deleted".into());
            }
            let scope = RcuReadScope::new();
            // Walk parent links up to the filesystem root, prepending each name.
            let mut current = dir.deref();
            while let Some(parent) = current.parent_ref(&scope) {
                path.prepend_element(current.local_name(&scope));
                current = parent;
            }
            path.build_absolute()
        }

        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
        // extra work to maintain it.
        let task = Task::from_weak(&self.0)?;
        let task_fs = task.live()?.fs.read();
        let root = task_fs.root();
        let ns = task_fs.namespace();
        for_each_mount(&ns.root_mount, &mut |mount| {
            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
            // Mounts not visible from the task's root (e.g. after chroot) are skipped.
            if !mountpoint.is_descendant_of(&root) {
                return Ok(());
            }
            // Can't fail, mountpoint() and root() can't return a NamespaceNode with no mount
            let parent = mountpoint.mount.as_ref().unwrap();
            write!(
                sink,
                "{} {} {} {} {} {}",
                mount.id,
                parent.id,
                mount.root.node.fs().dev_id,
                path_from_fs_root(&mount.root),
                mountpoint.path(&task_fs),
                mount.mount_flags(),
            )?;
            // Optional fields: peer group for shared mounts, upstream for slave mounts.
            if let Some(peer_group) = mount.read().peer_group() {
                write!(sink, " shared:{}", peer_group.id)?;
            }
            if let Some(upstream) = mount.read().upstream() {
                write!(sink, " master:{}", upstream.id)?;
            }
            writeln!(
                sink,
                " - {} {} {}{}",
                mount.fs.name(),
                mount.fs.options.source_for_display(),
                mount.fs_flags(),
                // LSM options are associated with the FileSystem rather than the Mount.
                security::sb_show_options(&task.kernel(), &mount.fs)?
            )?;
            Ok(())
        })?;
        Ok(())
    }
}
891
892fn for_each_mount<E>(
893    mount: &MountHandle,
894    callback: &mut impl FnMut(&MountHandle) -> Result<(), E>,
895) -> Result<(), E> {
896    callback(mount)?;
897    // Collect list first to avoid self deadlock when ProcMountinfoFile::read_at tries to call
898    // NamespaceNode::path()
899    let submounts: Vec<_> = mount.read().submounts.iter().map(|s| s.mount.clone()).collect();
900    for submount in submounts {
901        for_each_mount(&submount, callback)?;
902    }
903    Ok(())
904}
905
/// The `SymlinkMode` enum encodes how symlinks are followed during path traversal.
#[derive(Default, PartialEq, Eq, Copy, Clone, Debug)]
pub enum SymlinkMode {
    /// Follow a symlink at the end of a path resolution (the default).
    #[default]
    Follow,

    /// Do not follow a symlink at the end of a path resolution.
    NoFollow,
}
916
/// The maximum number of symlink traversals that can be made during path resolution.
///
/// Exhausting this budget makes resolution fail with ELOOP; 40 matches Linux's
/// MAXSYMLINKS limit.
pub const MAX_SYMLINK_FOLLOWS: u8 = 40;
919
/// The context passed during namespace lookups.
///
/// Namespace lookups need to mutate a shared context in order to correctly
/// count the number of remaining symlink traversals.
pub struct LookupContext {
    /// The SymlinkMode for the lookup.
    ///
    /// Determines whether a symlink found at the end of path resolution is
    /// followed or returned as-is.
    pub symlink_mode: SymlinkMode,

    /// The number of symlinks remaining to follow.
    ///
    /// Each time path resolution calls readlink, this value is decremented.
    pub remaining_follows: u8,

    /// Whether the result of the lookup must be a directory.
    ///
    /// For example, if the path ends with a `/` or if userspace passes
    /// O_DIRECTORY. This flag can be set to true if the lookup encounters a
    /// symlink that ends with a `/`.
    pub must_be_directory: bool,

    /// Resolve flags passed to `openat2`. Empty if the lookup originated in any other syscall.
    pub resolve_flags: ResolveFlags,

    /// Base directory for the lookup. Set only when either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT`
    /// is passed to `openat2`.
    pub resolve_base: ResolveBase,
}
950
/// Used to specify base directory in `LookupContext` for lookups originating in the `openat2`
/// syscall with either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT` flag.
#[derive(Clone, Eq, PartialEq)]
pub enum ResolveBase {
    /// No base restriction applies to the lookup.
    None,

    /// The lookup is not allowed to traverse any node that's not beneath the specified node.
    Beneath(NamespaceNode),

    /// The lookup should be handled as if the specified node were the file-system root.
    InRoot(NamespaceNode),
}
963
964impl LookupContext {
965    pub fn new(symlink_mode: SymlinkMode) -> LookupContext {
966        LookupContext {
967            symlink_mode,
968            remaining_follows: MAX_SYMLINK_FOLLOWS,
969            must_be_directory: false,
970            resolve_flags: ResolveFlags::empty(),
971            resolve_base: ResolveBase::None,
972        }
973    }
974
975    pub fn with(&self, symlink_mode: SymlinkMode) -> LookupContext {
976        LookupContext { symlink_mode, resolve_base: self.resolve_base.clone(), ..*self }
977    }
978
979    pub fn update_for_path(&mut self, path: &FsStr) {
980        if path.last() == Some(&b'/') {
981            // The last path element must resolve to a directory. This is because a trailing slash
982            // was found in the path.
983            self.must_be_directory = true;
984            // If the last path element is a symlink, we should follow it.
985            // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
986            self.symlink_mode = SymlinkMode::Follow;
987        }
988    }
989}
990
991impl Default for LookupContext {
992    fn default() -> Self {
993        LookupContext::new(SymlinkMode::Follow)
994    }
995}
996
/// Whether the path is reachable from the given root.
///
/// Produced by [`NamespaceNode::path_from_root`], where a node may sit outside
/// a task's `chroot` root.
pub enum PathWithReachability {
    /// The path is reachable from the given root.
    Reachable(FsString),

    /// The path is not reachable from the given root.
    Unreachable(FsString),
}
1005
1006impl PathWithReachability {
1007    pub fn into_path(self) -> FsString {
1008        match self {
1009            PathWithReachability::Reachable(path) => path,
1010            PathWithReachability::Unreachable(path) => path,
1011        }
1012    }
1013}
1014
/// A node in a mount namespace.
///
/// This tree is a composite of the mount tree and the FsNode tree.
///
/// These nodes are used when traversing paths in a namespace in order to
/// present the client the directory structure that includes the mounted
/// filesystems.
#[derive(Clone)]
pub struct NamespaceNode {
    /// The mount where this namespace node is mounted.
    ///
    /// A given FsNode can be mounted in multiple places in a namespace. This
    /// field distinguishes between them.
    ///
    /// Holds no mount for anonymous nodes created via `new_anonymous`, which
    /// are not part of any namespace.
    pub mount: MountInfo,

    /// The FsNode that corresponds to this namespace entry.
    pub entry: DirEntryHandle,
}
1033
1034impl NamespaceNode {
1035    pub fn new(mount: MountHandle, entry: DirEntryHandle) -> Self {
1036        Self { mount: Some(mount).into(), entry }
1037    }
1038
1039    /// Create a namespace node that is not mounted in a namespace.
1040    pub fn new_anonymous(entry: DirEntryHandle) -> Self {
1041        Self { mount: None.into(), entry }
1042    }
1043
1044    /// Create a namespace node that is not mounted in a namespace and that refers to a node that
1045    /// is not rooted in a hierarchy and has no name.
1046    pub fn new_anonymous_unrooted(current_task: &CurrentTask, node: FsNodeHandle) -> Self {
1047        let dir_entry = DirEntry::new_unrooted(node);
1048        let _ = security::fs_node_init_with_dentry_no_xattr(current_task, &dir_entry);
1049        Self::new_anonymous(dir_entry)
1050    }
1051
    /// Create a FileObject corresponding to this namespace node.
    ///
    /// This function is the primary way of instantiating FileObjects. Each
    /// FileObject records the NamespaceNode that created it in order to
    /// remember its path in the Namespace.
    ///
    /// `access_check` is forwarded to the node's `open`; errors from the node
    /// open or from FileObject construction are propagated.
    pub fn open(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        flags: OpenFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        let ops = self.entry.node.open(locked, current_task, self, flags, access_check)?;
        FileObject::new(locked, current_task, ops, self.clone(), flags)
    }
1067
    /// Create or open a node in the file system.
    ///
    /// Works for any type of node other than a symlink.
    ///
    /// Will return an existing node unless `flags` contains `OpenFlags::EXCL`.
    ///
    /// The new node is owned by the task's current fscred, and `mode` is
    /// filtered through the task's umask before creation.
    pub fn open_create_node<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        mode: FileMode,
        dev: DeviceId,
        flags: OpenFlags,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let mode = current_task.fs().apply_umask(mode);
        let create_fn =
            |locked: &mut Locked<L>, dir: &FsNodeHandle, mount: &MountInfo, name: &_| {
                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
            };
        // With O_EXCL, creation must fail if the entry already exists; otherwise
        // get_or_create_entry hands back an existing entry when present.
        let entry = if flags.contains(OpenFlags::EXCL) {
            self.entry.create_entry(locked, current_task, &self.mount, name, create_fn)
        } else {
            self.entry.get_or_create_entry(locked, current_task, &self.mount, name, create_fn)
        }?;
        Ok(self.with_new_entry(entry))
    }
1098
    /// Wraps this node in an `ActiveNamespaceNode`.
    pub fn into_active(self) -> ActiveNamespaceNode {
        ActiveNamespaceNode::new(self)
    }
1102
    /// Converts this node into a `FileMapping` via its active form; `mode` is
    /// forwarded to `ActiveNamespaceNode::into_mapping`.
    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
        self.into_active().into_mapping(mode)
    }
1106
    /// Create a node in the file system.
    ///
    /// Works for any type of node other than a symlink.
    ///
    /// Does not return an existing node.
    ///
    /// The new node is owned by the task's current fscred, and `mode` is
    /// filtered through the task's umask before creation.
    pub fn create_node<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        mode: FileMode,
        dev: DeviceId,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let mode = current_task.fs().apply_umask(mode);
        let entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
            },
        )?;
        Ok(self.with_new_entry(entry))
    }
1136
    /// Create a symlink in the file system.
    ///
    /// To create another type of node, use `create_node`.
    ///
    /// The symlink is named `name`, points at `target`, and is owned by the
    /// task's current fscred. Note that, unlike `create_node`, no umask is
    /// applied here.
    pub fn create_symlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        target: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let owner = current_task.current_fscred();
        let entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                dir.create_symlink(locked, current_task, mount, name, target, owner)
            },
        )?;
        Ok(self.with_new_entry(entry))
    }
1162
1163    /// Creates an anonymous file.
1164    ///
1165    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
1166    ///
1167    /// Used by O_TMPFILE.
1168    pub fn create_tmpfile<L>(
1169        &self,
1170        locked: &mut Locked<L>,
1171        current_task: &CurrentTask,
1172        mode: FileMode,
1173        flags: OpenFlags,
1174    ) -> Result<NamespaceNode, Errno>
1175    where
1176        L: LockEqualOrBefore<FileOpsCore>,
1177    {
1178        let owner = current_task.current_fscred();
1179        let mode = current_task.fs().apply_umask(mode);
1180        Ok(self.with_new_entry(self.entry.create_tmpfile(
1181            locked,
1182            current_task,
1183            &self.mount,
1184            mode,
1185            owner,
1186            flags,
1187        )?))
1188    }
1189
    /// Creates a hard link named `name` in this directory referring to the
    /// existing node `child`, returning the namespace node for the new entry.
    pub fn link<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        child: &FsNodeHandle,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let dir_entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| dir.link(locked, current_task, mount, name, child),
        )?;
        Ok(self.with_new_entry(dir_entry))
    }
1209
    /// Creates a file-system node named `name` and binds `socket` to it, as for
    /// `bind(2)` on a file-system socket address.
    ///
    /// Only Unix-domain sockets can be bound to a file-system node; any other
    /// socket type fails with ENOTSUP.
    pub fn bind_socket<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        socket: SocketHandle,
        socket_address: SocketAddress,
        mode: FileMode,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let dir_entry = self.entry.create_entry(
            locked,
            current_task,
            &self.mount,
            name,
            |locked, dir, mount, name| {
                // Create the on-disk node first, then attach the socket to it.
                let node = dir.create_node(
                    locked,
                    current_task,
                    mount,
                    name,
                    mode,
                    DeviceId::NONE,
                    current_task.current_fscred(),
                )?;
                if let Some(unix_socket) = socket.downcast_socket::<UnixSocket>() {
                    unix_socket.bind_socket_to_node(&socket, socket_address, &node)?;
                } else {
                    return error!(ENOTSUP);
                }
                Ok(node)
            },
        )?;
        Ok(self.with_new_entry(dir_entry))
    }
1247
1248    pub fn unlink<L>(
1249        &self,
1250        locked: &mut Locked<L>,
1251        current_task: &CurrentTask,
1252        name: &FsStr,
1253        kind: UnlinkKind,
1254        must_be_directory: bool,
1255    ) -> Result<(), Errno>
1256    where
1257        L: LockEqualOrBefore<FileOpsCore>,
1258    {
1259        if DirEntry::is_reserved_name(name) {
1260            match kind {
1261                UnlinkKind::Directory => {
1262                    if name == ".." {
1263                        error!(ENOTEMPTY)
1264                    } else if self.parent().is_none() {
1265                        // The client is attempting to remove the root.
1266                        error!(EBUSY)
1267                    } else {
1268                        error!(EINVAL)
1269                    }
1270                }
1271                UnlinkKind::NonDirectory => error!(ENOTDIR),
1272            }
1273        } else {
1274            self.entry.unlink(locked, current_task, &self.mount, name, kind, must_be_directory)
1275        }
1276    }
1277
1278    // Resolve the current node.
1279    //
1280    // Depending on context, this will resolve symlink and mount point.
1281    fn resolve<L>(
1282        self,
1283        locked: &mut Locked<L>,
1284        current_task: &CurrentTask,
1285        context: &mut LookupContext,
1286    ) -> Result<NamespaceNode, Errno>
1287    where
1288        L: LockEqualOrBefore<FileOpsCore>,
1289    {
1290        let mut node = self;
1291
1292        loop {
1293            if !node.entry.node.is_lnk() || context.symlink_mode == SymlinkMode::NoFollow {
1294                break;
1295            }
1296            if context.remaining_follows == 0
1297                || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
1298            {
1299                return error!(ELOOP);
1300            }
1301            context.remaining_follows -= 1;
1302            node = match node.readlink(locked, current_task)? {
1303                SymlinkTarget::Path(link_target) => {
1304                    let link_directory = if link_target[0] == b'/' {
1305                        // If the path is absolute, we'll resolve the root directory.
1306                        match &context.resolve_base {
1307                            ResolveBase::None => current_task.fs().root(),
1308                            ResolveBase::Beneath(_) => return error!(EXDEV),
1309                            ResolveBase::InRoot(root) => root.clone(),
1310                        }
1311                    } else {
1312                        // If the path is not absolute, it's a relative directory.
1313                        // Let's try to get the parent of the current node, or in the case that
1314                        // the node is the root we can just use that directly.
1315                        node.parent().unwrap_or(node)
1316                    };
1317                    current_task.lookup_path(
1318                        locked,
1319                        context,
1320                        link_directory,
1321                        link_target.as_ref(),
1322                    )?
1323                }
1324                SymlinkTarget::Node(node) => {
1325                    if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS) {
1326                        return error!(ELOOP);
1327                    }
1328                    node
1329                }
1330            };
1331        }
1332        Ok(node.enter_mount())
1333    }
1334
    /// Traverse down a parent-to-child link in the namespace.
    ///
    /// Convenience wrapper around [`Self::lookup_children`] for a single path
    /// component.
    pub fn lookup_child<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        context: &mut LookupContext,
        basename: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.lookup_children(locked, current_task, context, &[basename])
    }
1348
    /// Traverse down a parent-to-child link in the namespace.
    ///
    /// Processes the components of `basenames` in order: "" and "." are
    /// skipped, ".." moves up (subject to chroot and `openat2` restrictions),
    /// and ordinary names are looked up one at a time — or in batches when the
    /// node's ops support pipelined lookup. Each resolved component is checked
    /// against `RESOLVE_NO_XDEV` and the `must_be_directory` constraint.
    pub fn lookup_children<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        context: &mut LookupContext,
        mut basenames: &[&FsStr],
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        // Reject over-long components up front, before touching the tree.
        for name in basenames {
            if name.len() > NAME_MAX as usize {
                return error!(ENAMETOOLONG);
            }
        }

        let mut current_namespace_node = self.clone();

        while basenames.len() > 0 {
            // Every intermediate component must be looked up inside a directory.
            if !current_namespace_node.entry.node.is_dir() {
                return error!(ENOTDIR);
            }

            let basename = basenames[0];
            // Empty components and "." are no-ops.
            if basename.is_empty() || basename == "." {
                basenames = &basenames[1..];
                continue;
            }
            if basename == ".." {
                let root = match &context.resolve_base {
                    ResolveBase::None => current_task.fs().root(),
                    ResolveBase::Beneath(node) => {
                        // Do not allow traversal out of the 'node'.
                        if current_namespace_node == *node {
                            return error!(EXDEV);
                        }
                        current_task.fs().root()
                    }
                    ResolveBase::InRoot(root) => root.clone(),
                };

                // Make sure this can't escape a chroot.
                if current_namespace_node != root {
                    current_namespace_node =
                        current_namespace_node.parent().unwrap_or(current_namespace_node)
                }
                // RESOLVE_NO_XDEV: ".." must not cross onto a different mount.
                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
                    && current_namespace_node.mount != self.mount
                {
                    return error!(EXDEV);
                }

                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
                    return error!(ENOTDIR);
                }
                basenames = &basenames[1..];
                continue;
            }
            // Slow path: a single remaining component, or a node whose ops cannot
            // batch lookups — resolve one component at a time.
            if basenames.len() == 1
                || !current_namespace_node.entry.node.ops().has_lookup_pipelined()
            {
                current_namespace_node = current_namespace_node.with_new_entry(
                    current_namespace_node.entry.component_lookup(
                        locked,
                        current_task,
                        &current_namespace_node.mount,
                        basename,
                    )?,
                );

                // Follow symlinks and enter any covering mount.
                current_namespace_node =
                    current_namespace_node.resolve(locked, current_task, context)?;

                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
                    && current_namespace_node.mount != self.mount
                {
                    return error!(EXDEV);
                }

                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
                    return error!(ENOTDIR);
                }

                basenames = &basenames[1..];
                continue;
            }

            // Pipelined path: batch the run of plain components (stop at the first
            // "", "." or "..", which need the special handling above).
            let pipelined_basenames = if let Some(pos) =
                basenames.iter().position(|&name| name.is_empty() || name == "." || name == "..")
            {
                &basenames[..pos]
            } else {
                basenames
            };
            let precomputed_entries = current_namespace_node.entry.get_children_pipelined(
                locked,
                current_task,
                &current_namespace_node.mount,
                pipelined_basenames,
            );
            for entry in precomputed_entries {
                basenames = &basenames[1..];
                let child = current_namespace_node.with_new_entry(entry?);

                current_namespace_node = child.clone().resolve(locked, current_task, context)?;

                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
                    && current_namespace_node.mount != self.mount
                {
                    return error!(EXDEV);
                }

                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
                    return error!(ENOTDIR);
                }

                // If resolve() moved us (symlink or mount crossing), the remaining
                // precomputed entries are relative to the wrong directory: restart
                // the outer loop from the new position.
                if current_namespace_node != child {
                    break;
                }
            }
        }

        Ok(current_namespace_node)
    }
1474
1475    /// Traverse up a child-to-parent link in the namespace.
1476    ///
1477    /// This traversal matches the child-to-parent link in the underlying
1478    /// FsNode except at mountpoints, where the link switches from one
1479    /// filesystem to another.
1480    pub fn parent(&self) -> Option<NamespaceNode> {
1481        let mountpoint_or_self = self.escape_mount();
1482        let parent = mountpoint_or_self.entry.parent()?;
1483        Some(mountpoint_or_self.with_new_entry(parent))
1484    }
1485
1486    /// Returns the parent, but does not escape mounts i.e. returns None if this node
1487    /// is the root of a mount.
1488    pub fn parent_within_mount(&self) -> Option<DirEntryHandle> {
1489        if let Ok(_) = self.mount_if_root() {
1490            return None;
1491        }
1492        self.entry.parent()
1493    }
1494
1495    /// Whether this namespace node is a descendant of the given node.
1496    ///
1497    /// Walks up the namespace node tree looking for ancestor. If ancestor is
1498    /// found, returns true. Otherwise, returns false.
1499    pub fn is_descendant_of(&self, ancestor: &NamespaceNode) -> bool {
1500        let ancestor = ancestor.escape_mount();
1501        let mut current = self.escape_mount();
1502        while current != ancestor {
1503            if let Some(parent) = current.parent() {
1504                current = parent.escape_mount();
1505            } else {
1506                return false;
1507            }
1508        }
1509        true
1510    }
1511
1512    /// If this is a mount point, return the root of the mount. Otherwise return self.
1513    fn enter_mount(&self) -> NamespaceNode {
1514        // While the child is a mountpoint, replace child with the mount's root.
1515        fn enter_one_mount(node: &NamespaceNode) -> Option<NamespaceNode> {
1516            if let Some(mount) = node.mount.deref() {
1517                if let Some(submount) =
1518                    mount.state.read().submounts.get(ArcKey::ref_cast(&node.entry))
1519                {
1520                    return Some(submount.mount.root());
1521                }
1522            }
1523            None
1524        }
1525        let mut inner = self.clone();
1526        while let Some(inner_root) = enter_one_mount(&inner) {
1527            inner = inner_root;
1528        }
1529        inner
1530    }
1531
1532    /// If this is the root of a mount, return the mount point. Otherwise return self.
1533    ///
1534    /// This is not exactly the same as parent(). If parent() is called on a root, it will escape
1535    /// the mount, but then return the parent of the mount point instead of the mount point.
1536    fn escape_mount(&self) -> NamespaceNode {
1537        let mut mountpoint_or_self = self.clone();
1538        while let Some(mountpoint) = mountpoint_or_self.mountpoint() {
1539            mountpoint_or_self = mountpoint;
1540        }
1541        mountpoint_or_self
1542    }
1543
1544    /// If this node is the root of a mount, return it. Otherwise EINVAL.
1545    pub fn mount_if_root(&self) -> Result<&MountHandle, Errno> {
1546        if let Some(mount) = self.mount.deref() {
1547            if Arc::ptr_eq(&self.entry, &mount.root) {
1548                return Ok(mount);
1549            }
1550        }
1551        error!(EINVAL)
1552    }
1553
    /// Returns the mountpoint at this location in the namespace.
    ///
    /// If this node is mounted in another node, this function returns the node
    /// at which this node is mounted. Otherwise, returns None.
    ///
    /// In particular, returns None when this node is not the root of a mount.
    fn mountpoint(&self) -> Option<NamespaceNode> {
        self.mount_if_root().ok()?.read().mountpoint()
    }
1561
    /// The path from the filesystem root to this node.
    ///
    /// The root comes from `fs` (and so respects chroot); reachability
    /// information is discarded.
    pub fn path(&self, fs: &FsContext) -> FsString {
        self.path_from_root(Some(&fs.root())).into_path()
    }
1566
    /// The path from the root of the namespace to this node.
    ///
    /// Unlike `path`, this ignores any chroot and always walks to the
    /// namespace root.
    pub fn path_escaping_chroot(&self) -> FsString {
        self.path_from_root(None).into_path()
    }
1571
    /// Returns the path to this node, accounting for a custom root.
    /// A task may have a custom root set by `chroot`.
    ///
    /// With `root == None` the walk ends at the namespace root and the result
    /// is always `Reachable`. With a custom root, hitting the namespace root
    /// before intersecting `root` yields `Unreachable` carrying the path
    /// assembled so far. Dead (unlinked) entries get a " (deleted)" suffix.
    pub fn path_from_root(&self, root: Option<&NamespaceNode>) -> PathWithReachability {
        // Anonymous nodes (no mount) have no namespace path; report the node's
        // internal name instead.
        if self.mount.is_none() {
            return PathWithReachability::Reachable(self.entry.node.internal_name());
        }

        let mut path = PathBuilder::new();
        let mut current = self.escape_mount();
        if let Some(root) = root {
            let scope = RcuReadScope::new();
            // The current node is expected to intersect with the custom root as we travel up the tree.
            let root = root.escape_mount();
            while current != root {
                if let Some(parent) = current.parent() {
                    path.prepend_element(current.entry.local_name(&scope));
                    current = parent.escape_mount();
                } else {
                    // This node hasn't intersected with the custom root and has reached the namespace root.
                    let mut absolute_path = path.build_absolute();
                    if self.entry.is_dead() {
                        absolute_path.extend_from_slice(b" (deleted)");
                    }

                    return PathWithReachability::Unreachable(absolute_path);
                }
            }
        } else {
            // No custom root, so travel up the tree to the namespace root.
            let scope = RcuReadScope::new();
            while let Some(parent) = current.parent() {
                path.prepend_element(current.entry.local_name(&scope));
                current = parent.escape_mount();
            }
        }

        let mut absolute_path = path.build_absolute();
        if self.entry.is_dead() {
            absolute_path.extend_from_slice(b" (deleted)");
        }

        PathWithReachability::Reachable(absolute_path)
    }
1615
1616    pub fn mount(&self, what: WhatToMount, flags: MountpointFlags) -> Result<(), Errno> {
1617        let mountpoint = self.enter_mount();
1618        let mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
1619        mount.create_submount(&mountpoint.entry, what, flags);
1620        Ok(())
1621    }
1622
1623    /// If this is the root of a filesystem, unmount. Otherwise return EINVAL.
1624    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
1625        let mount = self.enter_mount().mount_if_root()?.clone();
1626        mount.unmount(flags)
1627    }
1628
    /// Renames `old_name` in `old_parent` to `new_name` in `new_parent`.
    ///
    /// Delegates to `DirEntry::rename`, forwarding each parent's mount so
    /// mount-related checks can be applied by the entry layer.
    pub fn rename<L>(
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        old_parent: &NamespaceNode,
        old_name: &FsStr,
        new_parent: &NamespaceNode,
        new_name: &FsStr,
        flags: RenameFlags,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        DirEntry::rename(
            locked,
            current_task,
            &old_parent.entry,
            &old_parent.mount,
            old_name,
            &new_parent.entry,
            &new_parent.mount,
            new_name,
            flags,
        )
    }
1653
1654    fn with_new_entry(&self, entry: DirEntryHandle) -> NamespaceNode {
1655        Self { mount: self.mount.clone(), entry }
1656    }
1657
    /// Returns this node's entry wrapped as an `ArcKey`, suitable for use as a
    /// key in the mount maps (which are keyed by `ArcKey<DirEntry>`).
    fn mount_hash_key(&self) -> &ArcKey<DirEntry> {
        ArcKey::ref_cast(&self.entry)
    }
1661
1662    pub fn suid_and_sgid(&self, current_task: &CurrentTask) -> Result<UserAndOrGroupId, Errno> {
1663        if self.mount.flags().contains(MountFlags::NOSUID) {
1664            Ok(UserAndOrGroupId::default())
1665        } else {
1666            self.entry.node.info().suid_and_sgid(current_task, &self.entry.node)
1667        }
1668    }
1669
1670    pub fn update_atime(&self) {
1671        // Do not update the atime of this node if it is mounted with the NOATIME flag.
1672        if !self.mount.flags().contains(MountFlags::NOATIME) {
1673            self.entry.node.update_info(|info| {
1674                let now = utc::utc_now();
1675                info.time_access = now;
1676                info.pending_time_access_update = true;
1677            });
1678        }
1679    }
1680
    /// Reads the target of the symlink at this node.
    ///
    /// Reading a link counts as an access, so the node's atime is refreshed
    /// first (subject to the mount's NOATIME flag — see `update_atime`).
    pub fn readlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<SymlinkTarget, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.update_atime();
        self.entry.node.readlink(locked, current_task)
    }
1692
1693    pub fn notify(&self, event_mask: InotifyMask) {
1694        if self.mount.is_some() {
1695            self.entry.notify(event_mask);
1696        }
1697    }
1698
    /// Check whether the node can be accessed in the current context with the specified access
    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
    /// owner or is in the file's group.
    pub fn check_access<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        permission_flags: impl Into<security::PermissionFlags>,
        reason: CheckAccessReason,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        // Forward the mount as well so the node-level check can account for
        // mount flags where relevant.
        self.entry.node.check_access(
            locked,
            current_task,
            &self.mount,
            permission_flags,
            reason,
            self,
        )
    }
1721
    /// Checks if O_NOATIME is allowed. Delegates to the underlying node.
    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
        self.entry.node.check_o_noatime_allowed(current_task)
    }
1726
    /// Truncates the underlying node to `length` bytes and, on success, emits
    /// an `IN_MODIFY` inotify event on the entry.
    pub fn truncate<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        length: u64,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<BeforeFsNodeAppend>,
    {
        self.entry.node.truncate(locked, current_task, &self.mount, length)?;
        self.entry.notify_ignoring_excl_unlink(InotifyMask::MODIFY);
        Ok(())
    }
1740}
1741
// Debug output includes the node's full (chroot-escaping) path alongside the
// raw mount and entry, for easier diagnostics.
impl fmt::Debug for NamespaceNode {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("NamespaceNode")
            .field("path", &self.path_escaping_chroot())
            .field("mount", &self.mount)
            .field("entry", &self.entry)
            .finish()
    }
}
1751
1752// Eq/Hash impls intended for the MOUNT_POINTS hash
1753impl PartialEq for NamespaceNode {
1754    fn eq(&self, other: &Self) -> bool {
1755        self.mount.as_ref().map(Arc::as_ptr).eq(&other.mount.as_ref().map(Arc::as_ptr))
1756            && Arc::ptr_eq(&self.entry, &other.entry)
1757    }
1758}
1759impl Eq for NamespaceNode {}
1760impl Hash for NamespaceNode {
1761    fn hash<H: Hasher>(&self, state: &mut H) {
1762        self.mount.as_ref().map(Arc::as_ptr).hash(state);
1763        Arc::as_ptr(&self.entry).hash(state);
1764    }
1765}
1766
/// A namespace node that keeps the underlying mount busy.
#[derive(Debug, Clone)]
pub struct ActiveNamespaceNode {
    /// The underlying namespace node.
    name: NamespaceNode,

    /// Adds a reference to the mount client marker to prevent the mount from
    /// being removed while the NamespaceNode is active. Is None iff mount is
    /// None.
    _marker: Option<MountClientMarker>,
}
1778
1779impl ActiveNamespaceNode {
1780    pub fn new(name: NamespaceNode) -> Self {
1781        let marker = name.mount.as_ref().map(|mount| mount.active_client_counter.clone());
1782        Self { name, _marker: marker }
1783    }
1784
1785    pub fn to_passive(&self) -> NamespaceNode {
1786        self.deref().clone()
1787    }
1788
1789    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
1790        if let Some(mode) = mode {
1791            self.entry.node.write_guard_state.lock().acquire(mode)?;
1792        }
1793        Ok(Arc::new(FileMapping { name: self, mode }))
1794    }
1795}
1796
// An ActiveNamespaceNode transparently exposes the NamespaceNode it wraps.
impl Deref for ActiveNamespaceNode {
    type Target = NamespaceNode;

    fn deref(&self) -> &Self::Target {
        &self.name
    }
}
1804
1805impl PartialEq for ActiveNamespaceNode {
1806    fn eq(&self, other: &Self) -> bool {
1807        self.deref().eq(other.deref())
1808    }
1809}
1810impl Eq for ActiveNamespaceNode {}
1811impl Hash for ActiveNamespaceNode {
1812    fn hash<H: Hasher>(&self, state: &mut H) {
1813        self.deref().hash(state)
1814    }
1815}
1816
/// Bookkeeping for a mapped file: holds the file's [`ActiveNamespaceNode`]
/// (keeping its mount busy) and, optionally, the write-guard mode acquired
/// when the mapping was created (released again in `Drop`).
//
// NOTE(review): this type derives `Clone` while `Drop` releases the write
// guard; a clone made outside the `Arc` returned by `into_mapping` would
// release `mode` twice — confirm clones never escape the `Arc`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[must_use]
pub struct FileMapping {
    pub name: ActiveNamespaceNode,
    // Set iff a guard was acquired in `ActiveNamespaceNode::into_mapping`.
    mode: Option<FileWriteGuardMode>,
}
1823
1824impl Drop for FileMapping {
1825    fn drop(&mut self) {
1826        if let Some(mode) = self.mode {
1827            self.name.entry.node.write_guard_state.lock().release(mode);
1828        }
1829    }
1830}
1831
/// Tracks all mounts, keyed by mount point.
pub struct Mounts {
    // Maps a mountpoint directory entry (held weakly) to the list of mounts
    // attached at that entry.
    mounts: RcuHashMap<WeakKey<DirEntry>, Vec<ArcKey<Mount>>>,
}
1836
impl Mounts {
    /// Creates an empty mount registry.
    pub fn new() -> Self {
        Mounts { mounts: RcuHashMap::default() }
    }

    /// Registers the mount in the global mounts map.
    fn register_mount(&self, dir_entry: &Arc<DirEntry>, mount: MountHandle) -> Submount {
        let mut mounts = self.mounts.lock();
        let key = WeakKey::from(dir_entry);
        // `get` yields an owned copy of the current list; the updated list is
        // written back below with `insert` (copy-on-write under the lock).
        let mut vec = mounts.get(&key).unwrap_or_else(|| {
            // First mount on this entry: flag the entry so it is known to
            // carry mounts.
            dir_entry.set_has_mounts(true);
            Vec::new()
        });
        vec.push(ArcKey(mount.clone()));
        mounts.insert(key, vec);
        // The returned RAII object unregisters the mount when dropped.
        Submount { dir: ArcKey(dir_entry.clone()), mount }
    }

    /// Unregisters the mount.  This is called by `Submount::drop`.
    fn unregister_mount(&self, dir_entry: &Arc<DirEntry>, mount: &MountHandle) {
        let mut mounts = self.mounts.lock();
        let key = WeakKey::from(dir_entry);
        if let Some(mut vec) = mounts.get(&key) {
            // The mount must be present because `register_mount` added it.
            let index = vec.iter().position(|e| e == ArcKey::ref_cast(mount)).unwrap();
            if vec.len() == 1 {
                // Last mount on this entry: remove the map entry and clear the
                // entry's has-mounts flag.
                mounts.remove(&key);
                dir_entry.set_has_mounts(false);
            } else {
                // O(1) removal; list order is presumably insignificant here —
                // nothing in this file depends on it.
                vec.swap_remove(index);
                mounts.insert(key, vec);
            }
        }
    }

    /// Unmounts all mounts associated with `dir_entry`.  This is called when `dir_entry` is
    /// unlinked (which would normally result in EBUSY, but not if it isn't mounted in the local
    /// namespace).
    pub fn unmount(&self, dir_entry: &DirEntry) {
        // Lookup is by raw pointer identity of the entry.
        let mounts = self.mounts.lock().remove(&PtrKey::from(dir_entry as *const _));
        if let Some(mounts) = mounts {
            for mount in mounts {
                // Ignore errors.
                let _ = mount.unmount(UnmountFlags::DETACH);
            }
        }
    }

    /// Drain mounts. For each drained mount, force a FileSystem unmount.
    // TODO(https://fxbug.dev/295073633): Graceful shutdown should try to first unmount the mounts
    // and only force a FileSystem unmount on failure.
    pub fn clear(&self) {
        for (_dir_entry, mounts) in self.mounts.lock().drain() {
            for mount in mounts {
                mount.fs.force_unmount_ops();
            }
        }
    }

    /// Syncs every filesystem that currently has at least one mount.
    ///
    /// Failures are logged and do not abort syncing the remaining filesystems.
    pub fn sync_all(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
    ) -> Result<(), Errno> {
        let mut filesystems = Vec::new();
        {
            // Collect the set of distinct filesystems (deduplicated by
            // pointer) under an RCU read scope, then sync outside of it.
            let scope = RcuReadScope::new();
            let mut seen = HashSet::new();
            for (_dir_entry, m_list) in self.mounts.iter(&scope) {
                for m in m_list {
                    if seen.insert(Arc::as_ptr(&m.fs)) {
                        filesystems.push(m.fs.clone());
                    }
                }
            }
        }

        for fs in filesystems {
            if let Err(e) = fs.sync(locked, current_task) {
                log_warn!("sync failed for filesystem {:?}: {:?}", fs.name(), e);
            }
        }
        Ok(())
    }
}
1921
/// A RAII object that unregisters a mount when dropped.
#[derive(Debug)]
struct Submount {
    // The mountpoint directory entry the mount is attached to.
    dir: ArcKey<DirEntry>,
    // The mount attached at `dir`.
    mount: MountHandle,
}
1928
1929impl Drop for Submount {
1930    fn drop(&mut self) {
1931        self.mount.fs.kernel.upgrade().unwrap().mounts.unregister_mount(&self.dir, &self.mount)
1932    }
1933}
1934
1935/// Submount is stored in a mount's submounts hash set, which is keyed by the mountpoint.
1936impl Eq for Submount {}
1937impl PartialEq<Self> for Submount {
1938    fn eq(&self, other: &Self) -> bool {
1939        self.dir == other.dir
1940    }
1941}
1942impl Hash for Submount {
1943    fn hash<H: Hasher>(&self, state: &mut H) {
1944        self.dir.hash(state)
1945    }
1946}
1947
// Lets a Submount be looked up in a hash set by its mountpoint entry alone,
// without constructing a full Submount.
impl Borrow<ArcKey<DirEntry>> for Submount {
    fn borrow(&self) -> &ArcKey<DirEntry> {
        &self.dir
    }
}
1953
1954#[cfg(test)]
1955mod test {
1956    use crate::fs::tmpfs::TmpFs;
1957    use crate::testing::spawn_kernel_and_run;
1958    use crate::vfs::namespace::DeviceId;
1959    use crate::vfs::{
1960        CallbackSymlinkNode, FsNodeInfo, LookupContext, MountInfo, Namespace, NamespaceNode,
1961        RenameFlags, SymlinkMode, SymlinkTarget, UnlinkKind, WhatToMount,
1962    };
1963    use starnix_uapi::mount_flags::MountpointFlags;
1964    use starnix_uapi::{errno, mode};
1965    use std::sync::Arc;
1966
    // Verifies basic namespace traversal across a mount boundary: after
    // mounting a tmpfs on /dev, parent() from /dev/pts reaches the mounted
    // /dev, and parent() from /dev reaches the namespace root.
    #[::fuchsia::test]
    async fn test_namespace() {
        spawn_kernel_and_run(async |locked, current_task| {
            // Root tmpfs with a "dev" directory; second tmpfs with "pts".
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
                .expect("failed to mount dev root node");

            // Re-resolve /dev after the mount, then descend into pts.
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            let pts_parent =
                pts.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of pts");
            assert!(Arc::ptr_eq(&pts_parent.entry, &dev.entry));

            // parent() of the mounted /dev crosses back to the root entry.
            let dev_parent =
                dev.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of dev");
            assert!(Arc::ptr_eq(&dev_parent.entry, &ns.root().entry));
        })
        .await;
    }
2010
    // Verifies that a NamespaceNode resolved *before* a mount is not silently
    // upgraded to the mounted filesystem's root: the old node keeps pointing
    // at the covered entry, and lookups through it fail.
    #[::fuchsia::test]
    async fn test_mount_does_not_upgrade() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
                .expect("failed to mount dev root node");
            let mut context = LookupContext::default();
            let new_dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev again");
            // Resolving after the mount yields a different entry and node.
            assert!(!Arc::ptr_eq(&dev.entry, &new_dev.entry));
            assert_ne!(&dev, &new_dev);

            // "pts" exists under the mounted fs (new_dev) but not under the
            // pre-mount node (dev).
            let mut context = LookupContext::default();
            let _new_pts = new_dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            let mut context = LookupContext::default();
            assert!(dev.lookup_child(locked, &current_task, &mut context, "pts".into()).is_err());
        })
        .await;
    }
2051
    // Verifies path_escaping_chroot() output for the namespace root, a
    // mountpoint, and a node inside a mounted filesystem.
    #[::fuchsia::test]
    async fn test_path() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
                .expect("failed to mount dev root node");

            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");

            // Paths are rendered relative to the namespace root, crossing the
            // mount boundary at /dev transparently.
            assert_eq!("/", ns.root().path_escaping_chroot());
            assert_eq!("/dev", dev.path_escaping_chroot());
            assert_eq!("/dev/pts", pts.path_escaping_chroot());
        })
        .await;
    }
2092
    // Verifies mount shadowing: a second mount on the same mountpoint shadows
    // the first in the current namespace, while a namespace cloned before the
    // second mount still resolves to the first.
    #[::fuchsia::test]
    async fn test_shadowing() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // First mount on /foo resolves to foofs1's root.
            let foofs1 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs1.clone()), MountpointFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // Snapshot the namespace before adding the shadowing mount.
            let ns_clone = ns.clone_namespace();

            // Second mount shadows the first in `ns`.
            let foofs2 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs2.clone()), MountpointFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs2.root()
            ));

            // The clone, made before the second mount, still sees foofs1.
            assert!(Arc::ptr_eq(
                &ns_clone
                    .root()
                    .lookup_child(
                        locked,
                        &current_task,
                        &mut LookupContext::default(),
                        "foo".into()
                    )
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
        })
        .await;
    }
2146
    // Verifies that unlinking a directory is refused with EBUSY only in the
    // namespace where it is a mountpoint; another namespace sharing the same
    // filesystem can unlink it, after which it is gone everywhere.
    #[::fuchsia::test]
    async fn test_unlink_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            // Two namespaces over the same root filesystem.
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountpointFlags::empty()).unwrap();

            // Trying to unlink from ns1 should fail.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(EBUSY),
            );

            // But unlinking from ns2 should succeed.
            ns2.root()
                .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                .expect("unlink failed");

            // And it should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(ENOENT),
            );
        })
        .await;
    }
2185
    // Verifies that rename involving a mounted directory fails with EBUSY in
    // the namespace where the mount exists (in either direction), while a
    // namespace without the mount can rename both the mounted directory and
    // over it.
    #[::fuchsia::test]
    async fn test_rename_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            // Two namespaces over the same root filesystem.
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let _bar_node = root_fs.root().create_dir(locked, &current_task, "bar".into()).unwrap();
            let _baz_node = root_fs.root().create_dir(locked, &current_task, "baz".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountpointFlags::empty()).unwrap();

            // Trying to rename over foo from ns1 should fail.
            let root = ns1.root();
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "bar".into(),
                    &root,
                    "foo".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );
            // Likewise the other way.
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "foo".into(),
                    &root,
                    "bar".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );

            // But renaming from ns2 should succeed.
            let root = ns2.root();

            // First rename the directory with the mount.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "foo".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // Renaming over a directory with a mount should also work.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "baz".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // "foo" and "baz" should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "baz".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
        })
        .await;
    }
2276
    /// Symlinks which need to be traversed across types (nodes and paths), as well as across
    /// owning directories, can be tricky to get right.
    #[::fuchsia::test]
    async fn test_lookup_with_symlink_chain() {
        spawn_kernel_and_run(async |locked, current_task| {
            // Set up the root filesystem
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _first_subdir_node = root_node
                .create_dir(locked, &current_task, "first_subdir".into())
                .expect("failed to mkdir dev");
            let _second_subdir_node = root_node
                .create_dir(locked, &current_task, "second_subdir".into())
                .expect("failed to mkdir dev");

            // Set up two subdirectories under the root filesystem
            let first_subdir_fs = TmpFs::new_fs(locked, &kernel);
            let second_subdir_fs = TmpFs::new_fs(locked, &kernel);

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let first_subdir = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "first_subdir".into())
                .expect("failed to lookup first_subdir");
            first_subdir
                .mount(WhatToMount::Fs(first_subdir_fs), MountpointFlags::empty())
                .expect("failed to mount first_subdir fs node");
            let second_subdir = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "second_subdir".into())
                .expect("failed to lookup second_subdir");
            second_subdir
                .mount(WhatToMount::Fs(second_subdir_fs), MountpointFlags::empty())
                .expect("failed to mount second_subdir fs node");

            // Create the symlink structure. To trigger potential symlink traversal bugs, we're going
            // for the following directory structure:
            // / (root)
            //     + first_subdir/
            //         - real_file
            //         - path_symlink (-> real_file)
            //     + second_subdir/
            //         - node_symlink (-> path_symlink)
            let real_file_node = first_subdir
                .create_node(
                    locked,
                    &current_task,
                    "real_file".into(),
                    mode!(IFREG, 0o777),
                    DeviceId::NONE,
                )
                .expect("failed to create real_file");
            first_subdir
                .create_symlink(locked, &current_task, "path_symlink".into(), "real_file".into())
                .expect("failed to create path_symlink");

            // Resolve path_symlink itself (NoFollow) so it can serve as the
            // target node of the second symlink.
            let mut no_follow_lookup_context = LookupContext::new(SymlinkMode::NoFollow);
            let path_symlink_node = first_subdir
                .lookup_child(
                    locked,
                    &current_task,
                    &mut no_follow_lookup_context,
                    "path_symlink".into(),
                )
                .expect("Failed to lookup path_symlink");

            // The second symlink needs to be of type SymlinkTarget::Node in order to trip the sensitive
            // code path. There's no easy method for creating this type of symlink target, so we'll need
            // to construct a node from scratch and insert it into the directory manually.
            let node_symlink_node = second_subdir.entry.node.fs().create_node_and_allocate_node_id(
                CallbackSymlinkNode::new(move || {
                    let node = path_symlink_node.clone();
                    Ok(SymlinkTarget::Node(node))
                }),
                FsNodeInfo::new(mode!(IFLNK, 0o777), current_task.current_fscred()),
            );
            second_subdir
                .entry
                .create_entry(
                    locked,
                    &current_task,
                    &MountInfo::detached(),
                    "node_symlink".into(),
                    move |_locked, _dir, _mount, _name| Ok(node_symlink_node),
                )
                .expect("failed to create node_symlink entry");

            // Finally, exercise the lookup under test.
            let mut follow_lookup_context = LookupContext::new(SymlinkMode::Follow);
            let node_symlink_resolution = second_subdir
                .lookup_child(
                    locked,
                    &current_task,
                    &mut follow_lookup_context,
                    "node_symlink".into(),
                )
                .expect("lookup with symlink chain failed");

            // The lookup resolution should have correctly followed the symlinks to the real_file node.
            assert!(node_symlink_resolution.entry.node.ino == real_file_node.entry.node.ino);
        })
        .await;
    }
2382}