// starnix_core/vfs/namespace.rs
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

5use crate::mutable_state::{state_accessor, state_implementation};
6use crate::security;
7use crate::task::{CurrentTask, EventHandler, Kernel, Task, WaitCanceler, Waiter};
8use crate::time::utc;
9use crate::vfs::fs_registry::FsRegistry;
10use crate::vfs::pseudo::dynamic_file::{DynamicFile, DynamicFileBuf, DynamicFileSource};
11use crate::vfs::pseudo::simple_file::SimpleFileNode;
12use crate::vfs::socket::{SocketAddress, SocketHandle, UnixSocket};
13use crate::vfs::{
14    CheckAccessReason, DirEntry, DirEntryHandle, FileHandle, FileObject, FileOps, FileSystemHandle,
15    FileSystemOptions, FileWriteGuardMode, FsContext, FsNode, FsNodeHandle, FsNodeOps, FsStr,
16    FsString, PathBuilder, RenameFlags, SymlinkTarget, UnlinkKind, fileops_impl_dataless,
17    fileops_impl_delegate_read_write_and_seek, fileops_impl_nonseekable, fileops_impl_noop_sync,
18    fs_node_impl_not_dir,
19};
20use fuchsia_rcu::RcuReadScope;
21use macro_rules_attribute::apply;
22use ref_cast::RefCast;
23use starnix_logging::log_warn;
24use starnix_rcu::RcuHashMap;
25use starnix_sync::{
26    BeforeFsNodeAppend, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, Mutex, RwLock, Unlocked,
27};
28use starnix_types::ownership::WeakRef;
29use starnix_uapi::arc_key::{ArcKey, PtrKey, WeakKey};
30use starnix_uapi::auth::UserAndOrGroupId;
31use starnix_uapi::device_id::DeviceId;
32use starnix_uapi::errors::Errno;
33use starnix_uapi::file_mode::{AccessCheck, FileMode};
34use starnix_uapi::inotify_mask::InotifyMask;
35use starnix_uapi::mount_flags::{
36    AtomicMountpointFlags, FileSystemFlags, MountFlags, MountpointFlags,
37};
38use starnix_uapi::open_flags::OpenFlags;
39use starnix_uapi::unmount_flags::UnmountFlags;
40use starnix_uapi::vfs::{FdEvents, ResolveFlags};
41use starnix_uapi::{NAME_MAX, errno, error};
42use std::borrow::Borrow;
43use std::collections::HashSet;
44use std::fmt;
45use std::hash::{Hash, Hasher};
46use std::ops::{Deref, DerefMut};
47use std::sync::atomic::Ordering;
48use std::sync::{Arc, Weak};
49
/// A mount namespace.
///
/// The namespace records at which entries filesystems are mounted.
#[derive(Debug)]
pub struct Namespace {
    /// The mount at the root of this namespace's mount tree.
    root_mount: MountHandle,

    // Unique ID of this namespace.
    pub id: u64,
}
60
impl Namespace {
    /// Creates a namespace whose root is a mount of `fs` with empty mount flags.
    pub fn new(fs: FileSystemHandle) -> Arc<Namespace> {
        Self::new_with_flags(fs, MountpointFlags::empty())
    }

    /// Creates a namespace whose root is a mount of `fs` with the given mount flags.
    pub fn new_with_flags(fs: FileSystemHandle, flags: MountpointFlags) -> Arc<Namespace> {
        let kernel = fs.kernel.upgrade().expect("can't create namespace without a kernel");
        let root_mount = Mount::new(WhatToMount::Fs(fs), flags);
        Arc::new(Self { root_mount, id: kernel.get_next_namespace_id() })
    }

    /// Returns a `NamespaceNode` for the root of this namespace's mount tree.
    pub fn root(&self) -> NamespaceNode {
        self.root_mount.root()
    }

    /// Creates a deep copy of this namespace: the whole mount tree is cloned recursively and the
    /// copy gets a fresh namespace id.
    pub fn clone_namespace(&self) -> Arc<Namespace> {
        let kernel =
            self.root_mount.fs.kernel.upgrade().expect("can't clone namespace without a kernel");
        Arc::new(Self {
            root_mount: self.root_mount.clone_mount_recursive(),
            id: kernel.get_next_namespace_id(),
        })
    }

    /// Assuming new_ns is a clone of the namespace that node is from, return the equivalent of
    /// node in new_ns. If this assumption is violated, returns None.
    pub fn translate_node(mut node: NamespaceNode, new_ns: &Namespace) -> Option<NamespaceNode> {
        // Collect the list of mountpoints that leads to this node's mount, walking upward from
        // the node's mount toward the root of its original namespace.
        let mut mountpoints = vec![];
        let mut mount = node.mount;
        while let Some(mountpoint) = mount.as_ref().and_then(|m| m.read().mountpoint()) {
            mountpoints.push(mountpoint.entry);
            mount = mountpoint.mount;
        }

        // Follow the same path in the new namespace, from the root downward. If any step has no
        // matching submount in `new_ns`, the two namespaces were not clones: bail with None.
        let mut mount = Arc::clone(&new_ns.root_mount);
        for mountpoint in mountpoints.iter().rev() {
            let next_mount =
                mount.read().submounts.get(ArcKey::ref_cast(mountpoint))?.mount.clone();
            mount = next_mount;
        }
        node.mount = Some(mount).into();
        Some(node)
    }
}
107
/// Lets a mount namespace be opened as a file node; the resulting file object holds a strong
/// reference to the namespace, keeping it alive while open.
impl FsNodeOps for Arc<Namespace> {
    fs_node_impl_not_dir!();

    fn create_file_ops(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _node: &FsNode,
        _current_task: &CurrentTask,
        _flags: OpenFlags,
    ) -> Result<Box<dyn FileOps>, Errno> {
        Ok(Box::new(MountNamespaceFile(self.clone())))
    }
}
121
/// File object state that holds a strong reference to a mount namespace.
pub struct MountNamespaceFile(pub Arc<Namespace>);
123
impl FileOps for MountNamespaceFile {
    // The file carries no data and no seek position; it exists only so that an open handle keeps
    // the namespace alive.
    fileops_impl_nonseekable!();
    fileops_impl_dataless!();
    fileops_impl_noop_sync!();
}
129
/// An empty struct that we use to track the number of active clients for a mount.
///
/// Each active client takes a reference to this object. The unmount operation fails
/// if there are any active clients of the mount.
/// (The client count is `Arc::strong_count` minus the mount's own reference.)
type MountClientMarker = Arc<()>;
135
/// An instance of a filesystem mounted in a namespace.
///
/// At a mount, path traversal switches from one filesystem to another.
/// The client sees a composed directory structure that glues together the
/// directories from the underlying FsNodes from those filesystems.
///
/// The mounts in a namespace form a mount tree, with `mountpoint` pointing to the parent and
/// `submounts` pointing to the children.
pub struct Mount {
    /// The directory entry at the root of this mount.
    root: DirEntryHandle,
    /// The filesystem instance this mount exposes.
    fs: FileSystemHandle,

    /// Holds the flags specific to this mount of the underlying filesystem.
    flags: AtomicMountpointFlags,

    /// Lock used to serialize updates of `flags` to ensure consistency during remount operations.
    flags_lock: Mutex<()>,

    /// A unique identifier for this mount reported in /proc/pid/mountinfo.
    id: u64,

    /// A count of the number of active clients.
    active_client_counter: MountClientMarker,

    // Lock ordering: mount -> submount
    state: RwLock<MountState>,
    // Mount used to contain a Weak<Namespace>. It no longer does because since the mount point
    // hash was moved from Namespace to Mount, nothing actually uses it. Now that
    // Namespace::clone_namespace() is implemented in terms of Mount::clone_mount_recursive, it
    // won't be trivial to add it back. I recommend turning the mountpoint field into an enum of
    // Mountpoint or Namespace, maybe called "parent", and then traverse up to the top of the tree
    // if you need to find a Mount's Namespace.
}

/// Shared handle to a `Mount`; mounts are always reference-counted.
type MountHandle = Arc<Mount>;
170
/// Public representation of the mount options.
#[derive(Clone, Debug)]
pub struct MountInfo {
    // `None` means the node is not associated with any mount (a "detached" node).
    handle: Option<MountHandle>,
}
176
177impl MountInfo {
178    /// `MountInfo` for a element that is not tied to a given mount. Mount flags will be considered
179    /// empty.
180    pub fn detached() -> Self {
181        None.into()
182    }
183
184    /// The mount flags of the represented mount.
185    pub fn flags(&self) -> MountFlags {
186        if let Some(handle) = &self.handle {
187            handle.flags()
188        } else {
189            // Consider not mounted node have the NOATIME flags.
190            MountFlags::NOATIME
191        }
192    }
193
194    /// Checks whether this `MountInfo` represents a writable file system mount.
195    pub fn check_readonly_filesystem(&self) -> Result<(), Errno> {
196        if self.flags().contains(MountFlags::RDONLY) {
197            return error!(EROFS);
198        }
199        Ok(())
200    }
201
202    /// Checks whether this `MountInfo` represents an executable file system mount.
203    pub fn check_noexec_filesystem(&self) -> Result<(), Errno> {
204        if self.flags().contains(MountFlags::NOEXEC) {
205            return error!(EACCES);
206        }
207        Ok(())
208    }
209}
210
/// `MountInfo` dereferences to its underlying `Option<MountHandle>` so callers can use
/// `Option` combinators directly on it.
impl Deref for MountInfo {
    type Target = Option<MountHandle>;

    fn deref(&self) -> &Self::Target {
        &self.handle
    }
}

impl DerefMut for MountInfo {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.handle
    }
}
224
225impl std::cmp::PartialEq for MountInfo {
226    fn eq(&self, other: &Self) -> bool {
227        self.handle.as_ref().map(Arc::as_ptr) == other.handle.as_ref().map(Arc::as_ptr)
228    }
229}
230
231impl std::cmp::Eq for MountInfo {}
232
233impl Into<MountInfo> for Option<MountHandle> {
234    fn into(self) -> MountInfo {
235        MountInfo { handle: self }
236    }
237}
238
/// Mutable portion of a `Mount`, guarded by the mount's `state` lock.
#[derive(Default)]
pub struct MountState {
    /// The namespace node that this mount is mounted on. This is a tuple instead of a
    /// NamespaceNode because the Mount pointer has to be weak because this is the pointer to the
    /// parent mount, the parent has a pointer to the children too, and making both strong would be
    /// a cycle.
    mountpoint: Option<(Weak<Mount>, DirEntryHandle)>,

    // The set is keyed by the mountpoints which are always descendants of this mount's root.
    // Conceptually, the set is more akin to a map: `DirEntry -> MountHandle`, but we use a set
    // instead because `Submount` has a drop implementation that needs both the key and value.
    //
    // Each directory entry can only have one mount attached. Mount shadowing works by using the
    // root of the inner mount as a mountpoint. For example, if filesystem A is mounted at /foo,
    // mounting filesystem B on /foo will create the mount as a child of the A mount, attached to
    // A's root, instead of the root mount.
    submounts: HashSet<Submount>,

    /// The membership of this mount in its peer group. Do not access directly. Instead use
    /// peer_group(), take_from_peer_group(), and set_peer_group().
    // TODO(tbodt): Refactor the links into, some kind of extra struct or something? This is hard
    // because setting this field requires the Arc<Mount>.
    peer_group_: Option<(Arc<PeerGroup>, PtrKey<Mount>)>,
    /// The membership of this mount in a PeerGroup's downstream. Do not access directly. Instead
    /// use upstream(), take_from_upstream(), and set_upstream().
    upstream_: Option<(Weak<PeerGroup>, PtrKey<Mount>)>,
}
266
/// A group of mounts. Setting MS_SHARED on a mount puts it in its own peer group. Any bind mounts
/// of a mount in the group are also added to the group. A mount created in any mount in a peer
/// group will be automatically propagated (recreated) in every other mount in the group.
#[derive(Default)]
struct PeerGroup {
    /// Unique id of this peer group.
    id: u64,
    state: RwLock<PeerGroupState>,
}

#[derive(Default)]
struct PeerGroupState {
    /// Members of the group. Weak: a mount owns its group membership, not vice versa.
    mounts: HashSet<WeakKey<Mount>>,
    /// Mounts that receive propagation from this group (see `make_downstream`, i.e. MS_SLAVE).
    downstream: HashSet<WeakKey<Mount>>,
}
280
/// What gets attached at a mountpoint: a filesystem's root, or an existing namespace node
/// (a bind mount).
pub enum WhatToMount {
    Fs(FileSystemHandle),
    Bind(NamespaceNode),
}
285
impl Mount {
    /// Creates a mount of `what`: a fresh mount of a filesystem's root, or a bind mount cloned
    /// from the mount of an existing namespace node.
    pub fn new(what: WhatToMount, flags: MountpointFlags) -> MountHandle {
        match what {
            WhatToMount::Fs(fs) => Self::new_with_root(fs.root().clone(), flags),
            WhatToMount::Bind(node) => {
                let mount = node.mount.as_ref().expect("can't bind mount from an anonymous node");
                mount.clone_mount(&node.entry, flags.into())
            }
        }
    }

    /// Builds a new `Mount` rooted at `root`, with a fresh mount id from the kernel.
    fn new_with_root(root: DirEntryHandle, flags: MountpointFlags) -> MountHandle {
        let fs = root.node.fs();
        let kernel = fs.kernel.upgrade().expect("can't create mount without kernel");
        Arc::new(Self {
            id: kernel.get_next_mount_id(),
            // Only the per-mount subset of the flags is stored on the mount itself.
            flags: (flags & MountpointFlags::STORED_ON_MOUNT).into(),
            flags_lock: Mutex::new(()),
            root,
            active_client_counter: Default::default(),
            fs,
            state: Default::default(),
        })
    }

    /// A namespace node referring to the root of the mount.
    pub fn root(self: &MountHandle) -> NamespaceNode {
        NamespaceNode::new(Arc::clone(self), Arc::clone(&self.root))
    }

    /// Create the specified mount as a child. Also propagate it to the mount's peer group.
    fn create_submount(
        self: &MountHandle,
        dir: &DirEntryHandle,
        what: WhatToMount,
        flags: MountpointFlags,
    ) {
        // TODO(tbodt): Making a copy here is necessary for lock ordering, because the peer group
        // lock nests inside all mount locks (it would be impractical to reverse this because you
        // need to lock a mount to get its peer group.) But it opens the door to race conditions
        // where if a peer are concurrently being added, the mount might not get propagated to the
        // new peer. The only true solution to this is bigger locks, somehow using the same lock
        // for the peer group and all of the mounts in the group. Since peer groups are fluid and
        // can have mounts constantly joining and leaving and then joining other groups, the only
        // sensible locking option is to use a single global lock for all mounts and peer groups.
        // This is almost impossible to express in rust. Help.
        //
        // Update: Also necessary to make a copy to prevent excess replication, see the comment on
        // the following Mount::new call.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        // Create the mount after copying the peer groups, because in the case of creating a bind
        // mount inside itself, the new mount would get added to our peer group during the
        // Mount::new call, but we don't want to replicate into it already. For an example see
        // MountTest.QuizBRecursion.
        let mount = Mount::new(what, flags);

        // A submount of a shared mount starts out shared too.
        if self.read().is_shared() {
            mount.write().make_shared();
        }

        // Replicate the new mount into every peer (except ourselves).
        for peer in peers {
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            let clone = mount.clone_mount_recursive();
            peer.write().add_submount_internal(dir, clone);
        }

        self.write().add_submount_internal(dir, mount)
    }

    /// Remove the submount keyed by `mount_hash_key`, propagating the unmount to this mount's
    /// peer group as described in mount_namespaces(7).
    fn remove_submount(self: &MountHandle, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
        // create_submount explains why we need to make a copy of peers.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        for peer in peers {
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            // mount_namespaces(7): If B is shared, then all most-recently-mounted mounts at b on
            // mounts that receive propagation from mount B and do not have submounts under them are
            // unmounted.
            let mut peer = peer.write();
            if let Some(submount) = peer.submounts.get(mount_hash_key) {
                if !submount.mount.read().submounts.is_empty() {
                    continue;
                }
            }
            let _ = peer.remove_submount_internal(mount_hash_key);
        }

        self.write().remove_submount_internal(mount_hash_key)
    }

    /// Create a new mount with the same filesystem, flags, and peer group. Used to implement bind
    /// mounts.
    fn clone_mount(
        self: &MountHandle,
        new_root: &DirEntryHandle,
        flags: MountFlags,
    ) -> MountHandle {
        assert!(new_root.is_descendant_of(&self.root));
        // According to mount(2) on bind mounts, all flags other than MS_REC are ignored when doing
        // a bind mount.
        let clone = Self::new_with_root(Arc::clone(new_root), self.mount_flags());

        if flags.contains(MountFlags::REC) {
            // This is two steps because the alternative (locking clone.state while iterating over
            // self.state.submounts) trips tracing_mutex. The lock ordering is parent -> child, and
            // if the clone is eventually made a child of self, this looks like an ordering
            // violation. I'm not convinced it's a real issue, but I can't convince myself it's not
            // either.
            let mut submounts = vec![];
            for Submount { dir, mount } in &self.state.read().submounts {
                submounts.push((dir.clone(), mount.clone_mount_recursive()));
            }
            let mut clone_state = clone.write();
            for (dir, submount) in submounts {
                clone_state.add_submount_internal(&dir, submount);
            }
        }

        // Put the clone in the same peer group
        let peer_group = self.state.read().peer_group().map(Arc::clone);
        if let Some(peer_group) = peer_group {
            clone.write().set_peer_group(peer_group);
        }

        clone
    }

    /// Do a clone of the full mount hierarchy below this mount. Used for creating mount
    /// namespaces and creating copies to use for propagation.
    fn clone_mount_recursive(self: &MountHandle) -> MountHandle {
        self.clone_mount(&self.root, MountFlags::REC)
    }

    /// Changes this mount's propagation type (MS_SHARED / MS_PRIVATE / MS_SLAVE), optionally
    /// applying the change to all submounts as well. Unknown flags are logged and ignored.
    pub fn change_propagation(self: &MountHandle, flag: MountFlags, recursive: bool) {
        let mut state = self.write();
        match flag {
            MountFlags::SHARED => state.make_shared(),
            MountFlags::PRIVATE => state.make_private(),
            MountFlags::DOWNSTREAM => state.make_downstream(),
            _ => {
                log_warn!("mount propagation {:?}", flag);
                return;
            }
        }

        if recursive {
            for submount in &state.submounts {
                submount.mount.change_propagation(flag, recursive);
            }
        }
    }

    /// Returns the effective flags for the `Mount`, calculated as the union of the mount flags
    /// associated with the `FileSystem`, and with the `Mount` itself.
    fn flags(&self) -> MountFlags {
        // TODO: https://fxbug.dev/322875215 - All `FileSystem` flags should be included here, once
        // updating superblock mount flags via `MS_REMOUNT` is implemented.
        self.mount_flags().into()
    }

    /// Returns the mount flags stored unique to this `Mount`.
    fn mount_flags(&self) -> MountpointFlags {
        self.flags.load(Ordering::Relaxed)
    }

    /// Returns the mount flags for the `FileSystem` of this `Mount`.
    fn fs_flags(&self) -> FileSystemFlags {
        self.fs.options.flags
    }

    /// Updates the `Mount` with the per-mount flags specified in `flags`, while preserving the
    /// existing access-time flag if no access-time flag is set in `flags`.
    pub fn update_flags(self: &MountHandle, mut flags: MountpointFlags) {
        let atime_flags = MountpointFlags::NOATIME
            | MountpointFlags::NODIRATIME
            | MountpointFlags::RELATIME
            | MountpointFlags::STRICTATIME;
        // Serialize concurrent remounts so the read-modify-write below is consistent.
        let _lock = self.flags_lock.lock();
        if !flags.intersects(atime_flags) {
            // Since Linux 3.17, if none of MS_NOATIME, MS_NODIRATIME,
            // MS_RELATIME, or MS_STRICTATIME is specified in mountflags, then
            // the remount operation preserves the existing values of these
            // flags (rather than defaulting to MS_RELATIME).
            flags |= self.flags.load(Ordering::Relaxed) & atime_flags;
        }
        self.flags.store(flags & MountpointFlags::STORED_ON_MOUNT, Ordering::Relaxed);
    }

    /// The number of active clients of this mount.
    ///
    /// The mount cannot be unmounted if there are any active clients.
    fn active_clients(&self) -> usize {
        // We need to subtract one for our own reference. We are not a real client.
        Arc::strong_count(&self.active_client_counter) - 1
    }

    /// Detaches this mount from its parent.
    ///
    /// Without `UnmountFlags::DETACH`, fails with EBUSY while there are active clients or
    /// submounts. Fails with EINVAL if the mount has no mountpoint (e.g. a namespace root).
    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
        if !flags.contains(UnmountFlags::DETACH) {
            if self.active_clients() > 0 || !self.state.read().submounts.is_empty() {
                return error!(EBUSY);
            }
        }
        let mountpoint = self.state.read().mountpoint().ok_or_else(|| errno!(EINVAL))?;
        let parent_mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
        parent_mount.remove_submount(mountpoint.mount_hash_key())
    }

    /// Returns the security state of the fs.
    pub fn security_state(&self) -> &security::FileSystemState {
        &self.fs.security_state
    }

    /// Returns the name of the fs.
    pub fn fs_name(&self) -> &'static FsStr {
        self.fs.name()
    }

    // Generates read()/write() accessors for the guarded `MountState`.
    state_accessor!(Mount, state, Arc<Mount>);
}
516
impl MountState {
    /// Returns true if there is a submount on top of `dir_entry`.
    pub fn has_submount(&self, dir_entry: &DirEntryHandle) -> bool {
        self.submounts.contains(ArcKey::ref_cast(dir_entry))
    }

    /// The NamespaceNode on which this Mount is mounted.
    /// Returns None for an unmounted mount, or if the parent mount has been dropped.
    fn mountpoint(&self) -> Option<NamespaceNode> {
        let (mount, entry) = self.mountpoint.as_ref()?;
        Some(NamespaceNode::new(mount.upgrade()?, entry.clone()))
    }

    /// Return this mount's current peer group.
    fn peer_group(&self) -> Option<&Arc<PeerGroup>> {
        let (group, _) = self.peer_group_.as_ref()?;
        Some(group)
    }

    /// Remove this mount from its peer group and return the peer group.
    ///
    /// If this mount was downstream of another group, the upstream link is handed over to some
    /// remaining member of the old group (if any), so propagation into the group survives.
    fn take_from_peer_group(&mut self) -> Option<Arc<PeerGroup>> {
        let (old_group, old_mount) = self.peer_group_.take()?;
        old_group.remove(old_mount);
        if let Some(upstream) = self.take_from_upstream() {
            let next_mount =
                old_group.state.read().mounts.iter().next().map(|w| w.0.upgrade().unwrap());
            if let Some(next_mount) = next_mount {
                // TODO(https://fxbug.dev/42065259): Fix the lock ordering here. We've locked next_mount
                // while self is locked, and since the propagation tree and mount tree are
                // separate, this could violate the mount -> submount order previously established.
                next_mount.write().set_upstream(upstream);
            }
        }
        Some(old_group)
    }

    /// The peer group this mount receives propagation from, if it is still alive.
    fn upstream(&self) -> Option<Arc<PeerGroup>> {
        self.upstream_.as_ref().and_then(|g| g.0.upgrade())
    }

    /// Remove this mount from its upstream peer group's downstream set and return that group.
    fn take_from_upstream(&mut self) -> Option<Arc<PeerGroup>> {
        let (old_upstream, old_mount) = self.upstream_.take()?;
        // TODO(tbodt): Reason about whether the upgrade() could possibly return None, and what we
        // should actually do in that case.
        let old_upstream = old_upstream.upgrade()?;
        old_upstream.remove_downstream(old_mount);
        Some(old_upstream)
    }
}
565
// These methods are written against the `state_implementation!` wrapper, which — judging from
// their use of `self.base` — provides access to the owning `Arc<Mount>` alongside the mutable
// state fields.
#[apply(state_implementation!)]
impl MountState<Base = Mount, BaseType = Arc<Mount>> {
    /// Add a child mount *without propagating it to the peer group*. For internal use only.
    fn add_submount_internal(&mut self, dir: &DirEntryHandle, mount: MountHandle) {
        // Ignore requests to mount outside this mount's subtree (can happen via propagation).
        if !dir.is_descendant_of(&self.base.root) {
            return;
        }

        // Register the mount with the kernel-wide mount registry before linking it in.
        let submount = mount.fs.kernel.upgrade().unwrap().mounts.register_mount(dir, mount.clone());
        let old_mountpoint =
            mount.state.write().mountpoint.replace((Arc::downgrade(self.base), Arc::clone(dir)));
        assert!(old_mountpoint.is_none(), "add_submount can only take a newly created mount");
        // Mount shadowing is implemented by mounting onto the root of the first mount, not by
        // creating two mounts on the same mountpoint.
        let old_mount = self.submounts.replace(submount);

        // In rare cases, mount propagation might result in a request to mount on a directory where
        // something is already mounted. MountTest.LotsOfShadowing will trigger this. Linux handles
        // this by inserting the new mount between the old mount and the current mount.
        if let Some(mut old_mount) = old_mount {
            // Previous state: self[dir] = old_mount
            // New state: self[dir] = new_mount, new_mount[new_mount.root] = old_mount
            // The new mount has already been inserted into self, now just update the old mount to
            // be a child of the new mount.
            old_mount.mount.write().mountpoint = Some((Arc::downgrade(&mount), Arc::clone(dir)));
            old_mount.dir = ArcKey(mount.root.clone());
            mount.write().submounts.insert(old_mount);
        }
    }

    /// Remove a child mount *without propagating to the peer group*. EINVAL if nothing is
    /// mounted at `mount_hash_key`.
    fn remove_submount_internal(&mut self, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
        if self.submounts.remove(mount_hash_key) { Ok(()) } else { error!(EINVAL) }
    }

    /// Set this mount's peer group.
    fn set_peer_group(&mut self, group: Arc<PeerGroup>) {
        // Leave any previous group first so membership is never doubled.
        self.take_from_peer_group();
        group.add(self.base);
        self.peer_group_ = Some((group, Arc::as_ptr(self.base).into()));
    }

    /// Make this mount downstream of `group`, replacing any previous upstream link.
    fn set_upstream(&mut self, group: Arc<PeerGroup>) {
        self.take_from_upstream();
        group.add_downstream(self.base);
        self.upstream_ = Some((Arc::downgrade(&group), Arc::as_ptr(self.base).into()));
    }

    /// Is the mount in a peer group? Corresponds to MS_SHARED.
    pub fn is_shared(&self) -> bool {
        self.peer_group().is_some()
    }

    /// Put the mount in a peer group. Implements MS_SHARED.
    pub fn make_shared(&mut self) {
        // Already shared: a no-op, so repeated MS_SHARED doesn't churn group ids.
        if self.is_shared() {
            return;
        }
        let kernel =
            self.base.fs.kernel.upgrade().expect("can't create new peer group without kernel");
        self.set_peer_group(PeerGroup::new(kernel.get_next_peer_group_id()));
    }

    /// Take the mount out of its peer group, also remove upstream if any. Implements MS_PRIVATE.
    pub fn make_private(&mut self) {
        self.take_from_peer_group();
        self.take_from_upstream();
    }

    /// Take the mount out of its peer group and make it downstream instead. Implements
    /// MountFlags::DOWNSTREAM (MS_SLAVE).
    pub fn make_downstream(&mut self) {
        if let Some(peer_group) = self.take_from_peer_group() {
            self.set_upstream(peer_group);
        }
    }
}
642
impl PeerGroup {
    /// Creates an empty peer group with the given id.
    fn new(id: u64) -> Arc<Self> {
        Arc::new(Self { id, state: Default::default() })
    }

    /// Records `mount` as a member of this group.
    fn add(&self, mount: &Arc<Mount>) {
        self.state.write().mounts.insert(WeakKey::from(mount));
    }

    /// Removes the member identified by pointer key `mount` (the mount may already be dropped).
    fn remove(&self, mount: PtrKey<Mount>) {
        self.state.write().mounts.remove(&mount);
    }

    /// Records `mount` as downstream (receiving propagation) of this group.
    fn add_downstream(&self, mount: &Arc<Mount>) {
        self.state.write().downstream.insert(WeakKey::from(mount));
    }

    /// Removes `mount` from the downstream set.
    fn remove_downstream(&self, mount: PtrKey<Mount>) {
        self.state.write().downstream.remove(&mount);
    }

    /// Returns every mount that should receive mounts propagated from this group.
    fn copy_propagation_targets(&self) -> Vec<MountHandle> {
        let mut buf = vec![];
        self.collect_propagation_targets(&mut buf);
        buf
    }

    /// Appends this group's members and, transitively, all downstream mounts (via their own
    /// peer groups, if any) to `buf`. Dead weak references are skipped.
    fn collect_propagation_targets(&self, buf: &mut Vec<MountHandle>) {
        // Snapshot the downstream list inside the scope so the state lock is released before
        // recursing into other peer groups.
        let downstream_mounts: Vec<_> = {
            let state = self.state.read();
            buf.extend(state.mounts.iter().filter_map(|m| m.0.upgrade()));
            state.downstream.iter().filter_map(|m| m.0.upgrade()).collect()
        };
        for mount in downstream_mounts {
            let peer_group = mount.read().peer_group().map(Arc::clone);
            match peer_group {
                Some(group) => group.collect_propagation_targets(buf),
                None => buf.push(mount),
            }
        }
    }
}
685
impl Drop for Mount {
    fn drop(&mut self) {
        // Detach from propagation structures so peer groups don't accumulate dead weak entries.
        let state = self.state.get_mut();
        state.take_from_peer_group();
        state.take_from_upstream();
    }
}
693
impl fmt::Debug for Mount {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let state = self.state.read();
        f.debug_struct("Mount")
            // NOTE(review): the "id" field prints the Mount's address, not the `self.id` counter
            // used for mountinfo — presumably intentional for identity-based debugging, but worth
            // confirming.
            .field("id", &(self as *const Mount))
            .field("root", &self.root)
            .field("mountpoint", &state.mountpoint)
            .field("submounts", &state.submounts)
            .finish()
    }
}
705
impl Kernel {
    /// Returns the next unique mount id.
    pub fn get_next_mount_id(&self) -> u64 {
        self.next_mount_id.next()
    }

    /// Returns the next unique peer group id.
    pub fn get_next_peer_group_id(&self) -> u64 {
        self.next_peer_group_id.next()
    }

    /// Returns the next unique mount namespace id.
    pub fn get_next_namespace_id(&self) -> u64 {
        self.next_namespace_id.next()
    }
}
719
impl CurrentTask {
    /// Creates a `FileSystem` of type `fs_type` via the kernel's `FsRegistry`.
    ///
    /// Returns ENODEV when no file system of that type is registered.
    pub fn create_filesystem(
        &self,
        locked: &mut Locked<Unlocked>,
        fs_type: &FsStr,
        options: FileSystemOptions,
    ) -> Result<FileSystemHandle, Errno> {
        // Please register new file systems via //src/starnix/modules/lib.rs, even if the file
        // system is implemented inside starnix_core.
        //
        // Most file systems should be implemented as modules. The VFS provides various traits that
        // let starnix_core integrate file systems without needing to depend on the file systems
        // directly.
        self.kernel()
            .expando
            .get::<FsRegistry>()
            .create(locked, self, fs_type, options)
            .ok_or_else(|| errno!(ENODEV, fs_type))?
    }
}
740
/// Backing source for `ProcMountsFile`: regenerates the mounts listing for the given task on
/// demand.
struct ProcMountsFileSource(WeakRef<Task>);
742
impl DynamicFileSource for ProcMountsFileSource {
    /// Writes one line per mount reachable from the task's root into `sink`.
    ///
    /// Fails with ESRCH-style errors if the task is gone (via `Task::from_weak`).
    fn generate(
        &self,
        _current_task: &CurrentTask,
        sink: &mut DynamicFileBuf,
    ) -> Result<(), Errno> {
        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
        // extra work to maintain it.
        let task = Task::from_weak(&self.0)?;
        let task_fs = task.live()?.fs.read();
        let root = task_fs.root();
        let ns = task_fs.namespace();
        for_each_mount(&ns.root_mount, &mut |mount| {
            // A mount with no mountpoint (e.g. the namespace root) is reported at its own root.
            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
            // Mounts outside the task's root (e.g. after chroot) are not visible to it.
            if !mountpoint.is_descendant_of(&root) {
                return Ok(());
            }
            write!(
                sink,
                "{} {} {} {}{}",
                mount.fs.options.source_for_display(),
                mountpoint.path(&task_fs),
                mount.fs.name(),
                // Report the union of the FileSystem and Mount flags, as well as any FileSystem-
                // or LSM-specific options.
                // TODO: https://fxbug.dev/322875215 - Remove the explicit fs_flags() once
                // Mount::flags() is fixed to include the filesystem flags.
                mount.flags() | (mount.fs_flags() & FileSystemFlags::RDONLY).into(),
                security::sb_show_options(&task.kernel(), &mount.fs)?,
            )?;
            writeln!(sink, " 0 0")?;
            Ok(())
        })?;
        Ok(())
    }
}
781
/// File object for `/proc/<pid>/mounts`; reads are served from a generated snapshot.
pub struct ProcMountsFile {
    // Snapshot-producing file wrapping `ProcMountsFileSource`.
    dynamic_file: DynamicFile<ProcMountsFileSource>,
}
785
786impl ProcMountsFile {
787    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
788        SimpleFileNode::new(move |_, _| {
789            Ok(Self { dynamic_file: DynamicFile::new(ProcMountsFileSource(task.clone())) })
790        })
791    }
792}
793
impl FileOps for ProcMountsFile {
    // Reads and seeks are delegated to the generated snapshot in `dynamic_file`.
    fileops_impl_delegate_read_write_and_seek!(self, self.dynamic_file);
    fileops_impl_noop_sync!();

    /// Registers a (currently fake) waiter for mount-table change notifications.
    fn wait_async(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        waiter: &Waiter,
        _events: FdEvents,
        _handler: EventHandler,
    ) -> Option<WaitCanceler> {
        // Polling this file gives notifications when any change to mounts occurs. This is not
        // implemented yet, but stubbed for Android init.
        Some(waiter.fake_wait())
    }

    /// Reports no ready events, pairing with the stubbed `wait_async` above.
    fn query_events(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
    ) -> Result<FdEvents, Errno> {
        Ok(FdEvents::empty())
    }
}
821
/// `DynamicFileSource` backing `/proc/<pid>/mountinfo`; holds a weak reference to the task
/// whose mount namespace is rendered.
#[derive(Clone)]
pub struct ProcMountinfoFile(WeakRef<Task>);
824impl ProcMountinfoFile {
825    pub fn new_node(task: WeakRef<Task>) -> impl FsNodeOps {
826        DynamicFile::new_node(Self(task))
827    }
828}
impl DynamicFileSource for ProcMountinfoFile {
    /// Writes one `/proc/<pid>/mountinfo`-style line per mount visible from the task's root.
    fn generate(
        &self,
        _current_task: &CurrentTask,
        sink: &mut DynamicFileBuf,
    ) -> Result<(), Errno> {
        // Returns path to the `dir` from the root of the file system.
        fn path_from_fs_root(dir: &DirEntryHandle) -> FsString {
            let mut path = PathBuilder::new();
            if dir.is_dead() {
                // Return `/foo/dir//deleted` if the dir was deleted.
                path.prepend_element("/deleted".into());
            }
            let scope = RcuReadScope::new();
            let mut current = dir.deref();
            // Walk parent links up to the file-system root, prepending each component.
            while let Some(parent) = current.parent_ref(&scope) {
                path.prepend_element(current.local_name(&scope));
                current = parent;
            }
            path.build_absolute()
        }

        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
        // extra work to maintain it.
        let task = Task::from_weak(&self.0)?;
        let task_fs = task.live()?.fs.read();
        let root = task_fs.root();
        let ns = task_fs.namespace();
        for_each_mount(&ns.root_mount, &mut |mount| {
            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
            // Mounts that are not visible from the task's root are skipped, not errors.
            if !mountpoint.is_descendant_of(&root) {
                return Ok(());
            }
            // Can't fail, mountpoint() and root() can't return a NamespaceNode with no mount
            let parent = mountpoint.mount.as_ref().unwrap();
            write!(
                sink,
                "{} {} {} {} {} {}",
                mount.id,
                parent.id,
                mount.root.node.fs().dev_id,
                path_from_fs_root(&mount.root),
                mountpoint.path(&task_fs),
                mount.mount_flags(),
            )?;
            // Optional fields: shared peer group id and propagate-from ("master") mount id.
            if let Some(peer_group) = mount.read().peer_group() {
                write!(sink, " shared:{}", peer_group.id)?;
            }
            if let Some(upstream) = mount.read().upstream() {
                write!(sink, " master:{}", upstream.id)?;
            }
            writeln!(
                sink,
                " - {} {} {}{}",
                mount.fs.name(),
                mount.fs.options.source_for_display(),
                mount.fs_flags(),
                // LSM options are associated with the FileSystem rather than the Mount.
                security::sb_show_options(&task.kernel(), &mount.fs)?
            )?;
            Ok(())
        })?;
        Ok(())
    }
}
896
897fn for_each_mount<E>(
898    mount: &MountHandle,
899    callback: &mut impl FnMut(&MountHandle) -> Result<(), E>,
900) -> Result<(), E> {
901    callback(mount)?;
902    // Collect list first to avoid self deadlock when ProcMountinfoFile::read_at tries to call
903    // NamespaceNode::path()
904    let submounts: Vec<_> = mount.read().submounts.iter().map(|s| s.mount.clone()).collect();
905    for submount in submounts {
906        for_each_mount(&submount, callback)?;
907    }
908    Ok(())
909}
910
/// The `SymlinkMode` enum encodes how symlinks are followed during path traversal.
#[derive(Default, PartialEq, Eq, Copy, Clone, Debug)]
pub enum SymlinkMode {
    /// Follow a symlink at the end of a path resolution.
    ///
    /// This is the default mode.
    #[default]
    Follow,

    /// Do not follow a symlink at the end of a path resolution.
    NoFollow,
}
921
/// The maximum number of symlink traversals that can be made during path resolution.
///
/// NOTE(review): 40 appears to mirror Linux's symlink-following limit (MAXSYMLINKS) — confirm.
pub const MAX_SYMLINK_FOLLOWS: u8 = 40;
924
/// The context passed during namespace lookups.
///
/// Namespace lookups need to mutate a shared context in order to correctly
/// count the number of remaining symlink traversals.
pub struct LookupContext {
    /// The SymlinkMode for the lookup.
    ///
    /// Determines whether a symlink encountered at the end of path resolution
    /// is followed or returned as-is.
    pub symlink_mode: SymlinkMode,

    /// The number of symlinks remaining to follow.
    ///
    /// Each time path resolution calls readlink, this value is decremented.
    pub remaining_follows: u8,

    /// Whether the result of the lookup must be a directory.
    ///
    /// For example, if the path ends with a `/` or if userspace passes
    /// O_DIRECTORY. This flag can be set to true if the lookup encounters a
    /// symlink that ends with a `/`.
    pub must_be_directory: bool,

    /// Resolve flags passed to `openat2`. Empty if the lookup originated in any other syscall.
    pub resolve_flags: ResolveFlags,

    /// Base directory for the lookup. Set only when either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT`
    /// is passed to `openat2`.
    pub resolve_base: ResolveBase,
}
955
/// Used to specify base directory in `LookupContext` for lookups originating in the `openat2`
/// syscall with either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT` flag.
#[derive(Clone, Eq, PartialEq)]
pub enum ResolveBase {
    /// No base-directory restriction applies to the lookup.
    None,

    /// The lookup is not allowed to traverse any node that's not beneath the specified node.
    Beneath(NamespaceNode),

    /// The lookup should be handled as if the specified node were the file-system root.
    InRoot(NamespaceNode),
}
968
969impl LookupContext {
970    pub fn new(symlink_mode: SymlinkMode) -> LookupContext {
971        LookupContext {
972            symlink_mode,
973            remaining_follows: MAX_SYMLINK_FOLLOWS,
974            must_be_directory: false,
975            resolve_flags: ResolveFlags::empty(),
976            resolve_base: ResolveBase::None,
977        }
978    }
979
980    pub fn with(&self, symlink_mode: SymlinkMode) -> LookupContext {
981        LookupContext { symlink_mode, resolve_base: self.resolve_base.clone(), ..*self }
982    }
983
984    pub fn update_for_path(&mut self, path: &FsStr) {
985        if path.last() == Some(&b'/') {
986            // The last path element must resolve to a directory. This is because a trailing slash
987            // was found in the path.
988            self.must_be_directory = true;
989            // If the last path element is a symlink, we should follow it.
990            // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
991            self.symlink_mode = SymlinkMode::Follow;
992        }
993    }
994}
995
996impl Default for LookupContext {
997    fn default() -> Self {
998        LookupContext::new(SymlinkMode::Follow)
999    }
1000}
1001
/// Whether the path is reachable from the given root.
pub enum PathWithReachability {
    /// The path is reachable from the given root.
    Reachable(FsString),

    /// The path is not reachable from the given root; the contained path is built relative to
    /// the namespace root instead.
    Unreachable(FsString),
}
1010
1011impl PathWithReachability {
1012    pub fn into_path(self) -> FsString {
1013        match self {
1014            PathWithReachability::Reachable(path) => path,
1015            PathWithReachability::Unreachable(path) => path,
1016        }
1017    }
1018}
1019
/// A node in a mount namespace.
///
/// This tree is a composite of the mount tree and the FsNode tree.
///
/// These nodes are used when traversing paths in a namespace in order to
/// present the client the directory structure that includes the mounted
/// filesystems.
#[derive(Clone)]
pub struct NamespaceNode {
    /// The mount where this namespace node is mounted.
    ///
    /// A given FsNode can be mounted in multiple places in a namespace. This
    /// field distinguishes between them.
    ///
    /// Carries no mount for anonymous nodes (see `new_anonymous`), which do
    /// not live in any namespace.
    pub mount: MountInfo,

    /// The FsNode that corresponds to this namespace entry.
    pub entry: DirEntryHandle,
}
1038
1039impl NamespaceNode {
1040    pub fn new(mount: MountHandle, entry: DirEntryHandle) -> Self {
1041        Self { mount: Some(mount).into(), entry }
1042    }
1043
    /// Create a namespace node that is not mounted in a namespace.
    ///
    /// Anonymous nodes carry no mount; path reporting falls back to the node's internal name.
    pub fn new_anonymous(entry: DirEntryHandle) -> Self {
        Self { mount: None.into(), entry }
    }
1048
    /// Create a namespace node that is not mounted in a namespace and that refers to a node that
    /// is not rooted in a hierarchy and has no name.
    pub fn new_anonymous_unrooted(current_task: &CurrentTask, node: FsNodeHandle) -> Self {
        let dir_entry = DirEntry::new_unrooted(node);
        // Any error from security-state initialization is deliberately ignored here.
        let _ = security::fs_node_init_with_dentry_no_xattr(current_task, &dir_entry);
        Self::new_anonymous(dir_entry)
    }
1056
1057    /// Create a FileObject corresponding to this namespace node.
1058    ///
1059    /// This function is the primary way of instantiating FileObjects. Each
1060    /// FileObject records the NamespaceNode that created it in order to
1061    /// remember its path in the Namespace.
1062    pub fn open(
1063        &self,
1064        locked: &mut Locked<Unlocked>,
1065        current_task: &CurrentTask,
1066        flags: OpenFlags,
1067        access_check: AccessCheck,
1068    ) -> Result<FileHandle, Errno> {
1069        let ops = self.entry.node.open(locked, current_task, self, flags, access_check)?;
1070        FileObject::new(locked, current_task, ops, self.clone(), flags)
1071    }
1072
1073    /// Create or open a node in the file system.
1074    ///
1075    /// Works for any type of node other than a symlink.
1076    ///
1077    /// Will return an existing node unless `flags` contains `OpenFlags::EXCL`.
1078    pub fn open_create_node<L>(
1079        &self,
1080        locked: &mut Locked<L>,
1081        current_task: &CurrentTask,
1082        name: &FsStr,
1083        mode: FileMode,
1084        dev: DeviceId,
1085        flags: OpenFlags,
1086    ) -> Result<NamespaceNode, Errno>
1087    where
1088        L: LockEqualOrBefore<FileOpsCore>,
1089    {
1090        let owner = current_task.current_fscred();
1091        let mode = current_task.fs().apply_umask(mode);
1092        let create_fn =
1093            |locked: &mut Locked<L>, dir: &FsNodeHandle, mount: &MountInfo, name: &_| {
1094                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
1095            };
1096        let entry = if flags.contains(OpenFlags::EXCL) {
1097            self.entry.create_entry(locked, current_task, &self.mount, name, create_fn)
1098        } else {
1099            self.entry.get_or_create_entry(locked, current_task, &self.mount, name, create_fn)
1100        }?;
1101        Ok(self.with_new_entry(entry))
1102    }
1103
    /// Wraps this node in an `ActiveNamespaceNode`.
    pub fn into_active(self) -> ActiveNamespaceNode {
        ActiveNamespaceNode::new(self)
    }
1107
    /// Converts this node into a `FileMapping`, delegating to
    /// `ActiveNamespaceNode::into_mapping` with the optional write-guard `mode`.
    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
        self.into_active().into_mapping(mode)
    }
1111
1112    /// Create a node in the file system.
1113    ///
1114    /// Works for any type of node other than a symlink.
1115    ///
1116    /// Does not return an existing node.
1117    pub fn create_node<L>(
1118        &self,
1119        locked: &mut Locked<L>,
1120        current_task: &CurrentTask,
1121        name: &FsStr,
1122        mode: FileMode,
1123        dev: DeviceId,
1124    ) -> Result<NamespaceNode, Errno>
1125    where
1126        L: LockEqualOrBefore<FileOpsCore>,
1127    {
1128        let owner = current_task.current_fscred();
1129        let mode = current_task.fs().apply_umask(mode);
1130        let entry = self.entry.create_entry(
1131            locked,
1132            current_task,
1133            &self.mount,
1134            name,
1135            |locked, dir, mount, name| {
1136                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
1137            },
1138        )?;
1139        Ok(self.with_new_entry(entry))
1140    }
1141
1142    /// Create a symlink in the file system.
1143    ///
1144    /// To create another type of node, use `create_node`.
1145    pub fn create_symlink<L>(
1146        &self,
1147        locked: &mut Locked<L>,
1148        current_task: &CurrentTask,
1149        name: &FsStr,
1150        target: &FsStr,
1151    ) -> Result<NamespaceNode, Errno>
1152    where
1153        L: LockEqualOrBefore<FileOpsCore>,
1154    {
1155        let owner = current_task.current_fscred();
1156        let entry = self.entry.create_entry(
1157            locked,
1158            current_task,
1159            &self.mount,
1160            name,
1161            |locked, dir, mount, name| {
1162                dir.create_symlink(locked, current_task, mount, name, target, owner)
1163            },
1164        )?;
1165        Ok(self.with_new_entry(entry))
1166    }
1167
1168    /// Creates an anonymous file.
1169    ///
1170    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
1171    ///
1172    /// Used by O_TMPFILE.
1173    pub fn create_tmpfile<L>(
1174        &self,
1175        locked: &mut Locked<L>,
1176        current_task: &CurrentTask,
1177        mode: FileMode,
1178        flags: OpenFlags,
1179    ) -> Result<NamespaceNode, Errno>
1180    where
1181        L: LockEqualOrBefore<FileOpsCore>,
1182    {
1183        let owner = current_task.current_fscred();
1184        let mode = current_task.fs().apply_umask(mode);
1185        Ok(self.with_new_entry(self.entry.create_tmpfile(
1186            locked,
1187            current_task,
1188            &self.mount,
1189            mode,
1190            owner,
1191            flags,
1192        )?))
1193    }
1194
1195    pub fn link<L>(
1196        &self,
1197        locked: &mut Locked<L>,
1198        current_task: &CurrentTask,
1199        name: &FsStr,
1200        child: &FsNodeHandle,
1201    ) -> Result<NamespaceNode, Errno>
1202    where
1203        L: LockEqualOrBefore<FileOpsCore>,
1204    {
1205        let dir_entry = self.entry.create_entry(
1206            locked,
1207            current_task,
1208            &self.mount,
1209            name,
1210            |locked, dir, mount, name| dir.link(locked, current_task, mount, name, child),
1211        )?;
1212        Ok(self.with_new_entry(dir_entry))
1213    }
1214
1215    pub fn bind_socket<L>(
1216        &self,
1217        locked: &mut Locked<L>,
1218        current_task: &CurrentTask,
1219        name: &FsStr,
1220        socket: SocketHandle,
1221        socket_address: SocketAddress,
1222        mode: FileMode,
1223    ) -> Result<NamespaceNode, Errno>
1224    where
1225        L: LockEqualOrBefore<FileOpsCore>,
1226    {
1227        let dir_entry = self.entry.create_entry(
1228            locked,
1229            current_task,
1230            &self.mount,
1231            name,
1232            |locked, dir, mount, name| {
1233                let node = dir.create_node(
1234                    locked,
1235                    current_task,
1236                    mount,
1237                    name,
1238                    mode,
1239                    DeviceId::NONE,
1240                    current_task.current_fscred(),
1241                )?;
1242                if let Some(unix_socket) = socket.downcast_socket::<UnixSocket>() {
1243                    unix_socket.bind_socket_to_node(&socket, socket_address, &node)?;
1244                } else {
1245                    return error!(ENOTSUP);
1246                }
1247                Ok(node)
1248            },
1249        )?;
1250        Ok(self.with_new_entry(dir_entry))
1251    }
1252
1253    pub fn unlink<L>(
1254        &self,
1255        locked: &mut Locked<L>,
1256        current_task: &CurrentTask,
1257        name: &FsStr,
1258        kind: UnlinkKind,
1259        must_be_directory: bool,
1260    ) -> Result<(), Errno>
1261    where
1262        L: LockEqualOrBefore<FileOpsCore>,
1263    {
1264        if DirEntry::is_reserved_name(name) {
1265            match kind {
1266                UnlinkKind::Directory => {
1267                    if name == ".." {
1268                        error!(ENOTEMPTY)
1269                    } else if self.parent().is_none() {
1270                        // The client is attempting to remove the root.
1271                        error!(EBUSY)
1272                    } else {
1273                        error!(EINVAL)
1274                    }
1275                }
1276                UnlinkKind::NonDirectory => error!(ENOTDIR),
1277            }
1278        } else {
1279            self.entry.unlink(locked, current_task, &self.mount, name, kind, must_be_directory)
1280        }
1281    }
1282
    /// Resolve the current node.
    ///
    /// Depending on context, this will resolve symlink and mount point.
    ///
    /// Consumes `self`; each traversed symlink decrements `context.remaining_follows`, and
    /// `ELOOP` is returned when the budget is exhausted or symlinks are forbidden by
    /// `context.resolve_flags`. The final node has any mount covering it entered.
    fn resolve<L>(
        self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        context: &mut LookupContext,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let mut node = self;

        loop {
            // Stop at the first non-symlink, or when the caller asked not to follow one.
            if !node.entry.node.is_lnk() || context.symlink_mode == SymlinkMode::NoFollow {
                break;
            }
            if context.remaining_follows == 0
                || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
            {
                return error!(ELOOP);
            }
            context.remaining_follows -= 1;
            node = match node.readlink(locked, current_task)? {
                SymlinkTarget::Path(link_target) => {
                    let link_directory = if link_target[0] == b'/' {
                        // If the path is absolute, we'll resolve the root directory.
                        match &context.resolve_base {
                            ResolveBase::None => current_task.fs().root(),
                            ResolveBase::Beneath(_) => return error!(EXDEV),
                            ResolveBase::InRoot(root) => root.clone(),
                        }
                    } else {
                        // If the path is not absolute, it's a relative directory.
                        // Let's try to get the parent of the current node, or in the case that
                        // the node is the root we can just use that directly.
                        node.parent().unwrap_or(node)
                    };
                    current_task.lookup_path(
                        locked,
                        context,
                        link_directory,
                        link_target.as_ref(),
                    )?
                }
                SymlinkTarget::Node(node) => {
                    // A "magic" link resolves directly to a node; RESOLVE_NO_MAGICLINKS
                    // forbids following it.
                    if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS) {
                        return error!(ELOOP);
                    }
                    node
                }
            };
        }
        Ok(node.enter_mount())
    }
1339
    /// Traverse down a parent-to-child link in the namespace.
    ///
    /// Convenience wrapper around `lookup_children` for a single path component.
    pub fn lookup_child<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        context: &mut LookupContext,
        basename: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.lookup_children(locked, current_task, context, &[basename])
    }
1353
1354    /// Traverse down a parent-to-child link in the namespace.
1355    pub fn lookup_children<L>(
1356        &self,
1357        locked: &mut Locked<L>,
1358        current_task: &CurrentTask,
1359        context: &mut LookupContext,
1360        mut basenames: &[&FsStr],
1361    ) -> Result<NamespaceNode, Errno>
1362    where
1363        L: LockEqualOrBefore<FileOpsCore>,
1364    {
1365        for name in basenames {
1366            if name.len() > NAME_MAX as usize {
1367                return error!(ENAMETOOLONG);
1368            }
1369        }
1370
1371        let mut current_namespace_node = self.clone();
1372
1373        while basenames.len() > 0 {
1374            if !current_namespace_node.entry.node.is_dir() {
1375                return error!(ENOTDIR);
1376            }
1377
1378            let basename = basenames[0];
1379            if basename.is_empty() || basename == "." {
1380                basenames = &basenames[1..];
1381                continue;
1382            }
1383            if basename == ".." {
1384                let root = match &context.resolve_base {
1385                    ResolveBase::None => current_task.fs().root(),
1386                    ResolveBase::Beneath(node) => {
1387                        // Do not allow traversal out of the 'node'.
1388                        if current_namespace_node == *node {
1389                            return error!(EXDEV);
1390                        }
1391                        current_task.fs().root()
1392                    }
1393                    ResolveBase::InRoot(root) => root.clone(),
1394                };
1395
1396                // Make sure this can't escape a chroot.
1397                if current_namespace_node != root {
1398                    current_namespace_node =
1399                        current_namespace_node.parent().unwrap_or(current_namespace_node)
1400                }
1401                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
1402                    && current_namespace_node.mount != self.mount
1403                {
1404                    return error!(EXDEV);
1405                }
1406
1407                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
1408                    return error!(ENOTDIR);
1409                }
1410                basenames = &basenames[1..];
1411                continue;
1412            }
1413            if basenames.len() == 1
1414                || !current_namespace_node.entry.node.ops().has_lookup_pipelined()
1415            {
1416                current_namespace_node = current_namespace_node.with_new_entry(
1417                    current_namespace_node.entry.component_lookup(
1418                        locked,
1419                        current_task,
1420                        &current_namespace_node.mount,
1421                        basename,
1422                    )?,
1423                );
1424
1425                current_namespace_node =
1426                    current_namespace_node.resolve(locked, current_task, context)?;
1427
1428                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
1429                    && current_namespace_node.mount != self.mount
1430                {
1431                    return error!(EXDEV);
1432                }
1433
1434                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
1435                    return error!(ENOTDIR);
1436                }
1437
1438                basenames = &basenames[1..];
1439                continue;
1440            }
1441
1442            let pipelined_basenames = if let Some(pos) =
1443                basenames.iter().position(|&name| name.is_empty() || name == "." || name == "..")
1444            {
1445                &basenames[..pos]
1446            } else {
1447                basenames
1448            };
1449            let precomputed_entries = current_namespace_node.entry.get_children_pipelined(
1450                locked,
1451                current_task,
1452                &current_namespace_node.mount,
1453                pipelined_basenames,
1454            );
1455            for entry in precomputed_entries {
1456                basenames = &basenames[1..];
1457                let child = current_namespace_node.with_new_entry(entry?);
1458
1459                current_namespace_node = child.clone().resolve(locked, current_task, context)?;
1460
1461                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
1462                    && current_namespace_node.mount != self.mount
1463                {
1464                    return error!(EXDEV);
1465                }
1466
1467                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
1468                    return error!(ENOTDIR);
1469                }
1470
1471                if current_namespace_node != child {
1472                    break;
1473                }
1474            }
1475        }
1476
1477        Ok(current_namespace_node)
1478    }
1479
1480    /// Traverse up a child-to-parent link in the namespace.
1481    ///
1482    /// This traversal matches the child-to-parent link in the underlying
1483    /// FsNode except at mountpoints, where the link switches from one
1484    /// filesystem to another.
1485    pub fn parent(&self) -> Option<NamespaceNode> {
1486        let mountpoint_or_self = self.escape_mount();
1487        let parent = mountpoint_or_self.entry.parent()?;
1488        Some(mountpoint_or_self.with_new_entry(parent))
1489    }
1490
1491    /// Returns the parent, but does not escape mounts i.e. returns None if this node
1492    /// is the root of a mount.
1493    pub fn parent_within_mount(&self) -> Option<DirEntryHandle> {
1494        if let Ok(_) = self.mount_if_root() {
1495            return None;
1496        }
1497        self.entry.parent()
1498    }
1499
1500    /// Whether this namespace node is a descendant of the given node.
1501    ///
1502    /// Walks up the namespace node tree looking for ancestor. If ancestor is
1503    /// found, returns true. Otherwise, returns false.
1504    pub fn is_descendant_of(&self, ancestor: &NamespaceNode) -> bool {
1505        let ancestor = ancestor.escape_mount();
1506        let mut current = self.escape_mount();
1507        while current != ancestor {
1508            if let Some(parent) = current.parent() {
1509                current = parent.escape_mount();
1510            } else {
1511                return false;
1512            }
1513        }
1514        true
1515    }
1516
    /// If this is a mount point, return the root of the mount. Otherwise return self.
    fn enter_mount(&self) -> NamespaceNode {
        // While the child is a mountpoint, replace child with the mount's root.
        fn enter_one_mount(node: &NamespaceNode) -> Option<NamespaceNode> {
            if let Some(mount) = node.mount.deref() {
                if let Some(submount) =
                    mount.state.read().submounts.get(ArcKey::ref_cast(&node.entry))
                {
                    return Some(submount.mount.root());
                }
            }
            None
        }
        // Mounts can be stacked on the same entry, so loop until no submount covers the node.
        let mut inner = self.clone();
        while let Some(inner_root) = enter_one_mount(&inner) {
            inner = inner_root;
        }
        inner
    }
1536
1537    /// If this is the root of a mount, return the mount point. Otherwise return self.
1538    ///
1539    /// This is not exactly the same as parent(). If parent() is called on a root, it will escape
1540    /// the mount, but then return the parent of the mount point instead of the mount point.
1541    fn escape_mount(&self) -> NamespaceNode {
1542        let mut mountpoint_or_self = self.clone();
1543        while let Some(mountpoint) = mountpoint_or_self.mountpoint() {
1544            mountpoint_or_self = mountpoint;
1545        }
1546        mountpoint_or_self
1547    }
1548
1549    /// If this node is the root of a mount, return it. Otherwise EINVAL.
1550    pub fn mount_if_root(&self) -> Result<&MountHandle, Errno> {
1551        if let Some(mount) = self.mount.deref() {
1552            if Arc::ptr_eq(&self.entry, &mount.root) {
1553                return Ok(mount);
1554            }
1555        }
1556        error!(EINVAL)
1557    }
1558
    /// Returns the mountpoint at this location in the namespace.
    ///
    /// If this node is mounted in another node, this function returns the node
    /// at which this node is mounted. Otherwise, returns None.
    fn mountpoint(&self) -> Option<NamespaceNode> {
        // Only the root of a mount can have a mountpoint; any other node yields `None`.
        self.mount_if_root().ok()?.read().mountpoint()
    }
1566
    /// The path from the filesystem root to this node.
    pub fn path(&self, fs: &FsContext) -> FsString {
        // Reachability is discarded; callers only want the textual path.
        self.path_from_root(Some(&fs.root())).into_path()
    }
1571
    /// The path from the root of the namespace to this node.
    ///
    /// Unlike `path`, this ignores any task `chroot` and walks to the namespace root.
    pub fn path_escaping_chroot(&self) -> FsString {
        self.path_from_root(None).into_path()
    }
1576
    /// Returns the path to this node, accounting for a custom root.
    /// A task may have a custom root set by `chroot`.
    ///
    /// Anonymous nodes (no mount) report the node's internal name and are always considered
    /// reachable. A dead entry gets a " (deleted)" suffix appended to its path.
    pub fn path_from_root(&self, root: Option<&NamespaceNode>) -> PathWithReachability {
        if self.mount.is_none() {
            return PathWithReachability::Reachable(self.entry.node.internal_name());
        }

        let mut path = PathBuilder::new();
        let mut current = self.escape_mount();
        if let Some(root) = root {
            let scope = RcuReadScope::new();
            // The current node is expected to intersect with the custom root as we travel up the tree.
            let root = root.escape_mount();
            while current != root {
                if let Some(parent) = current.parent() {
                    path.prepend_element(current.entry.local_name(&scope));
                    current = parent.escape_mount();
                } else {
                    // This node hasn't intersected with the custom root and has reached the namespace root.
                    let mut absolute_path = path.build_absolute();
                    if self.entry.is_dead() {
                        absolute_path.extend_from_slice(b" (deleted)");
                    }

                    return PathWithReachability::Unreachable(absolute_path);
                }
            }
        } else {
            // No custom root, so travel up the tree to the namespace root.
            let scope = RcuReadScope::new();
            while let Some(parent) = current.parent() {
                path.prepend_element(current.entry.local_name(&scope));
                current = parent.escape_mount();
            }
        }

        let mut absolute_path = path.build_absolute();
        if self.entry.is_dead() {
            absolute_path.extend_from_slice(b" (deleted)");
        }

        PathWithReachability::Reachable(absolute_path)
    }
1620
1621    pub fn mount(&self, what: WhatToMount, flags: MountpointFlags) -> Result<(), Errno> {
1622        let mountpoint = self.enter_mount();
1623        let mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
1624        mount.create_submount(&mountpoint.entry, what, flags);
1625        Ok(())
1626    }
1627
1628    /// If this is the root of a filesystem, unmount. Otherwise return EINVAL.
1629    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
1630        let mount = self.enter_mount().mount_if_root()?.clone();
1631        mount.unmount(flags)
1632    }
1633
    /// Renames the entry `old_name` under `old_parent` to `new_name` under
    /// `new_parent`, subject to `flags`.
    ///
    /// Delegates to [`DirEntry::rename`], forwarding each parent's entry and
    /// mount so that cross-mount/busy-mountpoint checks can happen there.
    pub fn rename<L>(
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        old_parent: &NamespaceNode,
        old_name: &FsStr,
        new_parent: &NamespaceNode,
        new_name: &FsStr,
        flags: RenameFlags,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        DirEntry::rename(
            locked,
            current_task,
            &old_parent.entry,
            &old_parent.mount,
            old_name,
            &new_parent.entry,
            &new_parent.mount,
            new_name,
            flags,
        )
    }
1658
1659    fn with_new_entry(&self, entry: DirEntryHandle) -> NamespaceNode {
1660        Self { mount: self.mount.clone(), entry }
1661    }
1662
    /// Zero-cost view of this node's entry as the key type used by the
    /// global mounts map.
    fn mount_hash_key(&self) -> &ArcKey<DirEntry> {
        ArcKey::ref_cast(&self.entry)
    }
1666
1667    pub fn suid_and_sgid(&self, current_task: &CurrentTask) -> Result<UserAndOrGroupId, Errno> {
1668        if self.mount.flags().contains(MountFlags::NOSUID) {
1669            Ok(UserAndOrGroupId::default())
1670        } else {
1671            self.entry.node.info().suid_and_sgid(current_task, &self.entry.node)
1672        }
1673    }
1674
1675    pub fn update_atime(&self) {
1676        // Do not update the atime of this node if it is mounted with the NOATIME flag.
1677        if !self.mount.flags().contains(MountFlags::NOATIME) {
1678            self.entry.node.update_info(|info| {
1679                let now = utc::utc_now();
1680                info.time_access = now;
1681                info.pending_time_access_update = true;
1682            });
1683        }
1684    }
1685
    /// Reads the symlink target of the underlying node.
    ///
    /// Reading a link counts as an access, so the atime is updated first
    /// (a no-op on NOATIME mounts).
    pub fn readlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<SymlinkTarget, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.update_atime();
        self.entry.node.readlink(locked, current_task)
    }
1697
1698    pub fn notify(&self, event_mask: InotifyMask) {
1699        if self.mount.is_some() {
1700            self.entry.notify(event_mask);
1701        }
1702    }
1703
    /// Check whether the node can be accessed in the current context with the specified access
    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
    /// owner or is in the file's group.
    ///
    /// Delegates to `FsNode::check_access`, passing this node's mount
    /// (presumably so mount-level restrictions such as read-only can be
    /// enforced there — confirm in `FsNode::check_access`).
    pub fn check_access<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        permission_flags: impl Into<security::PermissionFlags>,
        reason: CheckAccessReason,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.entry.node.check_access(
            locked,
            current_task,
            &self.mount,
            permission_flags,
            reason,
            self,
        )
    }
1726
    /// Checks if O_NOATIME is allowed for `current_task` on this node
    /// (delegates to the underlying `FsNode`).
    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
        self.entry.node.check_o_noatime_allowed(current_task)
    }
1731
    /// Truncates the underlying node to `length` bytes.
    ///
    /// On success, notifies inotify watchers with `IN_MODIFY`.
    pub fn truncate<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        length: u64,
    ) -> Result<(), Errno>
    where
        L: LockBefore<BeforeFsNodeAppend>,
    {
        self.entry.node.truncate(locked, current_task, &self.mount, length)?;
        self.entry.notify_ignoring_excl_unlink(InotifyMask::MODIFY);
        Ok(())
    }
1745}
1746
1747impl fmt::Debug for NamespaceNode {
1748    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1749        f.debug_struct("NamespaceNode")
1750            .field("path", &self.path_escaping_chroot())
1751            .field("mount", &self.mount)
1752            .field("entry", &self.entry)
1753            .finish()
1754    }
1755}
1756
1757// Eq/Hash impls intended for the MOUNT_POINTS hash
1758impl PartialEq for NamespaceNode {
1759    fn eq(&self, other: &Self) -> bool {
1760        self.mount.as_ref().map(Arc::as_ptr).eq(&other.mount.as_ref().map(Arc::as_ptr))
1761            && Arc::ptr_eq(&self.entry, &other.entry)
1762    }
1763}
1764impl Eq for NamespaceNode {}
1765impl Hash for NamespaceNode {
1766    fn hash<H: Hasher>(&self, state: &mut H) {
1767        self.mount.as_ref().map(Arc::as_ptr).hash(state);
1768        Arc::as_ptr(&self.entry).hash(state);
1769    }
1770}
1771
/// A namespace node that keeps the underlying mount busy.
#[derive(Debug, Clone)]
pub struct ActiveNamespaceNode {
    /// The underlying namespace node.
    name: NamespaceNode,

    /// Adds a reference to the mount client marker to prevent the mount from
    /// being removed while the NamespaceNode is active. Is None iff mount is
    /// None.
    _marker: Option<MountClientMarker>,
}
1783
1784impl ActiveNamespaceNode {
1785    pub fn new(name: NamespaceNode) -> Self {
1786        let marker = name.mount.as_ref().map(|mount| mount.active_client_counter.clone());
1787        Self { name, _marker: marker }
1788    }
1789
1790    pub fn to_passive(&self) -> NamespaceNode {
1791        self.deref().clone()
1792    }
1793
1794    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
1795        if let Some(mode) = mode {
1796            self.entry.node.write_guard_state.lock().acquire(mode)?;
1797        }
1798        Ok(Arc::new(FileMapping { name: self, mode }))
1799    }
1800}
1801
/// Lets an `ActiveNamespaceNode` be used anywhere a [`NamespaceNode`] is
/// expected.
impl Deref for ActiveNamespaceNode {
    type Target = NamespaceNode;

    fn deref(&self) -> &Self::Target {
        &self.name
    }
}
1809
1810impl PartialEq for ActiveNamespaceNode {
1811    fn eq(&self, other: &Self) -> bool {
1812        self.deref().eq(other.deref())
1813    }
1814}
1815impl Eq for ActiveNamespaceNode {}
1816impl Hash for ActiveNamespaceNode {
1817    fn hash<H: Hasher>(&self, state: &mut H) {
1818        self.deref().hash(state)
1819    }
1820}
1821
/// A mapped file: holds the backing [`ActiveNamespaceNode`] (keeping its mount
/// busy) and, optionally, a write-guard mode that is released on drop.
// NOTE(review): `Clone` duplicates `mode`, so each clone's `Drop` calls
// `release` once — verify `write_guard_state` counts acquisitions, or that
// clones never outlive the original acquisition.
#[derive(Debug, Clone, PartialEq, Eq)]
#[must_use]
pub struct FileMapping {
    /// The active node backing this mapping.
    pub name: ActiveNamespaceNode,
    /// Write-guard mode acquired in `into_mapping`, if any.
    mode: Option<FileWriteGuardMode>,
}
1828
1829impl Drop for FileMapping {
1830    fn drop(&mut self) {
1831        if let Some(mode) = self.mode {
1832            self.name.entry.node.write_guard_state.lock().release(mode);
1833        }
1834    }
1835}
1836
/// Tracks all mounts, keyed by mount point.
pub struct Mounts {
    /// Maps a mountpoint directory entry (held weakly) to the list of mounts
    /// attached at that entry.
    mounts: RcuHashMap<WeakKey<DirEntry>, Vec<ArcKey<Mount>>>,
}
1841
impl Mounts {
    /// Creates an empty mounts registry.
    pub fn new() -> Self {
        Mounts { mounts: RcuHashMap::default() }
    }

    /// Registers the mount in the global mounts map.
    ///
    /// Sets `has_mounts` on `dir_entry` when this is the first mount at that
    /// entry. Returns a [`Submount`] whose `Drop` undoes the registration.
    fn register_mount(&self, dir_entry: &Arc<DirEntry>, mount: MountHandle) -> Submount {
        let mut mounts = self.mounts.lock();
        let key = WeakKey::from(dir_entry);
        // `get` yields an owned copy of the value; the modified vector must be
        // re-inserted below for the update to take effect.
        let mut vec = mounts.get(&key).unwrap_or_else(|| {
            dir_entry.set_has_mounts(true);
            Vec::new()
        });
        vec.push(ArcKey(mount.clone()));
        mounts.insert(key, vec);
        Submount { dir: ArcKey(dir_entry.clone()), mount }
    }

    /// Unregisters the mount.  This is called by `Submount::drop`.
    fn unregister_mount(&self, dir_entry: &Arc<DirEntry>, mount: &MountHandle) {
        let mut mounts = self.mounts.lock();
        let key = WeakKey::from(dir_entry);
        if let Some(mut vec) = mounts.get(&key) {
            let index = vec.iter().position(|e| e == ArcKey::ref_cast(mount)).unwrap();
            // Removing the last mount drops the map entry and clears the
            // `has_mounts` flag; otherwise write back the shortened vector.
            if vec.len() == 1 {
                mounts.remove(&key);
                dir_entry.set_has_mounts(false);
            } else {
                vec.swap_remove(index);
                mounts.insert(key, vec);
            }
        }
    }

    /// Unmounts all mounts associated with `dir_entry`.  This is called when `dir_entry` is
    /// unlinked (which would normally result in EBUSY, but not if it isn't mounted in the local
    /// namespace).
    // NOTE(review): unlike `unregister_mount`, this removes the map entry
    // without clearing `has_mounts` on `dir_entry`; presumably harmless for an
    // entry that is being unlinked — confirm.
    pub fn unmount(&self, dir_entry: &DirEntry) {
        let mounts = self.mounts.lock().remove(&PtrKey::from(dir_entry as *const _));
        if let Some(mounts) = mounts {
            for mount in mounts {
                // Ignore errors.
                let _ = mount.unmount(UnmountFlags::DETACH);
            }
        }
    }

    /// Drain mounts. For each drained mount, force a FileSystem unmount.
    // TODO(https://fxbug.dev/295073633): Graceful shutdown should try to first unmount the mounts
    // and only force a FileSystem unmount on failure.
    pub fn clear(&self) {
        for (_dir_entry, mounts) in self.mounts.lock().drain() {
            for mount in mounts {
                mount.fs.force_unmount_ops();
            }
        }
    }

    /// Syncs every filesystem that currently backs at least one mount.
    ///
    /// Filesystems are first collected under an RCU read scope, deduplicated
    /// by pointer identity, so no map lock is held during the actual syncs.
    pub fn sync_all(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
    ) -> Result<(), Errno> {
        let mut filesystems = Vec::new();
        {
            let scope = RcuReadScope::new();
            let mut seen = HashSet::new();
            for (_dir_entry, m_list) in self.mounts.iter(&scope) {
                for m in m_list {
                    if seen.insert(Arc::as_ptr(&m.fs)) {
                        filesystems.push(m.fs.clone());
                    }
                }
            }
        }

        // A failed sync is logged and skipped so one bad filesystem does not
        // abort the rest; the overall result is always Ok.
        for fs in filesystems {
            if let Err(e) = fs.sync(locked, current_task) {
                log_warn!("sync failed for filesystem {:?}: {:?}", fs.name(), e);
            }
        }
        Ok(())
    }
}
1926
/// A RAII object that unregisters a mount when dropped.
#[derive(Debug)]
struct Submount {
    /// The mountpoint directory entry this mount is attached to.
    dir: ArcKey<DirEntry>,
    /// The mount attached at `dir`.
    mount: MountHandle,
}
1933
impl Drop for Submount {
    // Undo the registration performed by `Mounts::register_mount`.
    // NOTE(review): `upgrade().unwrap()` assumes the kernel outlives every
    // Submount; confirm this holds during kernel teardown.
    fn drop(&mut self) {
        self.mount.fs.kernel.upgrade().unwrap().mounts.unregister_mount(&self.dir, &self.mount)
    }
}
1939
1940/// Submount is stored in a mount's submounts hash set, which is keyed by the mountpoint.
1941impl Eq for Submount {}
1942impl PartialEq<Self> for Submount {
1943    fn eq(&self, other: &Self) -> bool {
1944        self.dir == other.dir
1945    }
1946}
1947impl Hash for Submount {
1948    fn hash<H: Hasher>(&self, state: &mut H) {
1949        self.dir.hash(state)
1950    }
1951}
1952
/// Enables hash-set lookup of a `Submount` by its mountpoint entry alone.
impl Borrow<ArcKey<DirEntry>> for Submount {
    fn borrow(&self) -> &ArcKey<DirEntry> {
        &self.dir
    }
}
1958
#[cfg(test)]
mod test {
    use crate::fs::tmpfs::TmpFs;
    use crate::testing::spawn_kernel_and_run;
    use crate::vfs::namespace::DeviceId;
    use crate::vfs::{
        CallbackSymlinkNode, FsNodeInfo, LookupContext, MountInfo, Namespace, NamespaceNode,
        RenameFlags, SymlinkMode, SymlinkTarget, UnlinkKind, WhatToMount,
    };
    use starnix_uapi::mount_flags::MountpointFlags;
    use starnix_uapi::{errno, mode};
    use std::sync::Arc;

    /// Mounting a filesystem at /dev and walking into it: the mounted child's
    /// parent chain must point back through the mountpoint to the root.
    #[::fuchsia::test]
    async fn test_namespace() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
                .expect("failed to mount dev root node");

            // Re-lookup so we traverse into the freshly mounted filesystem.
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            let pts_parent =
                pts.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of pts");
            assert!(Arc::ptr_eq(&pts_parent.entry, &dev.entry));

            let dev_parent =
                dev.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of dev");
            assert!(Arc::ptr_eq(&dev_parent.entry, &ns.root().entry));
        })
        .await;
    }

    /// A NamespaceNode looked up before a mount must not see entries from the
    /// filesystem mounted afterwards.
    #[::fuchsia::test]
    async fn test_mount_does_not_upgrade() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
                .expect("failed to mount dev root node");
            let mut context = LookupContext::default();
            let new_dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev again");
            // The old node still refers to the pre-mount entry.
            assert!(!Arc::ptr_eq(&dev.entry, &new_dev.entry));
            assert_ne!(&dev, &new_dev);

            let mut context = LookupContext::default();
            let _new_pts = new_dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");
            let mut context = LookupContext::default();
            assert!(dev.lookup_child(locked, &current_task, &mut context, "pts".into()).is_err());
        })
        .await;
    }

    /// `path_escaping_chroot` reports full paths across a mount boundary.
    #[::fuchsia::test]
    async fn test_path() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _dev_node = root_node
                .create_dir(locked, &current_task, "dev".into())
                .expect("failed to mkdir dev");
            let dev_fs = TmpFs::new_fs(locked, &kernel);
            let dev_root_node = Arc::clone(dev_fs.root());
            let _dev_pts_node = dev_root_node
                .create_dir(locked, &current_task, "pts".into())
                .expect("failed to mkdir pts");

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
                .expect("failed to mount dev root node");

            let mut context = LookupContext::default();
            let dev = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "dev".into())
                .expect("failed to lookup dev");
            let mut context = LookupContext::default();
            let pts = dev
                .lookup_child(locked, &current_task, &mut context, "pts".into())
                .expect("failed to lookup pts");

            assert_eq!("/", ns.root().path_escaping_chroot());
            assert_eq!("/dev", dev.path_escaping_chroot());
            assert_eq!("/dev/pts", pts.path_escaping_chroot());
        })
        .await;
    }

    /// Stacked mounts shadow each other in the mounting namespace, while a
    /// namespace cloned between the two mounts keeps seeing the first one.
    #[::fuchsia::test]
    async fn test_shadowing() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs1 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs1.clone()), MountpointFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
            let foo_dir =
                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            // Clone between the two mounts: the clone should keep seeing foofs1.
            let ns_clone = ns.clone_namespace();

            let foofs2 = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs2.clone()), MountpointFlags::empty()).unwrap();
            let mut context = LookupContext::default();
            assert!(Arc::ptr_eq(
                &ns.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap()
                    .entry,
                foofs2.root()
            ));

            assert!(Arc::ptr_eq(
                &ns_clone
                    .root()
                    .lookup_child(
                        locked,
                        &current_task,
                        &mut LookupContext::default(),
                        "foo".into()
                    )
                    .unwrap()
                    .entry,
                foofs1.root()
            ));
        })
        .await;
    }

    /// Unlinking a directory that is a mountpoint fails with EBUSY only in the
    /// namespace where it is mounted.
    #[::fuchsia::test]
    async fn test_unlink_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountpointFlags::empty()).unwrap();

            // Trying to unlink from ns1 should fail.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(EBUSY),
            );

            // But unlinking from ns2 should succeed.
            ns2.root()
                .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                .expect("unlink failed");

            // And it should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
                    .unwrap_err(),
                errno!(ENOENT),
            );
        })
        .await;
    }

    /// Renaming to or from a mountpoint fails with EBUSY in the mounting
    /// namespace but succeeds from a namespace without the mount.
    #[::fuchsia::test]
    async fn test_rename_mounted_directory() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let ns1 = Namespace::new(root_fs.clone());
            let ns2 = Namespace::new(root_fs.clone());
            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
            let _bar_node = root_fs.root().create_dir(locked, &current_task, "bar".into()).unwrap();
            let _baz_node = root_fs.root().create_dir(locked, &current_task, "baz".into()).unwrap();
            let mut context = LookupContext::default();
            let foo_dir =
                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();

            let foofs = TmpFs::new_fs(locked, &kernel);
            foo_dir.mount(WhatToMount::Fs(foofs), MountpointFlags::empty()).unwrap();

            // Trying to rename over foo from ns1 should fail.
            let root = ns1.root();
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "bar".into(),
                    &root,
                    "foo".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );
            // Likewise the other way.
            assert_eq!(
                NamespaceNode::rename(
                    locked,
                    &current_task,
                    &root,
                    "foo".into(),
                    &root,
                    "bar".into(),
                    RenameFlags::empty()
                )
                .unwrap_err(),
                errno!(EBUSY),
            );

            // But renaming from ns2 should succeed.
            let root = ns2.root();

            // First rename the directory with the mount.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "foo".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // Renaming over a directory with a mount should also work.
            NamespaceNode::rename(
                locked,
                &current_task,
                &root,
                "baz".into(),
                &root,
                "bar".into(),
                RenameFlags::empty(),
            )
            .expect("rename failed");

            // "foo" and "baz" should no longer show up in ns1.
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "foo".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
            assert_eq!(
                ns1.root()
                    .lookup_child(locked, &current_task, &mut context, "baz".into())
                    .unwrap_err(),
                errno!(ENOENT)
            );
        })
        .await;
    }

    /// Symlinks which need to be traversed across types (nodes and paths), as well as across
    /// owning directories, can be tricky to get right.
    #[::fuchsia::test]
    async fn test_lookup_with_symlink_chain() {
        spawn_kernel_and_run(async |locked, current_task| {
            // Set up the root filesystem
            let kernel = current_task.kernel();
            let root_fs = TmpFs::new_fs(locked, &kernel);
            let root_node = Arc::clone(root_fs.root());
            let _first_subdir_node = root_node
                .create_dir(locked, &current_task, "first_subdir".into())
                .expect("failed to mkdir dev");
            let _second_subdir_node = root_node
                .create_dir(locked, &current_task, "second_subdir".into())
                .expect("failed to mkdir dev");

            // Set up two subdirectories under the root filesystem
            let first_subdir_fs = TmpFs::new_fs(locked, &kernel);
            let second_subdir_fs = TmpFs::new_fs(locked, &kernel);

            let ns = Namespace::new(root_fs);
            let mut context = LookupContext::default();
            let first_subdir = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "first_subdir".into())
                .expect("failed to lookup first_subdir");
            first_subdir
                .mount(WhatToMount::Fs(first_subdir_fs), MountpointFlags::empty())
                .expect("failed to mount first_subdir fs node");
            let second_subdir = ns
                .root()
                .lookup_child(locked, &current_task, &mut context, "second_subdir".into())
                .expect("failed to lookup second_subdir");
            second_subdir
                .mount(WhatToMount::Fs(second_subdir_fs), MountpointFlags::empty())
                .expect("failed to mount second_subdir fs node");

            // Create the symlink structure. To trigger potential symlink traversal bugs, we're going
            // for the following directory structure:
            // / (root)
            //     + first_subdir/
            //         - real_file
            //         - path_symlink (-> real_file)
            //     + second_subdir/
            //         - node_symlink (-> path_symlink)
            let real_file_node = first_subdir
                .create_node(
                    locked,
                    &current_task,
                    "real_file".into(),
                    mode!(IFREG, 0o777),
                    DeviceId::NONE,
                )
                .expect("failed to create real_file");
            first_subdir
                .create_symlink(locked, &current_task, "path_symlink".into(), "real_file".into())
                .expect("failed to create path_symlink");

            let mut no_follow_lookup_context = LookupContext::new(SymlinkMode::NoFollow);
            let path_symlink_node = first_subdir
                .lookup_child(
                    locked,
                    &current_task,
                    &mut no_follow_lookup_context,
                    "path_symlink".into(),
                )
                .expect("Failed to lookup path_symlink");

            // The second symlink needs to be of type SymlinkTarget::Node in order to trip the sensitive
            // code path. There's no easy method for creating this type of symlink target, so we'll need
            // to construct a node from scratch and insert it into the directory manually.
            let node_symlink_node = second_subdir.entry.node.fs().create_node_and_allocate_node_id(
                CallbackSymlinkNode::new(move || {
                    let node = path_symlink_node.clone();
                    Ok(SymlinkTarget::Node(node))
                }),
                FsNodeInfo::new(mode!(IFLNK, 0o777), current_task.current_fscred()),
            );
            second_subdir
                .entry
                .create_entry(
                    locked,
                    &current_task,
                    &MountInfo::detached(),
                    "node_symlink".into(),
                    move |_locked, _dir, _mount, _name| Ok(node_symlink_node),
                )
                .expect("failed to create node_symlink entry");

            // Finally, exercise the lookup under test.
            let mut follow_lookup_context = LookupContext::new(SymlinkMode::Follow);
            let node_symlink_resolution = second_subdir
                .lookup_child(
                    locked,
                    &current_task,
                    &mut follow_lookup_context,
                    "node_symlink".into(),
                )
                .expect("lookup with symlink chain failed");

            // The lookup resolution should have correctly followed the symlinks to the real_file node.
            assert!(node_symlink_resolution.entry.node.ino == real_file_node.entry.node.ino);
        })
        .await;
    }
}