Skip to main content

starnix_core/vfs/
namespace.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mutable_state::{state_accessor, state_implementation};
6use crate::security;
7use crate::task::{CurrentTask, EventHandler, Kernel, Task, WaitCanceler, Waiter};
8use crate::time::utc;
9use crate::vfs::fs_registry::FsRegistry;
10use crate::vfs::pseudo::dynamic_file::{DynamicFile, DynamicFileBuf, DynamicFileSource};
11use crate::vfs::pseudo::simple_file::SimpleFileNode;
12use crate::vfs::socket::{SocketAddress, SocketHandle, UnixSocket};
13use crate::vfs::{
14    CheckAccessReason, DirEntry, DirEntryHandle, FileHandle, FileObject, FileOps, FileSystemHandle,
15    FileSystemOptions, FileWriteGuardMode, FsContext, FsNode, FsNodeHandle, FsNodeOps, FsStr,
16    FsString, PathBuilder, RenameFlags, SymlinkTarget, UnlinkKind, fileops_impl_dataless,
17    fileops_impl_delegate_read_write_and_seek, fileops_impl_nonseekable, fileops_impl_noop_sync,
18    fs_node_impl_not_dir,
19};
20use fuchsia_rcu::RcuReadScope;
21use macro_rules_attribute::apply;
22use ref_cast::RefCast;
23use starnix_logging::log_warn;
24use starnix_rcu::RcuHashMap;
25use starnix_sync::{
26    BeforeFsNodeAppend, FileOpsCore, LockEqualOrBefore, Locked, Mutex, RwLock, Unlocked,
27};
28use starnix_uapi::arc_key::{ArcKey, PtrKey, WeakKey};
29use starnix_uapi::auth::UserAndOrGroupId;
30use starnix_uapi::device_id::DeviceId;
31use starnix_uapi::errors::Errno;
32use starnix_uapi::file_mode::{AccessCheck, FileMode};
33use starnix_uapi::inotify_mask::InotifyMask;
34use starnix_uapi::mount_flags::{
35    AtomicMountpointFlags, FileSystemFlags, MountFlags, MountpointFlags,
36};
37use starnix_uapi::open_flags::OpenFlags;
38use starnix_uapi::unmount_flags::UnmountFlags;
39use starnix_uapi::vfs::{FdEvents, ResolveFlags};
40use starnix_uapi::{NAME_MAX, errno, error};
41use std::borrow::Borrow;
42use std::collections::HashSet;
43use std::fmt;
44use std::hash::{Hash, Hasher};
45use std::ops::{Deref, DerefMut};
46use std::sync::atomic::Ordering;
47use std::sync::{Arc, Weak};
48
/// A mount namespace.
///
/// The namespace records at which entries filesystems are mounted.
#[derive(Debug)]
pub struct Namespace {
    /// The mount at the top of this namespace's mount tree. `Namespace::root()` returns the
    /// root node of this mount.
    root_mount: MountHandle,

    /// Unique ID of this namespace, obtained from `Kernel::get_next_namespace_id()` when the
    /// namespace is created or cloned.
    pub id: u64,
}
59
60impl Namespace {
61    pub fn new(fs: FileSystemHandle) -> Arc<Namespace> {
62        Self::new_with_flags(fs, MountpointFlags::empty())
63    }
64
65    pub fn new_with_flags(fs: FileSystemHandle, flags: MountpointFlags) -> Arc<Namespace> {
66        let kernel = fs.kernel.upgrade().expect("can't create namespace without a kernel");
67        let root_mount = Mount::new(WhatToMount::Fs(fs), flags);
68        Arc::new(Self { root_mount, id: kernel.get_next_namespace_id() })
69    }
70
71    pub fn root(&self) -> NamespaceNode {
72        self.root_mount.root()
73    }
74
75    pub fn clone_namespace(&self) -> Arc<Namespace> {
76        let kernel =
77            self.root_mount.fs.kernel.upgrade().expect("can't clone namespace without a kernel");
78        Arc::new(Self {
79            root_mount: self.root_mount.clone_mount_recursive(),
80            id: kernel.get_next_namespace_id(),
81        })
82    }
83
84    /// Assuming new_ns is a clone of the namespace that node is from, return the equivalent of
85    /// node in new_ns. If this assumption is violated, returns None.
86    pub fn translate_node(mut node: NamespaceNode, new_ns: &Namespace) -> Option<NamespaceNode> {
87        // Collect the list of mountpoints that leads to this node's mount
88        let mut mountpoints = vec![];
89        let mut mount = node.mount;
90        while let Some(mountpoint) = mount.as_ref().and_then(|m| m.read().mountpoint()) {
91            mountpoints.push(mountpoint.entry);
92            mount = mountpoint.mount;
93        }
94
95        // Follow the same path in the new namespace
96        let mut mount = Arc::clone(&new_ns.root_mount);
97        for mountpoint in mountpoints.iter().rev() {
98            let next_mount =
99                mount.read().submounts.get(ArcKey::ref_cast(mountpoint))?.mount.clone();
100            mount = next_mount;
101        }
102        node.mount = Some(mount).into();
103        Some(node)
104    }
105}
106
107impl FsNodeOps for Arc<Namespace> {
108    fs_node_impl_not_dir!();
109
110    fn create_file_ops(
111        &self,
112        _locked: &mut Locked<FileOpsCore>,
113        _node: &FsNode,
114        _current_task: &CurrentTask,
115        _flags: OpenFlags,
116    ) -> Result<Box<dyn FileOps>, Errno> {
117        Ok(Box::new(MountNamespaceFile(self.clone())))
118    }
119}
120
/// The `FileOps` object produced when a mount-namespace node is opened. It keeps the
/// `Namespace` it was opened from alive.
pub struct MountNamespaceFile(pub Arc<Namespace>);
122
// The whole `FileOps` surface comes from the helper macros below.
// NOTE(review): judging by their names, the file is non-seekable, carries no data and has a
// no-op sync — confirm against the macro definitions.
impl FileOps for MountNamespaceFile {
    fileops_impl_nonseekable!();
    fileops_impl_dataless!();
    fileops_impl_noop_sync!();
}
128
/// An empty reference-counted marker that we use to track the number of active clients for a
/// mount.
///
/// Each active client takes a reference to this object, so the strong count (minus the mount's
/// own reference) is the number of clients. The unmount operation fails if there are any active
/// clients of the mount.
type MountClientMarker = Arc<()>;
134
/// An instance of a filesystem mounted in a namespace.
///
/// At a mount, path traversal switches from one filesystem to another.
/// The client sees a composed directory structure that glues together the
/// directories from the underlying FsNodes from those filesystems.
///
/// The mounts in a namespace form a mount tree, with `mountpoint` pointing to the parent and
/// `submounts` pointing to the children.
pub struct Mount {
    /// The directory entry that serves as the root of this mount. For a bind mount this can be
    /// any descendant of the source mount's root.
    root: DirEntryHandle,
    /// The filesystem that `root` belongs to.
    fs: FileSystemHandle,

    /// Holds the flags specific to this mount of the underlying filesystem.
    flags: AtomicMountpointFlags,

    /// Lock used to serialize updates of `flags` to ensure consistency during remount operations.
    flags_lock: Mutex<()>,

    /// A unique identifier for this mount reported in /proc/pid/mountinfo.
    id: u64,

    /// A count of the number of active clients. The mount itself holds one reference, which
    /// `active_clients()` subtracts.
    active_client_counter: MountClientMarker,

    // Lock ordering: mount -> submount
    state: RwLock<MountState>,
    // Mount used to contain a Weak<Namespace>. It no longer does: since the mount point hash was
    // moved from Namespace to Mount, nothing actually uses it. Now that
    // Namespace::clone_namespace() is implemented in terms of Mount::clone_mount_recursive, it
    // won't be trivial to add it back. If you end up needing to find a Mount's Namespace, I
    // recommend turning the mountpoint field into an enum of Mountpoint or Namespace, maybe called
    // "parent", and then you can traverse up to the top of the tree.
}
/// Shared, reference-counted handle to a `Mount`.
type MountHandle = Arc<Mount>;
169
/// Public representation of the mount options.
///
/// Wraps an optional `MountHandle`; two `MountInfo`s compare equal when they refer to the same
/// `Mount` object (or both refer to none).
#[derive(Clone, Debug)]
pub struct MountInfo {
    /// The mount this info refers to, or `None` for an element that is not tied to any mount.
    handle: Option<MountHandle>,
}
175
176impl MountInfo {
177    /// `MountInfo` for a element that is not tied to a given mount. Mount flags will be considered
178    /// empty.
179    pub fn detached() -> Self {
180        None.into()
181    }
182
183    /// The mount flags of the represented mount.
184    pub fn flags(&self) -> MountFlags {
185        if let Some(handle) = &self.handle {
186            handle.flags()
187        } else {
188            // Consider not mounted node have the NOATIME flags.
189            MountFlags::NOATIME
190        }
191    }
192
193    /// Checks whether this `MountInfo` represents a writable file system mount.
194    pub fn check_readonly_filesystem(&self) -> Result<(), Errno> {
195        if self.flags().contains(MountFlags::RDONLY) {
196            return error!(EROFS);
197        }
198        Ok(())
199    }
200
201    /// Checks whether this `MountInfo` represents an executable file system mount.
202    pub fn check_noexec_filesystem(&self) -> Result<(), Errno> {
203        if self.flags().contains(MountFlags::NOEXEC) {
204            return error!(EACCES);
205        }
206        Ok(())
207    }
208}
209
210impl Deref for MountInfo {
211    type Target = Option<MountHandle>;
212
213    fn deref(&self) -> &Self::Target {
214        &self.handle
215    }
216}
217
218impl DerefMut for MountInfo {
219    fn deref_mut(&mut self) -> &mut Self::Target {
220        &mut self.handle
221    }
222}
223
224impl std::cmp::PartialEq for MountInfo {
225    fn eq(&self, other: &Self) -> bool {
226        self.handle.as_ref().map(Arc::as_ptr) == other.handle.as_ref().map(Arc::as_ptr)
227    }
228}
229
230impl std::cmp::Eq for MountInfo {}
231
232impl Into<MountInfo> for Option<MountHandle> {
233    fn into(self) -> MountInfo {
234        MountInfo { handle: self }
235    }
236}
237
/// The mutable part of a `Mount`, stored behind `Mount::state`: its position in the mount tree
/// and its propagation (peer group / upstream) memberships.
#[derive(Default)]
pub struct MountState {
    /// The namespace node that this mount is mounted on. This is a tuple instead of a
    /// NamespaceNode because the Mount pointer has to be weak: it points to the parent mount,
    /// the parent has a pointer to the children too, and making both strong would be a cycle.
    mountpoint: Option<(Weak<Mount>, DirEntryHandle)>,

    // The set is keyed by the mountpoints which are always descendants of this mount's root.
    // Conceptually, the set is more akin to a map: `DirEntry -> MountHandle`, but we use a set
    // instead because `Submount` has a drop implementation that needs both the key and value.
    //
    // Each directory entry can only have one mount attached. Mount shadowing works by using the
    // root of the inner mount as a mountpoint. For example, if filesystem A is mounted at /foo,
    // mounting filesystem B on /foo will create the mount as a child of the A mount, attached to
    // A's root, instead of the root mount.
    submounts: HashSet<Submount>,

    /// The membership of this mount in its peer group. Do not access directly. Instead use
    /// peer_group(), take_from_peer_group(), and set_peer_group().
    // TODO(tbodt): Refactor the links into some kind of extra struct or something? This is hard
    // because setting this field requires the Arc<Mount>.
    peer_group_: Option<(Arc<PeerGroup>, PtrKey<Mount>)>,
    /// The membership of this mount in a PeerGroup's downstream. Do not access directly. Instead
    /// use upstream(), take_from_upstream(), and set_upstream().
    upstream_: Option<(Weak<PeerGroup>, PtrKey<Mount>)>,
}
265
/// A group of mounts. Setting MS_SHARED on a mount puts it in its own peer group. Any bind mounts
/// of a mount in the group are also added to the group. A mount created in any mount in a peer
/// group will be automatically propagated (recreated) in every other mount in the group.
#[derive(Default)]
struct PeerGroup {
    /// Identifier of the group, supplied by the caller of `PeerGroup::new` (in this file, from
    /// `Kernel::get_next_peer_group_id()`).
    id: u64,
    /// Membership sets of the group. This lock nests inside the mount locks (see the comment in
    /// `Mount::create_submount`).
    state: RwLock<PeerGroupState>,
}
/// The lock-protected contents of a `PeerGroup`. Entries are weak, so a dead entry can be
/// observed until the corresponding `Mount` has removed itself.
#[derive(Default)]
struct PeerGroupState {
    /// The mounts that are members (peers) of the group.
    mounts: HashSet<WeakKey<Mount>>,
    /// The mounts that have this group as their upstream, i.e. that receive propagation from
    /// the group without being members of it.
    downstream: HashSet<WeakKey<Mount>>,
}
279
/// The source of a new mount, consumed by `Mount::new`.
pub enum WhatToMount {
    /// Mount the root of the given filesystem.
    Fs(FileSystemHandle),
    /// Bind mount: clone the mount that the given node belongs to, rooted at the node's entry.
    Bind(NamespaceNode),
}
284
/// The mount that `Mount::create_submount` should attach.
enum WhatSubmount {
    /// Create a fresh mount with `Mount::new` from the given source and flags.
    New(WhatToMount, MountpointFlags),
    /// Attach an already existing mount (used when moving a mount).
    Existing(MountHandle),
}
289
290impl Mount {
291    pub fn new(what: WhatToMount, mut flags: MountpointFlags) -> MountHandle {
292        match what {
293            WhatToMount::Fs(fs) => {
294                // If `flags` does not explicitly specify an access-time flag then default to `RELATIME`.
295                flags.default_atime_from(MountpointFlags::RELATIME);
296                Self::new_with_root(fs.root().clone(), flags)
297            }
298            WhatToMount::Bind(node) => {
299                let mount = node.mount.as_ref().expect("can't bind mount from an anonymous node");
300                mount.clone_mount(&node.entry, flags.into())
301            }
302        }
303    }
304
305    fn new_with_root(root: DirEntryHandle, flags: MountpointFlags) -> MountHandle {
306        let fs = root.node.fs();
307        let kernel = fs.kernel.upgrade().expect("can't create mount without kernel");
308        Arc::new(Self {
309            id: kernel.get_next_mount_id(),
310            flags: (flags & MountpointFlags::STORED_ON_MOUNT).into(),
311            flags_lock: Mutex::new(()),
312            root,
313            active_client_counter: Default::default(),
314            fs,
315            state: Default::default(),
316        })
317    }
318
319    /// A namespace node referring to the root of the mount.
320    pub fn root(self: &MountHandle) -> NamespaceNode {
321        NamespaceNode::new(Arc::clone(self), Arc::clone(&self.root))
322    }
323
    /// Create the specified mount as a child. Also propagate it to the mount's peer group.
    ///
    /// `dir` is the directory entry to mount on and `what` is either an existing mount or the
    /// recipe for a new one. Every propagation target of this mount's peer group (other than
    /// this mount itself) receives its own recursive clone of the new mount at `dir`.
    fn create_submount(self: &MountHandle, dir: &DirEntryHandle, what: WhatSubmount) {
        // TODO(b/482453480): Making a copy here is necessary for lock ordering, because the peer
        // group lock nests inside all mount locks (it would be impractical to reverse this because
        // you need to lock a mount to get its peer group.) But it opens the door to race conditions
        // where, if a peer is concurrently being added, the mount might not get propagated to the
        // new peer. The only true solution to this is bigger locks, somehow using the same lock for
        // the peer group and all of the mounts in the group. Since peer groups are fluid and can
        // have mounts constantly joining and leaving and then joining other groups, the only
        // sensible locking option is to use a single global lock for all mounts and peer groups.
        // This is almost impossible to express in rust. Help.
        //
        // Update: Also necessary to make a copy to prevent excess replication, see the comment on
        // the following Mount::new call.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        // Create the mount after copying the peer list, because in the case of creating a bind
        // mount inside itself, the new mount would get added to our peer group during the
        // Mount::new call, but we don't want to replicate into it already. For an example see
        // MountTest.QuizBRecursion.
        let mount = match what {
            WhatSubmount::Existing(mount) => mount,
            WhatSubmount::New(what, flags) => Mount::new(what, flags),
        };

        // A mount created under a shared mount is made shared as well (a no-op if it already is).
        if self.read().is_shared() {
            mount.write().make_shared();
        }

        // Give every other propagation target its own copy of the new mount.
        for peer in peers {
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            let clone = mount.clone_mount_recursive();
            peer.write().add_submount_internal(dir, clone);
        }

        // Finally attach the original to this mount.
        self.write().add_submount_internal(dir, mount)
    }
366
    /// Remove the child mounted at `mount_hash_key`, and propagate the removal to this mount's
    /// peer group.
    ///
    /// Propagation is best-effort: a propagation target is skipped when its mount at that key has
    /// submounts of its own, and a failure to remove from a target is ignored.
    ///
    /// # Errors
    ///
    /// Returns `EINVAL` if this mount itself has no submount at `mount_hash_key`.
    fn remove_submount(self: &MountHandle, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
        // create_submount explains why we need to make a copy of peers.
        let peers = {
            let state = self.state.read();
            state.peer_group().map(|g| g.copy_propagation_targets()).unwrap_or_default()
        };

        for peer in peers {
            if Arc::ptr_eq(self, &peer) {
                continue;
            }
            // mount_namespaces(7): If B is shared, then all most-recently-mounted mounts at b on
            // mounts that receive propagation from mount B and do not have submounts under them are
            // unmounted.
            let mut peer = peer.write();
            if let Some(submount) = peer.submounts.get(mount_hash_key) {
                if !submount.mount.read().submounts.is_empty() {
                    continue;
                }
            }
            // The peer may simply not have a mount here; that is not an error for the caller.
            let _ = peer.remove_submount_internal(mount_hash_key);
        }

        self.write().remove_submount_internal(mount_hash_key)
    }
392
    /// Detach `source_mount` from its current parent and attach it at `target_dir` inside
    /// `target_mount`. The two steps are not performed atomically (see the TODO below).
    ///
    /// # Errors
    ///
    /// * `EIO` if `source_mount` has no mountpoint.
    /// * `EINVAL` if the parent of `source_mount` is in a peer group (shared), or if the parent
    ///   no longer has `source_mount` registered at its mountpoint.
    ///
    /// # Panics
    ///
    /// Panics if the mountpoint of `source_mount` is not part of a mount.
    pub fn move_mount(
        source_mount: &MountHandle,
        target_mount: &MountHandle,
        target_dir: &DirEntryHandle,
    ) -> Result<(), Errno> {
        // TODO(b/482453480): Moving a mount is supposed to be atomic, but this isn't. Trying to
        // think of a way to ensure full atomicity in the current locking model led to a train of
        // thought of spiraling complexity (you need to lock source_parent before source_mount, but
        // you need to lock source_mount in order to get a reference to source_parent, and someone
        // could move the mount again in between these operations, so you need to retry this.) So
        // I'm settling for not trying for atomicity, plus a TODO comment.
        let source_mountpoint = source_mount.read().mountpoint().ok_or_else(|| errno!(EIO))?;
        let source_parent =
            source_mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");

        // First, disconnect the mount from its parent.
        {
            let mut source_parent = source_parent.write();
            if source_parent.peer_group().is_some() {
                // Sayeth mount(2):
                // EINVAL A move operation (MS_MOVE) was attempted, but the parent mount of source
                //        mount has propagation type MS_SHARED.
                return error!(EINVAL);
            }
            let mut source_mount = source_mount.write();
            source_parent.remove_submount_internal(source_mountpoint.mount_hash_key())?;
            // Clear the back-pointer so the mount can be attached again below.
            source_mount.mountpoint = None;
        }

        // Then attach it at the destination, with the usual propagation.
        target_mount.create_submount(target_dir, WhatSubmount::Existing(Arc::clone(source_mount)));
        Ok(())
    }
425
    /// Create a new mount with the same filesystem, flags, and peer group. Used to implement bind
    /// mounts.
    ///
    /// `new_root` becomes the root of the clone and must be a descendant of this mount's root.
    /// Of `flags`, only `MountFlags::REC` is consulted: when set, this mount's submounts are
    /// cloned recursively and attached to the clone as well.
    ///
    /// # Panics
    ///
    /// Panics if `new_root` is not a descendant of this mount's root.
    fn clone_mount(
        self: &MountHandle,
        new_root: &DirEntryHandle,
        flags: MountFlags,
    ) -> MountHandle {
        assert!(new_root.is_descendant_of(&self.root));
        // According to mount(2) on bind mounts, all flags other than MS_REC are ignored when doing
        // a bind mount.
        let clone = Self::new_with_root(Arc::clone(new_root), self.mount_flags());

        if flags.contains(MountFlags::REC) {
            // This is two steps because the alternative (locking clone.state while iterating over
            // self.state.submounts) trips tracing_mutex. The lock ordering is parent -> child, and
            // if the clone is eventually made a child of self, this looks like an ordering
            // violation. I'm not convinced it's a real issue, but I can't convince myself it's not
            // either.
            let mut submounts = vec![];
            for Submount { dir, mount } in &self.state.read().submounts {
                submounts.push((dir.clone(), mount.clone_mount_recursive()));
            }
            // add_submount_internal silently skips mountpoints that are not below the clone's
            // (possibly narrower) root.
            let mut clone_state = clone.write();
            for (dir, submount) in submounts {
                clone_state.add_submount_internal(&dir, submount);
            }
        }

        // Put the clone in the same peer group
        let peer_group = self.state.read().peer_group().map(Arc::clone);
        if let Some(peer_group) = peer_group {
            clone.write().set_peer_group(peer_group);
        }

        clone
    }
462
463    /// Do a clone of the full mount hierarchy below this mount. Used for creating mount
464    /// namespaces and creating copies to use for propagation.
465    fn clone_mount_recursive(self: &MountHandle) -> MountHandle {
466        self.clone_mount(&self.root, MountFlags::REC)
467    }
468
469    pub fn change_propagation(self: &MountHandle, flag: MountFlags, recursive: bool) {
470        let mut state = self.write();
471        match flag {
472            MountFlags::SHARED => state.make_shared(),
473            MountFlags::PRIVATE => state.make_private(),
474            MountFlags::DOWNSTREAM => state.make_downstream(),
475            _ => {
476                log_warn!("mount propagation {:?}", flag);
477                return;
478            }
479        }
480
481        if recursive {
482            for submount in &state.submounts {
483                submount.mount.change_propagation(flag, recursive);
484            }
485        }
486    }
487
488    /// Returns the effective flags for the `Mount`, calculated as the union of the mount flags
489    /// associated with the `FileSystem`, and with the `Mount` itself.
490    fn flags(&self) -> MountFlags {
491        MountFlags::from(self.mount_flags()) | self.fs_flags().into()
492    }
493
    /// Returns the mount flags stored on this `Mount` itself (as opposed to on its `FileSystem`).
    ///
    /// This is a relaxed atomic load; writers serialize through `flags_lock`.
    fn mount_flags(&self) -> MountpointFlags {
        self.flags.load(Ordering::Relaxed)
    }

    /// Returns the mount flags for the `FileSystem` of this `Mount`, read from the filesystem's
    /// options with a relaxed atomic load.
    fn fs_flags(&self) -> FileSystemFlags {
        self.fs.options.flags.load(Ordering::Relaxed)
    }
503
504    /// Updates the `Mount` with the per-mount flags specified in `flags`, while preserving the
505    /// existing access-time flag if no access-time flag is set in `flags`.
506    pub fn update_flags(self: &MountHandle, mut flags: MountpointFlags) {
507        let _lock = self.flags_lock.lock();
508        // Since Linux 3.17, if none of MS_NOATIME, MS_NODIRATIME,
509        // MS_RELATIME, or MS_STRICTATIME is specified in mountflags, then
510        // the remount operation preserves the existing values of these
511        // flags (rather than defaulting to MS_RELATIME).
512        flags.default_atime_from(self.flags.load(Ordering::Relaxed));
513        flags &= MountpointFlags::STORED_ON_MOUNT;
514        self.flags.store(flags, Ordering::Relaxed);
515    }
516
517    /// The number of active clients of this mount.
518    ///
519    /// The mount cannot be unmounted if there are any active clients.
520    fn active_clients(&self) -> usize {
521        // We need to subtract one for our own reference. We are not a real client.
522        Arc::strong_count(&self.active_client_counter) - 1
523    }
524
525    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
526        if !flags.contains(UnmountFlags::DETACH) {
527            if self.active_clients() > 0 || !self.state.read().submounts.is_empty() {
528                return error!(EBUSY);
529            }
530        }
531        let mountpoint = self.state.read().mountpoint().ok_or_else(|| errno!(EINVAL))?;
532        let parent_mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
533        parent_mount.remove_submount(mountpoint.mount_hash_key())
534    }
535
    /// Returns the security state of the filesystem backing this mount.
    pub fn security_state(&self) -> &security::FileSystemState {
        &self.fs.security_state
    }

    /// Returns the name of the filesystem backing this mount, as reported by the filesystem.
    pub fn fs_name(&self) -> &'static FsStr {
        self.fs.name()
    }

    /// Reconfigures the flags for the `FileSystem` backing this mount point.
    ///
    /// This delegates to the filesystem's `update_flags`; the per-mount flags stored on this
    /// `Mount` are not touched. Any error from the filesystem is returned unchanged.
    pub fn reconfigure_fs(
        &self,
        current_task: &CurrentTask,
        flags: FileSystemFlags,
    ) -> Result<(), Errno> {
        self.fs.update_flags(current_task, flags)
    }
554
    // NOTE(review): presumably generates the `read()` / `write()` lock accessors for `state`
    // that the rest of this file calls on `MountHandle` — confirm against `state_accessor!`.
    state_accessor!(Mount, state, Arc<Mount>);
556}
557
558impl MountState {
559    /// Returns true if there is a submount on top of `dir_entry`.
560    pub fn has_submount(&self, dir_entry: &DirEntryHandle) -> bool {
561        self.submounts.contains(ArcKey::ref_cast(dir_entry))
562    }
563
564    /// The NamespaceNode on which this Mount is mounted.
565    fn mountpoint(&self) -> Option<NamespaceNode> {
566        let (mount, entry) = self.mountpoint.as_ref()?;
567        Some(NamespaceNode::new(mount.upgrade()?, entry.clone()))
568    }
569
570    /// Return this mount's current peer group.
571    fn peer_group(&self) -> Option<&Arc<PeerGroup>> {
572        let (group, _) = self.peer_group_.as_ref()?;
573        Some(group)
574    }
575
576    /// Remove this mount from its peer group and return the peer group.
577    fn take_from_peer_group(&mut self) -> Option<Arc<PeerGroup>> {
578        let (old_group, old_mount) = self.peer_group_.take()?;
579        old_group.remove(old_mount);
580        if let Some(upstream) = self.take_from_upstream() {
581            let next_mount =
582                old_group.state.read().mounts.iter().next().map(|w| w.0.upgrade().unwrap());
583            if let Some(next_mount) = next_mount {
584                // TODO(https://fxbug.dev/42065259): Fix the lock ordering here. We've locked next_mount
585                // while self is locked, and since the propagation tree and mount tree are
586                // separate, this could violate the mount -> submount order previously established.
587                next_mount.write().set_upstream(upstream);
588            }
589        }
590        Some(old_group)
591    }
592
593    fn upstream(&self) -> Option<Arc<PeerGroup>> {
594        self.upstream_.as_ref().and_then(|g| g.0.upgrade())
595    }
596
597    fn take_from_upstream(&mut self) -> Option<Arc<PeerGroup>> {
598        let (old_upstream, old_mount) = self.upstream_.take()?;
599        // TODO(tbodt): Reason about whether the upgrade() could possibly return None, and what we
600        // should actually do in that case.
601        let old_upstream = old_upstream.upgrade()?;
602        old_upstream.remove_downstream(old_mount);
603        Some(old_upstream)
604    }
605}
606
607#[apply(state_implementation!)]
608impl MountState<Base = Mount, BaseType = Arc<Mount>> {
    /// Add a child mount *without propagating it to the peer group*. For internal use only.
    ///
    /// Does nothing if `dir` is not a descendant of this mount's root. If something is already
    /// mounted on `dir`, the new mount takes its place and the old mount is re-attached on the
    /// new mount's root.
    ///
    /// # Panics
    ///
    /// Panics if `mount` already has a mountpoint, or if the kernel owning `mount`'s filesystem
    /// is gone.
    fn add_submount_internal(&mut self, dir: &DirEntryHandle, mount: MountHandle) {
        // Propagation can ask us to mount on an entry that is outside this mount; skip it.
        if !dir.is_descendant_of(&self.base.root) {
            return;
        }

        // Register the (mountpoint, mount) pair with the kernel-wide mount registry; the returned
        // `Submount` is what gets stored in `submounts`.
        let submount = mount.fs.kernel.upgrade().unwrap().mounts.register_mount(dir, mount.clone());
        let old_mountpoint =
            mount.state.write().mountpoint.replace((Arc::downgrade(self.base), Arc::clone(dir)));
        assert!(old_mountpoint.is_none(), "add_submount can only take a newly created mount");
        // Mount shadowing is implemented by mounting onto the root of the first mount, not by
        // creating two mounts on the same mountpoint.
        let old_mount = self.submounts.replace(submount);

        // In rare cases, mount propagation might result in a request to mount on a directory where
        // something is already mounted. MountTest.LotsOfShadowing will trigger this. Linux handles
        // this by inserting the new mount between the old mount and the current mount.
        if let Some(mut old_mount) = old_mount {
            // Previous state: self[dir] = old_mount
            // New state: self[dir] = new_mount, new_mount[new_mount.root] = old_mount
            // The new mount has already been inserted into self, now just update the old mount to
            // be a child of the new mount.
            old_mount.mount.write().mountpoint = Some((Arc::downgrade(&mount), Arc::clone(dir)));
            old_mount.dir = ArcKey(mount.root.clone());
            mount.write().submounts.insert(old_mount);
        }
    }
636
637    fn remove_submount_internal(&mut self, mount_hash_key: &ArcKey<DirEntry>) -> Result<(), Errno> {
638        if self.submounts.remove(mount_hash_key) { Ok(()) } else { error!(EINVAL) }
639    }
640
641    /// Set this mount's peer group.
642    fn set_peer_group(&mut self, group: Arc<PeerGroup>) {
643        self.take_from_peer_group();
644        group.add(self.base);
645        self.peer_group_ = Some((group, Arc::as_ptr(self.base).into()));
646    }
647
648    fn set_upstream(&mut self, group: Arc<PeerGroup>) {
649        self.take_from_upstream();
650        group.add_downstream(self.base);
651        self.upstream_ = Some((Arc::downgrade(&group), Arc::as_ptr(self.base).into()));
652    }
653
654    /// Is the mount in a peer group? Corresponds to MS_SHARED.
655    pub fn is_shared(&self) -> bool {
656        self.peer_group().is_some()
657    }
658
659    /// Put the mount in a peer group. Implements MS_SHARED.
660    pub fn make_shared(&mut self) {
661        if self.is_shared() {
662            return;
663        }
664        let kernel =
665            self.base.fs.kernel.upgrade().expect("can't create new peer group without kernel");
666        self.set_peer_group(PeerGroup::new(kernel.get_next_peer_group_id()));
667    }
668
    /// Take the mount out of its peer group, also remove upstream if any. Implements MS_PRIVATE.
    pub fn make_private(&mut self) {
        // Leaving the peer group only clears the upstream link when the mount was in a group, so
        // the upstream is cleared explicitly afterwards to cover a mount that is downstream-only.
        self.take_from_peer_group();
        self.take_from_upstream();
    }
674
675    /// Take the mount out of its peer group and make it downstream instead. Implements
676    /// MountFlags::DOWNSTREAM (MS_SLAVE).
677    pub fn make_downstream(&mut self) {
678        if let Some(peer_group) = self.take_from_peer_group() {
679            self.set_upstream(peer_group);
680        }
681    }
682}
683
684impl PeerGroup {
685    fn new(id: u64) -> Arc<Self> {
686        Arc::new(Self { id, state: Default::default() })
687    }
688
689    fn add(&self, mount: &Arc<Mount>) {
690        self.state.write().mounts.insert(WeakKey::from(mount));
691    }
692
693    fn remove(&self, mount: PtrKey<Mount>) {
694        self.state.write().mounts.remove(&mount);
695    }
696
697    fn add_downstream(&self, mount: &Arc<Mount>) {
698        self.state.write().downstream.insert(WeakKey::from(mount));
699    }
700
701    fn remove_downstream(&self, mount: PtrKey<Mount>) {
702        self.state.write().downstream.remove(&mount);
703    }
704
705    fn copy_propagation_targets(&self) -> Vec<MountHandle> {
706        let mut buf = vec![];
707        self.collect_propagation_targets(&mut buf);
708        buf
709    }
710
711    fn collect_propagation_targets(&self, buf: &mut Vec<MountHandle>) {
712        let downstream_mounts: Vec<_> = {
713            let state = self.state.read();
714            buf.extend(state.mounts.iter().filter_map(|m| m.0.upgrade()));
715            state.downstream.iter().filter_map(|m| m.0.upgrade()).collect()
716        };
717        for mount in downstream_mounts {
718            let peer_group = mount.read().peer_group().map(Arc::clone);
719            match peer_group {
720                Some(group) => group.collect_propagation_targets(buf),
721                None => buf.push(mount),
722            }
723        }
724    }
725}
726
727impl Drop for Mount {
728    fn drop(&mut self) {
729        let state = self.state.get_mut();
730        state.take_from_peer_group();
731        state.take_from_upstream();
732    }
733}
734
735impl fmt::Debug for Mount {
736    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
737        let state = self.state.read();
738        f.debug_struct("Mount")
739            .field("id", &(self as *const Mount))
740            .field("root", &self.root)
741            .field("mountpoint", &state.mountpoint)
742            .field("submounts", &state.submounts)
743            .finish()
744    }
745}
746
/// ID allocation for the mount subsystem. Each method returns the next value from the
/// corresponding kernel-wide counter.
impl Kernel {
    /// Returns the next mount ID (stored in `Mount::id`).
    pub fn get_next_mount_id(&self) -> u64 {
        self.next_mount_id.next()
    }

    /// Returns the next peer group ID (stored in `PeerGroup::id`).
    pub fn get_next_peer_group_id(&self) -> u64 {
        self.next_peer_group_id.next()
    }

    /// Returns the next mount namespace ID (stored in `Namespace::id`).
    pub fn get_next_namespace_id(&self) -> u64 {
        self.next_namespace_id.next()
    }
}
760
761impl CurrentTask {
762    pub fn create_filesystem(
763        &self,
764        locked: &mut Locked<Unlocked>,
765        fs_type: &FsStr,
766        options: FileSystemOptions,
767    ) -> Result<FileSystemHandle, Errno> {
768        // Please register new file systems via //src/starnix/modules/lib.rs, even if the file
769        // system is implemented inside starnix_core.
770        //
771        // Most file systems should be implemented as modules. The VFS provides various traits that
772        // let starnix_core integrate file systems without needing to depend on the file systems
773        // directly.
774        self.kernel()
775            .expando
776            .get::<FsRegistry>()
777            .create(locked, self, fs_type, options)
778            .ok_or_else(|| errno!(ENODEV, fs_type))?
779    }
780}
781
/// Content source wrapped by `ProcMountsFile`; holds a weak reference to the task whose mounts
/// are listed.
struct ProcMountsFileSource(Weak<Task>);
783
impl DynamicFileSource for ProcMountsFileSource {
    /// Writes one line per mount into `sink`, walking the task's namespace from its root mount.
    ///
    /// Each line has the form `<source> <mountpoint> <fs name> <flags><options> 0 0`. Mounts
    /// whose mountpoint is not a descendant of the task's root are omitted.
    ///
    /// Fails if the weak task reference cannot be resolved, if `task.live()` fails, or if
    /// `security::sb_show_options` or a write to `sink` returns an error.
    fn generate(
        &self,
        _current_task: &CurrentTask,
        sink: &mut DynamicFileBuf,
    ) -> Result<(), Errno> {
        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
        // extra work to maintain it.
        let task = Task::from_weak(&self.0)?;
        let task_fs = task.live()?.fs.read();
        let root = task_fs.root();
        let ns = task_fs.namespace();
        for_each_mount(&ns.root_mount, &mut |mount| {
            // A mount that has no mountpoint is reported at its own root.
            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
            // Skip mounts that are not beneath the task's root.
            if !mountpoint.is_descendant_of(&root) {
                return Ok(());
            }
            write!(
                sink,
                "{} {} {} {}{}",
                mount.fs.options.source_for_display(),
                mountpoint.path(&task_fs),
                mount.fs.name(),
                // Report the union of the FileSystem and Mount flags, as well as any FileSystem-
                // or LSM-specific options.
                mount.flags(),
                security::sb_show_options(&task.kernel(), &mount.fs)?,
            )?;
            // The two trailing fields are always written as zero.
            writeln!(sink, " 0 0")?;
            Ok(())
        })?;
        Ok(())
    }
}
820
/// File whose contents are generated by `ProcMountsFileSource` through a `DynamicFile`.
pub struct ProcMountsFile {
    /// The dynamic file that read, write and seek operations are delegated to.
    dynamic_file: DynamicFile<ProcMountsFileSource>,
}
824
825impl ProcMountsFile {
826    pub fn new_node(task: Weak<Task>) -> impl FsNodeOps {
827        SimpleFileNode::new(move |_, _| {
828            Ok(Self { dynamic_file: DynamicFile::new(ProcMountsFileSource(task.clone())) })
829        })
830    }
831}
832
impl FileOps for ProcMountsFile {
    // Read, write and seek are delegated to `self.dynamic_file`; sync is a no-op (per the
    // macro names).
    fileops_impl_delegate_read_write_and_seek!(self, self.dynamic_file);
    fileops_impl_noop_sync!();

    /// Registers a placeholder wait on `waiter`; the requested events and handler are ignored.
    fn wait_async(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        waiter: &Waiter,
        _events: FdEvents,
        _handler: EventHandler,
    ) -> Option<WaitCanceler> {
        // Polling this file gives notifications when any change to mounts occurs. This is not
        // implemented yet, but stubbed for Android init.
        Some(waiter.fake_wait())
    }

    /// Always reports an empty event set, since mount-change notification is not implemented.
    fn query_events(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
    ) -> Result<FdEvents, Errno> {
        Ok(FdEvents::empty())
    }
}
860
/// Dynamic file source producing mountinfo-style output for the task referenced by the
/// contained weak handle.
#[derive(Clone)]
pub struct ProcMountinfoFile(Weak<Task>);
impl ProcMountinfoFile {
    /// Returns node ops for a dynamic file backed by a `ProcMountinfoFile` for `task`.
    pub fn new_node(task: Weak<Task>) -> impl FsNodeOps {
        DynamicFile::new_node(Self(task))
    }
}
impl DynamicFileSource for ProcMountinfoFile {
    /// Writes one line per mount into `sink`, walking the task's namespace from its root mount.
    ///
    /// Each line has the form
    /// `<mount id> <parent id> <dev id> <root path within fs> <mountpoint> <mount flags>`,
    /// optionally followed by ` shared:<peer group id>` and ` master:<upstream id>`, and then
    /// ` - <fs name> <source> <fs flags><options>`. Mounts whose mountpoint is not a
    /// descendant of the task's root are omitted.
    ///
    /// Fails if the weak task reference cannot be resolved, if `task.live()` fails, or if
    /// `security::sb_show_options` or a write to `sink` returns an error.
    fn generate(
        &self,
        _current_task: &CurrentTask,
        sink: &mut DynamicFileBuf,
    ) -> Result<(), Errno> {
        // Returns path to the `dir` from the root of the file system.
        fn path_from_fs_root(dir: &DirEntryHandle) -> FsString {
            let mut path = PathBuilder::new();
            if dir.is_dead() {
                // Return `/foo/dir//deleted` if the dir was deleted.
                path.prepend_element("/deleted".into());
            }
            let scope = RcuReadScope::new();
            let mut current = dir.deref();
            // Walk towards the file system root, prepending each entry's name; the entry
            // without a parent (the root) contributes no element.
            while let Some(parent) = current.parent_ref(&scope) {
                path.prepend_element(current.local_name(&scope));
                current = parent;
            }
            path.build_absolute()
        }

        // TODO(tbodt): We should figure out a way to have a real iterator instead of grabbing the
        // entire list in one go. Should we have a BTreeMap<u64, Weak<Mount>> in the Namespace?
        // Also has the benefit of correct (i.e. chronological) ordering. But then we have to do
        // extra work to maintain it.
        let task = Task::from_weak(&self.0)?;
        let task_fs = task.live()?.fs.read();
        let root = task_fs.root();
        let ns = task_fs.namespace();
        for_each_mount(&ns.root_mount, &mut |mount| {
            // A mount that has no mountpoint is reported at its own root.
            let mountpoint = mount.read().mountpoint().unwrap_or_else(|| mount.root());
            // Skip mounts that are not beneath the task's root.
            if !mountpoint.is_descendant_of(&root) {
                return Ok(());
            }
            // Can't fail, mountpoint() and root() can't return a NamespaceNode with no mount
            let parent = mountpoint.mount.as_ref().unwrap();
            write!(
                sink,
                "{} {} {} {} {} {}",
                mount.id,
                parent.id,
                mount.root.node.fs().dev_id,
                path_from_fs_root(&mount.root),
                mountpoint.path(&task_fs),
                mount.mount_flags(),
            )?;
            // Optional propagation fields: the peer group and the upstream, when present.
            if let Some(peer_group) = mount.read().peer_group() {
                write!(sink, " shared:{}", peer_group.id)?;
            }
            if let Some(upstream) = mount.read().upstream() {
                write!(sink, " master:{}", upstream.id)?;
            }
            writeln!(
                sink,
                " - {} {} {}{}",
                mount.fs.name(),
                mount.fs.options.source_for_display(),
                mount.fs_flags(),
                // LSM options are associated with the FileSystem rather than the Mount.
                security::sb_show_options(&task.kernel(), &mount.fs)?
            )?;
            Ok(())
        })?;
        Ok(())
    }
}
935
936fn for_each_mount<E>(
937    mount: &MountHandle,
938    callback: &mut impl FnMut(&MountHandle) -> Result<(), E>,
939) -> Result<(), E> {
940    callback(mount)?;
941    // Collect list first to avoid self deadlock when ProcMountinfoFile::read_at tries to call
942    // NamespaceNode::path()
943    let submounts: Vec<_> = mount.read().submounts.iter().map(|s| s.mount.clone()).collect();
944    for submount in submounts {
945        for_each_mount(&submount, callback)?;
946    }
947    Ok(())
948}
949
/// The `SymlinkMode` enum encodes how symlinks are followed during path traversal.
///
/// The default is [`SymlinkMode::Follow`].
#[derive(Default, PartialEq, Eq, Copy, Clone, Debug)]
pub enum SymlinkMode {
    /// Follow a symlink at the end of a path resolution.
    #[default]
    Follow,

    /// Do not follow a symlink at the end of a path resolution.
    NoFollow,
}
960
/// The maximum number of symlink traversals that can be made during path resolution.
///
/// `LookupContext::new` uses this as the initial value of `remaining_follows`.
pub const MAX_SYMLINK_FOLLOWS: u8 = 40;
963
/// The context passed during namespace lookups.
///
/// Namespace lookups need to mutate a shared context in order to correctly
/// count the number of remaining symlink traversals.
pub struct LookupContext {
    /// The SymlinkMode for the lookup.
    ///
    /// Determines whether a symlink at the end of the path is followed.
    /// `update_for_path` forces this to `SymlinkMode::Follow` when the path ends with a `/`.
    pub symlink_mode: SymlinkMode,

    /// The number of symlinks remaining to follow.
    ///
    /// Each time path resolution calls readlink, this value is decremented. Once it reaches
    /// zero, resolving a further symlink fails with `ELOOP`.
    pub remaining_follows: u8,

    /// Whether the result of the lookup must be a directory.
    ///
    /// For example, if the path ends with a `/` or if userspace passes
    /// O_DIRECTORY. This flag can be set to true if the lookup encounters a
    /// symlink that ends with a `/`.
    pub must_be_directory: bool,

    /// Resolve flags passed to `openat2`. Empty if the lookup originated in any other syscall.
    pub resolve_flags: ResolveFlags,

    /// Base directory for the lookup. Set only when either `RESOLVE_BENEATH` or `RESOLVE_IN_ROOT`
    /// is passed to `openat2`.
    pub resolve_base: ResolveBase,
}
994
/// Used to specify the base directory in `LookupContext` for lookups originating in the
/// `openat2` syscall with either the `RESOLVE_BENEATH` or the `RESOLVE_IN_ROOT` flag.
#[derive(Clone, Eq, PartialEq)]
pub enum ResolveBase {
    /// No base directory applies; lookups use the task's file-system root.
    None,

    /// The lookup is not allowed to traverse any node that's not beneath the specified node.
    Beneath(NamespaceNode),

    /// The lookup should be handled as if the specified node is the file-system root.
    InRoot(NamespaceNode),
}
1007
1008impl LookupContext {
1009    pub fn new(symlink_mode: SymlinkMode) -> LookupContext {
1010        LookupContext {
1011            symlink_mode,
1012            remaining_follows: MAX_SYMLINK_FOLLOWS,
1013            must_be_directory: false,
1014            resolve_flags: ResolveFlags::empty(),
1015            resolve_base: ResolveBase::None,
1016        }
1017    }
1018
1019    pub fn with(&self, symlink_mode: SymlinkMode) -> LookupContext {
1020        LookupContext { symlink_mode, resolve_base: self.resolve_base.clone(), ..*self }
1021    }
1022
1023    pub fn update_for_path(&mut self, path: &FsStr) {
1024        if path.last() == Some(&b'/') {
1025            // The last path element must resolve to a directory. This is because a trailing slash
1026            // was found in the path.
1027            self.must_be_directory = true;
1028            // If the last path element is a symlink, we should follow it.
1029            // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
1030            self.symlink_mode = SymlinkMode::Follow;
1031        }
1032    }
1033}
1034
1035impl Default for LookupContext {
1036    fn default() -> Self {
1037        LookupContext::new(SymlinkMode::Follow)
1038    }
1039}
1040
/// A path together with whether it is reachable from the given root.
///
/// Both variants carry the path; `into_path` extracts it regardless of reachability.
pub enum PathWithReachability {
    /// The path is reachable from the given root.
    Reachable(FsString),

    /// The path is not reachable from the given root.
    Unreachable(FsString),
}
1049
1050impl PathWithReachability {
1051    pub fn into_path(self) -> FsString {
1052        match self {
1053            PathWithReachability::Reachable(path) => path,
1054            PathWithReachability::Unreachable(path) => path,
1055        }
1056    }
1057}
1058
/// A node in a mount namespace.
///
/// This tree is a composite of the mount tree and the FsNode tree.
///
/// These nodes are used when traversing paths in a namespace in order to
/// present the client the directory structure that includes the mounted
/// filesystems.
#[derive(Clone)]
pub struct NamespaceNode {
    /// The mount where this namespace node is mounted.
    ///
    /// A given FsNode can be mounted in multiple places in a namespace. This
    /// field distinguishes between them. It holds no mount for nodes created with
    /// `new_anonymous`.
    pub mount: MountInfo,

    /// The directory entry (and, through it, the FsNode) that corresponds to this namespace
    /// entry.
    pub entry: DirEntryHandle,
}
1077
1078impl NamespaceNode {
1079    pub fn new(mount: MountHandle, entry: DirEntryHandle) -> Self {
1080        Self { mount: Some(mount).into(), entry }
1081    }
1082
1083    /// Create a namespace node that is not mounted in a namespace.
1084    pub fn new_anonymous(entry: DirEntryHandle) -> Self {
1085        Self { mount: None.into(), entry }
1086    }
1087
1088    /// Create a namespace node that is not mounted in a namespace and that refers to a node that
1089    /// is not rooted in a hierarchy and has no name.
1090    pub fn new_anonymous_unrooted(current_task: &CurrentTask, node: FsNodeHandle) -> Self {
1091        let dir_entry = DirEntry::new_unrooted(node);
1092        let _ = security::fs_node_init_with_dentry_no_xattr(current_task, &dir_entry);
1093        Self::new_anonymous(dir_entry)
1094    }
1095
1096    /// Create a FileObject corresponding to this namespace node.
1097    ///
1098    /// This function is the primary way of instantiating FileObjects. Each
1099    /// FileObject records the NamespaceNode that created it in order to
1100    /// remember its path in the Namespace.
1101    pub fn open(
1102        &self,
1103        locked: &mut Locked<Unlocked>,
1104        current_task: &CurrentTask,
1105        flags: OpenFlags,
1106        access_check: AccessCheck,
1107    ) -> Result<FileHandle, Errno> {
1108        let ops = self.entry.node.open(locked, current_task, self, flags, access_check)?;
1109        FileObject::new(locked, current_task, ops, self.clone(), flags)
1110    }
1111
1112    /// Create or open a node in the file system.
1113    ///
1114    /// Works for any type of node other than a symlink.
1115    ///
1116    /// Will return an existing node unless `flags` contains `OpenFlags::EXCL`.
1117    pub fn open_create_node<L>(
1118        &self,
1119        locked: &mut Locked<L>,
1120        current_task: &CurrentTask,
1121        name: &FsStr,
1122        mode: FileMode,
1123        dev: DeviceId,
1124        flags: OpenFlags,
1125    ) -> Result<NamespaceNode, Errno>
1126    where
1127        L: LockEqualOrBefore<FileOpsCore>,
1128    {
1129        let owner = current_task.current_fscred();
1130        let mode = current_task.fs().apply_umask(mode);
1131        let create_fn =
1132            |locked: &mut Locked<L>, dir: &FsNodeHandle, mount: &MountInfo, name: &_| {
1133                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
1134            };
1135        let entry = if flags.contains(OpenFlags::EXCL) {
1136            self.entry.create_entry(locked, current_task, &self.mount, name, create_fn)
1137        } else {
1138            self.entry.get_or_create_entry(locked, current_task, &self.mount, name, create_fn)
1139        }?;
1140        Ok(self.with_new_entry(entry))
1141    }
1142
    /// Consumes this node and wraps it in an `ActiveNamespaceNode`.
    pub fn into_active(self) -> ActiveNamespaceNode {
        ActiveNamespaceNode::new(self)
    }
1146
    /// Consumes this node, converts it with `into_active`, and turns the result into a
    /// `FileMapping` using the optional write-guard `mode`.
    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
        self.into_active().into_mapping(mode)
    }
1150
1151    /// Create a node in the file system.
1152    ///
1153    /// Works for any type of node other than a symlink.
1154    ///
1155    /// Does not return an existing node.
1156    pub fn create_node<L>(
1157        &self,
1158        locked: &mut Locked<L>,
1159        current_task: &CurrentTask,
1160        name: &FsStr,
1161        mode: FileMode,
1162        dev: DeviceId,
1163    ) -> Result<NamespaceNode, Errno>
1164    where
1165        L: LockEqualOrBefore<FileOpsCore>,
1166    {
1167        let owner = current_task.current_fscred();
1168        let mode = current_task.fs().apply_umask(mode);
1169        let entry = self.entry.create_entry(
1170            locked,
1171            current_task,
1172            &self.mount,
1173            name,
1174            |locked, dir, mount, name| {
1175                dir.create_node(locked, current_task, mount, name, mode, dev, owner)
1176            },
1177        )?;
1178        Ok(self.with_new_entry(entry))
1179    }
1180
1181    /// Create a symlink in the file system.
1182    ///
1183    /// To create another type of node, use `create_node`.
1184    pub fn create_symlink<L>(
1185        &self,
1186        locked: &mut Locked<L>,
1187        current_task: &CurrentTask,
1188        name: &FsStr,
1189        target: &FsStr,
1190    ) -> Result<NamespaceNode, Errno>
1191    where
1192        L: LockEqualOrBefore<FileOpsCore>,
1193    {
1194        let owner = current_task.current_fscred();
1195        let entry = self.entry.create_entry(
1196            locked,
1197            current_task,
1198            &self.mount,
1199            name,
1200            |locked, dir, mount, name| {
1201                dir.create_symlink(locked, current_task, mount, name, target, owner)
1202            },
1203        )?;
1204        Ok(self.with_new_entry(entry))
1205    }
1206
1207    /// Creates an anonymous file.
1208    ///
1209    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
1210    ///
1211    /// Used by O_TMPFILE.
1212    pub fn create_tmpfile<L>(
1213        &self,
1214        locked: &mut Locked<L>,
1215        current_task: &CurrentTask,
1216        mode: FileMode,
1217        flags: OpenFlags,
1218    ) -> Result<NamespaceNode, Errno>
1219    where
1220        L: LockEqualOrBefore<FileOpsCore>,
1221    {
1222        let owner = current_task.current_fscred();
1223        let mode = current_task.fs().apply_umask(mode);
1224        Ok(self.with_new_entry(self.entry.create_tmpfile(
1225            locked,
1226            current_task,
1227            &self.mount,
1228            mode,
1229            owner,
1230            flags,
1231        )?))
1232    }
1233
1234    pub fn link<L>(
1235        &self,
1236        locked: &mut Locked<L>,
1237        current_task: &CurrentTask,
1238        name: &FsStr,
1239        child: &FsNodeHandle,
1240    ) -> Result<NamespaceNode, Errno>
1241    where
1242        L: LockEqualOrBefore<FileOpsCore>,
1243    {
1244        let dir_entry = self.entry.create_entry(
1245            locked,
1246            current_task,
1247            &self.mount,
1248            name,
1249            |locked, dir, mount, name| dir.link(locked, current_task, mount, name, child),
1250        )?;
1251        Ok(self.with_new_entry(dir_entry))
1252    }
1253
1254    pub fn bind_socket<L>(
1255        &self,
1256        locked: &mut Locked<L>,
1257        current_task: &CurrentTask,
1258        name: &FsStr,
1259        socket: SocketHandle,
1260        socket_address: SocketAddress,
1261        mode: FileMode,
1262    ) -> Result<NamespaceNode, Errno>
1263    where
1264        L: LockEqualOrBefore<FileOpsCore>,
1265    {
1266        let dir_entry = self.entry.create_entry(
1267            locked,
1268            current_task,
1269            &self.mount,
1270            name,
1271            |locked, dir, mount, name| {
1272                let node = dir.create_node(
1273                    locked,
1274                    current_task,
1275                    mount,
1276                    name,
1277                    mode,
1278                    DeviceId::NONE,
1279                    current_task.current_fscred(),
1280                )?;
1281                if let Some(unix_socket) = socket.downcast_socket::<UnixSocket>() {
1282                    unix_socket.bind_socket_to_node(&socket, socket_address, &node)?;
1283                } else {
1284                    return error!(ENOTSUP);
1285                }
1286                Ok(node)
1287            },
1288        )?;
1289        Ok(self.with_new_entry(dir_entry))
1290    }
1291
1292    pub fn unlink<L>(
1293        &self,
1294        locked: &mut Locked<L>,
1295        current_task: &CurrentTask,
1296        name: &FsStr,
1297        kind: UnlinkKind,
1298        must_be_directory: bool,
1299    ) -> Result<(), Errno>
1300    where
1301        L: LockEqualOrBefore<FileOpsCore>,
1302    {
1303        if DirEntry::is_reserved_name(name) {
1304            match kind {
1305                UnlinkKind::Directory => {
1306                    if name == ".." {
1307                        error!(ENOTEMPTY)
1308                    } else if self.parent().is_none() {
1309                        // The client is attempting to remove the root.
1310                        error!(EBUSY)
1311                    } else {
1312                        error!(EINVAL)
1313                    }
1314                }
1315                UnlinkKind::NonDirectory => error!(ENOTDIR),
1316            }
1317        } else {
1318            self.entry.unlink(locked, current_task, &self.mount, name, kind, must_be_directory)
1319        }
1320    }
1321
1322    // Resolve the current node.
1323    //
1324    // Depending on context, this will resolve symlink and mount point.
1325    fn resolve<L>(
1326        self,
1327        locked: &mut Locked<L>,
1328        current_task: &CurrentTask,
1329        context: &mut LookupContext,
1330    ) -> Result<NamespaceNode, Errno>
1331    where
1332        L: LockEqualOrBefore<FileOpsCore>,
1333    {
1334        let mut node = self;
1335
1336        loop {
1337            if !node.entry.node.is_lnk() || context.symlink_mode == SymlinkMode::NoFollow {
1338                break;
1339            }
1340            if context.remaining_follows == 0
1341                || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
1342            {
1343                return error!(ELOOP);
1344            }
1345            context.remaining_follows -= 1;
1346            node = match node.readlink(locked, current_task)? {
1347                SymlinkTarget::Path(link_target) => {
1348                    let link_directory = if link_target[0] == b'/' {
1349                        // If the path is absolute, we'll resolve the root directory.
1350                        match &context.resolve_base {
1351                            ResolveBase::None => current_task.fs().root(),
1352                            ResolveBase::Beneath(_) => return error!(EXDEV),
1353                            ResolveBase::InRoot(root) => root.clone(),
1354                        }
1355                    } else {
1356                        // If the path is not absolute, it's a relative directory.
1357                        // Let's try to get the parent of the current node, or in the case that
1358                        // the node is the root we can just use that directly.
1359                        node.parent().unwrap_or(node)
1360                    };
1361                    current_task.lookup_path(
1362                        locked,
1363                        context,
1364                        link_directory,
1365                        link_target.as_ref(),
1366                    )?
1367                }
1368                SymlinkTarget::Node(node) => {
1369                    if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS) {
1370                        return error!(ELOOP);
1371                    }
1372                    node
1373                }
1374            };
1375        }
1376        Ok(node.enter_mount())
1377    }
1378
1379    /// Traverse down a parent-to-child link in the namespace.
1380    pub fn lookup_child<L>(
1381        &self,
1382        locked: &mut Locked<L>,
1383        current_task: &CurrentTask,
1384        context: &mut LookupContext,
1385        basename: &FsStr,
1386    ) -> Result<NamespaceNode, Errno>
1387    where
1388        L: LockEqualOrBefore<FileOpsCore>,
1389    {
1390        self.lookup_children(locked, current_task, context, &[basename])
1391    }
1392
    /// Traverse down a chain of parent-to-child links in the namespace, one per entry of
    /// `basenames`, starting at this node.
    ///
    /// Empty components and `.` are skipped; `..` moves to the parent without going above the
    /// effective root. Every other component is looked up and then resolved (symlinks and
    /// mount points) according to `context`.
    ///
    /// Errors:
    /// - `ENAMETOOLONG` if any component is longer than `NAME_MAX`; this is checked for all
    ///   components before any lookup happens.
    /// - `ENOTDIR` if a component has to be looked up in a node that is not a directory, or if
    ///   `context.must_be_directory` is set and a resolved node is not a directory.
    /// - `EXDEV` if `..` is applied to the `ResolveBase::Beneath` node, or if `NO_XDEV` is set
    ///   and the traversal ends up on a mount different from this node's mount.
    /// - Any error from the underlying lookups or from symlink resolution.
    pub fn lookup_children<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        context: &mut LookupContext,
        mut basenames: &[&FsStr],
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        // Validate every component length up front.
        for name in basenames {
            if name.len() > NAME_MAX as usize {
                return error!(ENAMETOOLONG);
            }
        }

        let mut current_namespace_node = self.clone();

        // `basenames` is consumed from the front as components are processed.
        while basenames.len() > 0 {
            if !current_namespace_node.entry.node.is_dir() {
                return error!(ENOTDIR);
            }

            let basename = basenames[0];
            // Empty components and `.` leave the current node unchanged.
            if basename.is_empty() || basename == "." {
                basenames = &basenames[1..];
                continue;
            }
            if basename == ".." {
                // Determine the node that `..` must not climb above.
                let root = match &context.resolve_base {
                    ResolveBase::None => current_task.fs().root(),
                    ResolveBase::Beneath(node) => {
                        // Do not allow traversal out of the 'node'.
                        if current_namespace_node == *node {
                            return error!(EXDEV);
                        }
                        current_task.fs().root()
                    }
                    ResolveBase::InRoot(root) => root.clone(),
                };

                // Make sure this can't escape a chroot.
                if current_namespace_node != root {
                    // A node without a parent stays where it is.
                    current_namespace_node =
                        current_namespace_node.parent().unwrap_or(current_namespace_node)
                }
                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
                    && current_namespace_node.mount != self.mount
                {
                    return error!(EXDEV);
                }

                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
                    return error!(ENOTDIR);
                }
                basenames = &basenames[1..];
                continue;
            }
            // Single-component path: used for the last remaining component, or when the
            // current node's ops do not offer pipelined lookup.
            if basenames.len() == 1
                || !current_namespace_node.entry.node.ops().has_lookup_pipelined()
            {
                current_namespace_node = current_namespace_node.with_new_entry(
                    current_namespace_node.entry.component_lookup(
                        locked,
                        current_task,
                        &current_namespace_node.mount,
                        basename,
                    )?,
                );

                current_namespace_node =
                    current_namespace_node.resolve(locked, current_task, context)?;

                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
                    && current_namespace_node.mount != self.mount
                {
                    return error!(EXDEV);
                }

                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
                    return error!(ENOTDIR);
                }

                basenames = &basenames[1..];
                continue;
            }

            // Pipelined path: hand over the run of components up to (not including) the first
            // empty, `.` or `..` component, which need the special handling above.
            let pipelined_basenames = if let Some(pos) =
                basenames.iter().position(|&name| name.is_empty() || name == "." || name == "..")
            {
                &basenames[..pos]
            } else {
                basenames
            };
            let precomputed_entries = current_namespace_node.entry.get_children_pipelined(
                locked,
                current_task,
                &current_namespace_node.mount,
                pipelined_basenames,
            );
            for entry in precomputed_entries {
                // Each precomputed entry accounts for exactly one component.
                basenames = &basenames[1..];
                let child = current_namespace_node.with_new_entry(entry?);

                current_namespace_node = child.clone().resolve(locked, current_task, context)?;

                if context.resolve_flags.contains(ResolveFlags::NO_XDEV)
                    && current_namespace_node.mount != self.mount
                {
                    return error!(EXDEV);
                }

                if context.must_be_directory && !current_namespace_node.entry.node.is_dir() {
                    return error!(ENOTDIR);
                }

                // Resolution moved us somewhere other than the looked-up child (a symlink was
                // followed or a mount was entered), so stop consuming precomputed entries and
                // let the outer loop continue from the resolved node.
                // NOTE(review): presumably the remaining entries were looked up beneath
                // `child` and no longer apply - confirm against `get_children_pipelined`.
                if current_namespace_node != child {
                    break;
                }
            }
        }

        Ok(current_namespace_node)
    }
1518
1519    /// Traverse up a child-to-parent link in the namespace.
1520    ///
1521    /// This traversal matches the child-to-parent link in the underlying
1522    /// FsNode except at mountpoints, where the link switches from one
1523    /// filesystem to another.
1524    pub fn parent(&self) -> Option<NamespaceNode> {
1525        let mountpoint_or_self = self.escape_mount();
1526        let parent = mountpoint_or_self.entry.parent()?;
1527        Some(mountpoint_or_self.with_new_entry(parent))
1528    }
1529
1530    /// Returns the parent, but does not escape mounts i.e. returns None if this node
1531    /// is the root of a mount.
1532    pub fn parent_within_mount(&self) -> Option<DirEntryHandle> {
1533        if let Ok(_) = self.mount_if_root() {
1534            return None;
1535        }
1536        self.entry.parent()
1537    }
1538
1539    /// Whether this namespace node is a descendant of the given node.
1540    ///
1541    /// Walks up the namespace node tree looking for ancestor. If ancestor is
1542    /// found, returns true. Otherwise, returns false.
1543    pub fn is_descendant_of(&self, ancestor: &NamespaceNode) -> bool {
1544        let ancestor = ancestor.escape_mount();
1545        let mut current = self.escape_mount();
1546        while current != ancestor {
1547            if let Some(parent) = current.parent() {
1548                current = parent.escape_mount();
1549            } else {
1550                return false;
1551            }
1552        }
1553        true
1554    }
1555
1556    /// If this is a mount point, return the root of the mount. Otherwise return self.
1557    fn enter_mount(&self) -> NamespaceNode {
1558        // While the child is a mountpoint, replace child with the mount's root.
1559        fn enter_one_mount(node: &NamespaceNode) -> Option<NamespaceNode> {
1560            if let Some(mount) = node.mount.deref() {
1561                if let Some(submount) =
1562                    mount.state.read().submounts.get(ArcKey::ref_cast(&node.entry))
1563                {
1564                    return Some(submount.mount.root());
1565                }
1566            }
1567            None
1568        }
1569        let mut inner = self.clone();
1570        while let Some(inner_root) = enter_one_mount(&inner) {
1571            inner = inner_root;
1572        }
1573        inner
1574    }
1575
1576    /// If this is the root of a mount, return the mount point. Otherwise return self.
1577    ///
1578    /// This is not exactly the same as parent(). If parent() is called on a root, it will escape
1579    /// the mount, but then return the parent of the mount point instead of the mount point.
1580    fn escape_mount(&self) -> NamespaceNode {
1581        let mut mountpoint_or_self = self.clone();
1582        while let Some(mountpoint) = mountpoint_or_self.mountpoint() {
1583            mountpoint_or_self = mountpoint;
1584        }
1585        mountpoint_or_self
1586    }
1587
1588    /// If this node is the root of a mount, return it. Otherwise EINVAL.
1589    pub fn mount_if_root(&self) -> Result<&MountHandle, Errno> {
1590        if let Some(mount) = self.mount.deref() {
1591            if Arc::ptr_eq(&self.entry, &mount.root) {
1592                return Ok(mount);
1593            }
1594        }
1595        error!(EINVAL)
1596    }
1597
1598    /// Returns the mountpoint at this location in the namespace.
1599    ///
1600    /// If this node is mounted in another node, this function returns the node
1601    /// at which this node is mounted. Otherwise, returns None.
1602    fn mountpoint(&self) -> Option<NamespaceNode> {
1603        self.mount_if_root().ok()?.read().mountpoint()
1604    }
1605
1606    /// The path from the filesystem root to this node.
1607    pub fn path(&self, fs: &FsContext) -> FsString {
1608        self.path_from_root(Some(&fs.root())).into_path()
1609    }
1610
1611    /// The path from the root of the namespace to this node.
1612    pub fn path_escaping_chroot(&self) -> FsString {
1613        self.path_from_root(None).into_path()
1614    }
1615
1616    /// Returns the path to this node, accounting for a custom root.
1617    /// A task may have a custom root set by `chroot`.
1618    pub fn path_from_root(&self, root: Option<&NamespaceNode>) -> PathWithReachability {
1619        if self.mount.is_none() {
1620            return PathWithReachability::Reachable(self.entry.node.internal_name());
1621        }
1622
1623        let mut path = PathBuilder::new();
1624        let mut current = self.escape_mount();
1625        if let Some(root) = root {
1626            let scope = RcuReadScope::new();
1627            // The current node is expected to intersect with the custom root as we travel up the tree.
1628            let root = root.escape_mount();
1629            while current != root {
1630                if let Some(parent) = current.parent() {
1631                    path.prepend_element(current.entry.local_name(&scope));
1632                    current = parent.escape_mount();
1633                } else {
1634                    // This node hasn't intersected with the custom root and has reached the namespace root.
1635                    let mut absolute_path = path.build_absolute();
1636                    if self.entry.is_dead() {
1637                        absolute_path.extend_from_slice(b" (deleted)");
1638                    }
1639
1640                    return PathWithReachability::Unreachable(absolute_path);
1641                }
1642            }
1643        } else {
1644            // No custom root, so travel up the tree to the namespace root.
1645            let scope = RcuReadScope::new();
1646            while let Some(parent) = current.parent() {
1647                path.prepend_element(current.entry.local_name(&scope));
1648                current = parent.escape_mount();
1649            }
1650        }
1651
1652        let mut absolute_path = path.build_absolute();
1653        if self.entry.is_dead() {
1654            absolute_path.extend_from_slice(b" (deleted)");
1655        }
1656
1657        PathWithReachability::Reachable(absolute_path)
1658    }
1659
1660    pub fn mount(&self, what: WhatToMount, flags: MountpointFlags) -> Result<(), Errno> {
1661        let mountpoint = self.enter_mount();
1662        let mount = mountpoint.mount.as_ref().expect("a mountpoint must be part of a mount");
1663        mount.create_submount(&mountpoint.entry, WhatSubmount::New(what, flags));
1664        Ok(())
1665    }
1666
1667    /// If this is the root of a filesystem, unmount. Otherwise return EINVAL.
1668    pub fn unmount(&self, flags: UnmountFlags) -> Result<(), Errno> {
1669        let mount = self.enter_mount().mount_if_root()?.clone();
1670        mount.unmount(flags)
1671    }
1672
1673    pub fn rename<L>(
1674        locked: &mut Locked<L>,
1675        current_task: &CurrentTask,
1676        old_parent: &NamespaceNode,
1677        old_name: &FsStr,
1678        new_parent: &NamespaceNode,
1679        new_name: &FsStr,
1680        flags: RenameFlags,
1681    ) -> Result<(), Errno>
1682    where
1683        L: LockEqualOrBefore<FileOpsCore>,
1684    {
1685        DirEntry::rename(
1686            locked,
1687            current_task,
1688            &old_parent.entry,
1689            &old_parent.mount,
1690            old_name,
1691            &new_parent.entry,
1692            &new_parent.mount,
1693            new_name,
1694            flags,
1695        )
1696    }
1697
1698    fn with_new_entry(&self, entry: DirEntryHandle) -> NamespaceNode {
1699        Self { mount: self.mount.clone(), entry }
1700    }
1701
1702    fn mount_hash_key(&self) -> &ArcKey<DirEntry> {
1703        ArcKey::ref_cast(&self.entry)
1704    }
1705
1706    pub fn suid_and_sgid(&self, current_task: &CurrentTask) -> Result<UserAndOrGroupId, Errno> {
1707        if self.mount.flags().contains(MountFlags::NOSUID) {
1708            Ok(UserAndOrGroupId::default())
1709        } else {
1710            self.entry.node.info().suid_and_sgid(current_task, &self.entry.node)
1711        }
1712    }
1713
1714    pub fn update_atime(&self) {
1715        // Do not update the atime of this node if it is mounted with the NOATIME flag.
1716        if !self.mount.flags().contains(MountFlags::NOATIME) {
1717            self.entry.node.update_info(|info| {
1718                let now = utc::utc_now();
1719                info.time_access = now;
1720                info.pending_time_access_update = true;
1721            });
1722        }
1723    }
1724
1725    pub fn readlink<L>(
1726        &self,
1727        locked: &mut Locked<L>,
1728        current_task: &CurrentTask,
1729    ) -> Result<SymlinkTarget, Errno>
1730    where
1731        L: LockEqualOrBefore<FileOpsCore>,
1732    {
1733        self.update_atime();
1734        self.entry.node.readlink(locked, current_task)
1735    }
1736
1737    pub fn notify(&self, event_mask: InotifyMask) {
1738        if self.mount.is_some() {
1739            self.entry.notify(event_mask);
1740        }
1741    }
1742
1743    /// Check whether the node can be accessed in the current context with the specified access
1744    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
1745    /// owner or is in the file's group.
1746    pub fn check_access<L>(
1747        &self,
1748        locked: &mut Locked<L>,
1749        current_task: &CurrentTask,
1750        permission_flags: impl Into<security::PermissionFlags>,
1751        reason: CheckAccessReason,
1752    ) -> Result<(), Errno>
1753    where
1754        L: LockEqualOrBefore<FileOpsCore>,
1755    {
1756        self.entry.node.check_access(
1757            locked,
1758            current_task,
1759            &self.mount,
1760            permission_flags,
1761            reason,
1762            self,
1763        )
1764    }
1765
1766    /// Checks if O_NOATIME is allowed,
1767    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
1768        self.entry.node.check_o_noatime_allowed(current_task)
1769    }
1770
1771    pub fn truncate<L>(
1772        &self,
1773        locked: &mut Locked<L>,
1774        current_task: &CurrentTask,
1775        length: u64,
1776    ) -> Result<(), Errno>
1777    where
1778        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1779    {
1780        self.entry.node.truncate(locked, current_task, &self.mount, length)?;
1781        self.entry.notify_ignoring_excl_unlink(InotifyMask::MODIFY);
1782        Ok(())
1783    }
1784}
1785
1786impl fmt::Debug for NamespaceNode {
1787    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1788        f.debug_struct("NamespaceNode")
1789            .field("path", &self.path_escaping_chroot())
1790            .field("mount", &self.mount)
1791            .field("entry", &self.entry)
1792            .finish()
1793    }
1794}
1795
1796// Eq/Hash impls intended for the MOUNT_POINTS hash
1797impl PartialEq for NamespaceNode {
1798    fn eq(&self, other: &Self) -> bool {
1799        self.mount.as_ref().map(Arc::as_ptr).eq(&other.mount.as_ref().map(Arc::as_ptr))
1800            && Arc::ptr_eq(&self.entry, &other.entry)
1801    }
1802}
1803impl Eq for NamespaceNode {}
1804impl Hash for NamespaceNode {
1805    fn hash<H: Hasher>(&self, state: &mut H) {
1806        self.mount.as_ref().map(Arc::as_ptr).hash(state);
1807        Arc::as_ptr(&self.entry).hash(state);
1808    }
1809}
1810
1811/// A namespace node that keeps the underly mount busy.
1812#[derive(Debug, Clone)]
1813pub struct ActiveNamespaceNode {
1814    /// The underlying namespace node.
1815    name: NamespaceNode,
1816
1817    /// Adds a reference to the mount client marker to prevent the mount from
1818    /// being removed while the NamespaceNode is active. Is None iff mount is
1819    /// None.
1820    _marker: Option<MountClientMarker>,
1821}
1822
1823impl ActiveNamespaceNode {
1824    pub fn new(name: NamespaceNode) -> Self {
1825        let marker = name.mount.as_ref().map(|mount| mount.active_client_counter.clone());
1826        Self { name, _marker: marker }
1827    }
1828
1829    pub fn to_passive(&self) -> NamespaceNode {
1830        self.deref().clone()
1831    }
1832
1833    pub fn into_mapping(self, mode: Option<FileWriteGuardMode>) -> Result<Arc<FileMapping>, Errno> {
1834        if let Some(mode) = mode {
1835            self.entry.node.write_guard_state.lock().acquire(mode)?;
1836        }
1837        Ok(Arc::new(FileMapping { name: self, mode }))
1838    }
1839}
1840
1841impl Deref for ActiveNamespaceNode {
1842    type Target = NamespaceNode;
1843
1844    fn deref(&self) -> &Self::Target {
1845        &self.name
1846    }
1847}
1848
1849impl PartialEq for ActiveNamespaceNode {
1850    fn eq(&self, other: &Self) -> bool {
1851        self.deref().eq(other.deref())
1852    }
1853}
1854impl Eq for ActiveNamespaceNode {}
1855impl Hash for ActiveNamespaceNode {
1856    fn hash<H: Hasher>(&self, state: &mut H) {
1857        self.deref().hash(state)
1858    }
1859}
1860
1861#[derive(Debug, Clone, PartialEq, Eq)]
1862#[must_use]
1863pub struct FileMapping {
1864    pub name: ActiveNamespaceNode,
1865    mode: Option<FileWriteGuardMode>,
1866}
1867
1868impl Drop for FileMapping {
1869    fn drop(&mut self) {
1870        if let Some(mode) = self.mode {
1871            self.name.entry.node.write_guard_state.lock().release(mode);
1872        }
1873    }
1874}
1875
1876/// Tracks all mounts, keyed by mount point.
1877pub struct Mounts {
1878    mounts: RcuHashMap<WeakKey<DirEntry>, Vec<ArcKey<Mount>>>,
1879}
1880
1881impl Mounts {
1882    pub fn new() -> Self {
1883        Mounts { mounts: RcuHashMap::default() }
1884    }
1885
1886    /// Registers the mount in the global mounts map.
1887    fn register_mount(&self, dir_entry: &Arc<DirEntry>, mount: MountHandle) -> Submount {
1888        let mut mounts = self.mounts.lock();
1889        let key = WeakKey::from(dir_entry);
1890        let mut vec = mounts.get(&key).unwrap_or_else(|| {
1891            dir_entry.set_has_mounts(true);
1892            Vec::new()
1893        });
1894        vec.push(ArcKey(mount.clone()));
1895        mounts.insert(key, vec);
1896        Submount { dir: ArcKey(dir_entry.clone()), mount }
1897    }
1898
1899    /// Unregisters the mount.  This is called by `Submount::drop`.
1900    fn unregister_mount(&self, dir_entry: &Arc<DirEntry>, mount: &MountHandle) {
1901        let mut mounts = self.mounts.lock();
1902        let key = WeakKey::from(dir_entry);
1903        if let Some(mut vec) = mounts.get(&key) {
1904            let index = vec.iter().position(|e| e == ArcKey::ref_cast(mount)).unwrap();
1905            if vec.len() == 1 {
1906                mounts.remove(&key);
1907                dir_entry.set_has_mounts(false);
1908            } else {
1909                vec.swap_remove(index);
1910                mounts.insert(key, vec);
1911            }
1912        }
1913    }
1914
1915    /// Unmounts all mounts associated with `dir_entry`.  This is called when `dir_entry` is
1916    /// unlinked (which would normally result in EBUSY, but not if it isn't mounted in the local
1917    /// namespace).
1918    pub fn unmount(&self, dir_entry: &DirEntry) {
1919        let mounts = self.mounts.lock().remove(&PtrKey::from(dir_entry as *const _));
1920        if let Some(mounts) = mounts {
1921            for mount in mounts {
1922                // Ignore errors.
1923                let _ = mount.unmount(UnmountFlags::DETACH);
1924            }
1925        }
1926    }
1927
1928    /// Drain mounts. For each drained mount, force a FileSystem unmount.
1929    // TODO(https://fxbug.dev/295073633): Graceful shutdown should try to first unmount the mounts
1930    // and only force a FileSystem unmount on failure.
1931    pub fn clear(&self) {
1932        for (_dir_entry, mounts) in self.mounts.lock().drain() {
1933            for mount in mounts {
1934                mount.fs.force_unmount_ops();
1935            }
1936        }
1937    }
1938
1939    pub fn sync_all(
1940        &self,
1941        locked: &mut Locked<Unlocked>,
1942        current_task: &CurrentTask,
1943    ) -> Result<(), Errno> {
1944        let mut filesystems = Vec::new();
1945        {
1946            let scope = RcuReadScope::new();
1947            let mut seen = HashSet::new();
1948            for (_dir_entry, m_list) in self.mounts.iter(&scope) {
1949                for m in m_list {
1950                    if seen.insert(Arc::as_ptr(&m.fs)) {
1951                        filesystems.push(m.fs.clone());
1952                    }
1953                }
1954            }
1955        }
1956
1957        for fs in filesystems {
1958            if let Err(e) = fs.sync(locked, current_task) {
1959                log_warn!("sync failed for filesystem {:?}: {:?}", fs.name(), e);
1960            }
1961        }
1962        Ok(())
1963    }
1964}
1965
1966/// A RAII object that unregisters a mount when dropped.
1967#[derive(Debug)]
1968struct Submount {
1969    dir: ArcKey<DirEntry>,
1970    mount: MountHandle,
1971}
1972
1973impl Drop for Submount {
1974    fn drop(&mut self) {
1975        self.mount.fs.kernel.upgrade().unwrap().mounts.unregister_mount(&self.dir, &self.mount)
1976    }
1977}
1978
1979/// Submount is stored in a mount's submounts hash set, which is keyed by the mountpoint.
1980impl Eq for Submount {}
1981impl PartialEq<Self> for Submount {
1982    fn eq(&self, other: &Self) -> bool {
1983        self.dir == other.dir
1984    }
1985}
1986impl Hash for Submount {
1987    fn hash<H: Hasher>(&self, state: &mut H) {
1988        self.dir.hash(state)
1989    }
1990}
1991
1992impl Borrow<ArcKey<DirEntry>> for Submount {
1993    fn borrow(&self) -> &ArcKey<DirEntry> {
1994        &self.dir
1995    }
1996}
1997
1998#[cfg(test)]
1999mod test {
2000    use crate::fs::tmpfs::TmpFs;
2001    use crate::testing::spawn_kernel_and_run;
2002    use crate::vfs::namespace::DeviceId;
2003    use crate::vfs::{
2004        CallbackSymlinkNode, FsNodeInfo, LookupContext, MountInfo, Namespace, NamespaceNode,
2005        RenameFlags, SymlinkMode, SymlinkTarget, UnlinkKind, WhatToMount,
2006    };
2007    use starnix_uapi::mount_flags::MountpointFlags;
2008    use starnix_uapi::{errno, mode};
2009    use std::sync::Arc;
2010
2011    #[::fuchsia::test]
2012    async fn test_namespace() {
2013        spawn_kernel_and_run(async |locked, current_task| {
2014            let kernel = current_task.kernel();
2015            let root_fs = TmpFs::new_fs(locked, &kernel);
2016            let root_node = Arc::clone(root_fs.root());
2017            let _dev_node = root_node
2018                .create_dir(locked, &current_task, "dev".into())
2019                .expect("failed to mkdir dev");
2020            let dev_fs = TmpFs::new_fs(locked, &kernel);
2021            let dev_root_node = Arc::clone(dev_fs.root());
2022            let _dev_pts_node = dev_root_node
2023                .create_dir(locked, &current_task, "pts".into())
2024                .expect("failed to mkdir pts");
2025
2026            let ns = Namespace::new(root_fs);
2027            let mut context = LookupContext::default();
2028            let dev = ns
2029                .root()
2030                .lookup_child(locked, &current_task, &mut context, "dev".into())
2031                .expect("failed to lookup dev");
2032            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
2033                .expect("failed to mount dev root node");
2034
2035            let mut context = LookupContext::default();
2036            let dev = ns
2037                .root()
2038                .lookup_child(locked, &current_task, &mut context, "dev".into())
2039                .expect("failed to lookup dev");
2040            let mut context = LookupContext::default();
2041            let pts = dev
2042                .lookup_child(locked, &current_task, &mut context, "pts".into())
2043                .expect("failed to lookup pts");
2044            let pts_parent =
2045                pts.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of pts");
2046            assert!(Arc::ptr_eq(&pts_parent.entry, &dev.entry));
2047
2048            let dev_parent =
2049                dev.parent().ok_or_else(|| errno!(ENOENT)).expect("failed to get parent of dev");
2050            assert!(Arc::ptr_eq(&dev_parent.entry, &ns.root().entry));
2051        })
2052        .await;
2053    }
2054
2055    #[::fuchsia::test]
2056    async fn test_mount_does_not_upgrade() {
2057        spawn_kernel_and_run(async |locked, current_task| {
2058            let kernel = current_task.kernel();
2059            let root_fs = TmpFs::new_fs(locked, &kernel);
2060            let root_node = Arc::clone(root_fs.root());
2061            let _dev_node = root_node
2062                .create_dir(locked, &current_task, "dev".into())
2063                .expect("failed to mkdir dev");
2064            let dev_fs = TmpFs::new_fs(locked, &kernel);
2065            let dev_root_node = Arc::clone(dev_fs.root());
2066            let _dev_pts_node = dev_root_node
2067                .create_dir(locked, &current_task, "pts".into())
2068                .expect("failed to mkdir pts");
2069
2070            let ns = Namespace::new(root_fs);
2071            let mut context = LookupContext::default();
2072            let dev = ns
2073                .root()
2074                .lookup_child(locked, &current_task, &mut context, "dev".into())
2075                .expect("failed to lookup dev");
2076            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
2077                .expect("failed to mount dev root node");
2078            let mut context = LookupContext::default();
2079            let new_dev = ns
2080                .root()
2081                .lookup_child(locked, &current_task, &mut context, "dev".into())
2082                .expect("failed to lookup dev again");
2083            assert!(!Arc::ptr_eq(&dev.entry, &new_dev.entry));
2084            assert_ne!(&dev, &new_dev);
2085
2086            let mut context = LookupContext::default();
2087            let _new_pts = new_dev
2088                .lookup_child(locked, &current_task, &mut context, "pts".into())
2089                .expect("failed to lookup pts");
2090            let mut context = LookupContext::default();
2091            assert!(dev.lookup_child(locked, &current_task, &mut context, "pts".into()).is_err());
2092        })
2093        .await;
2094    }
2095
2096    #[::fuchsia::test]
2097    async fn test_path() {
2098        spawn_kernel_and_run(async |locked, current_task| {
2099            let kernel = current_task.kernel();
2100            let root_fs = TmpFs::new_fs(locked, &kernel);
2101            let root_node = Arc::clone(root_fs.root());
2102            let _dev_node = root_node
2103                .create_dir(locked, &current_task, "dev".into())
2104                .expect("failed to mkdir dev");
2105            let dev_fs = TmpFs::new_fs(locked, &kernel);
2106            let dev_root_node = Arc::clone(dev_fs.root());
2107            let _dev_pts_node = dev_root_node
2108                .create_dir(locked, &current_task, "pts".into())
2109                .expect("failed to mkdir pts");
2110
2111            let ns = Namespace::new(root_fs);
2112            let mut context = LookupContext::default();
2113            let dev = ns
2114                .root()
2115                .lookup_child(locked, &current_task, &mut context, "dev".into())
2116                .expect("failed to lookup dev");
2117            dev.mount(WhatToMount::Fs(dev_fs), MountpointFlags::empty())
2118                .expect("failed to mount dev root node");
2119
2120            let mut context = LookupContext::default();
2121            let dev = ns
2122                .root()
2123                .lookup_child(locked, &current_task, &mut context, "dev".into())
2124                .expect("failed to lookup dev");
2125            let mut context = LookupContext::default();
2126            let pts = dev
2127                .lookup_child(locked, &current_task, &mut context, "pts".into())
2128                .expect("failed to lookup pts");
2129
2130            assert_eq!("/", ns.root().path_escaping_chroot());
2131            assert_eq!("/dev", dev.path_escaping_chroot());
2132            assert_eq!("/dev/pts", pts.path_escaping_chroot());
2133        })
2134        .await;
2135    }
2136
2137    #[::fuchsia::test]
2138    async fn test_shadowing() {
2139        spawn_kernel_and_run(async |locked, current_task| {
2140            let kernel = current_task.kernel();
2141            let root_fs = TmpFs::new_fs(locked, &kernel);
2142            let ns = Namespace::new(root_fs.clone());
2143            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
2144            let mut context = LookupContext::default();
2145            let foo_dir =
2146                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();
2147
2148            let foofs1 = TmpFs::new_fs(locked, &kernel);
2149            foo_dir.mount(WhatToMount::Fs(foofs1.clone()), MountpointFlags::empty()).unwrap();
2150            let mut context = LookupContext::default();
2151            assert!(Arc::ptr_eq(
2152                &ns.root()
2153                    .lookup_child(locked, &current_task, &mut context, "foo".into())
2154                    .unwrap()
2155                    .entry,
2156                foofs1.root()
2157            ));
2158            let foo_dir =
2159                ns.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();
2160
2161            let ns_clone = ns.clone_namespace();
2162
2163            let foofs2 = TmpFs::new_fs(locked, &kernel);
2164            foo_dir.mount(WhatToMount::Fs(foofs2.clone()), MountpointFlags::empty()).unwrap();
2165            let mut context = LookupContext::default();
2166            assert!(Arc::ptr_eq(
2167                &ns.root()
2168                    .lookup_child(locked, &current_task, &mut context, "foo".into())
2169                    .unwrap()
2170                    .entry,
2171                foofs2.root()
2172            ));
2173
2174            assert!(Arc::ptr_eq(
2175                &ns_clone
2176                    .root()
2177                    .lookup_child(
2178                        locked,
2179                        &current_task,
2180                        &mut LookupContext::default(),
2181                        "foo".into()
2182                    )
2183                    .unwrap()
2184                    .entry,
2185                foofs1.root()
2186            ));
2187        })
2188        .await;
2189    }
2190
2191    #[::fuchsia::test]
2192    async fn test_unlink_mounted_directory() {
2193        spawn_kernel_and_run(async |locked, current_task| {
2194            let kernel = current_task.kernel();
2195            let root_fs = TmpFs::new_fs(locked, &kernel);
2196            let ns1 = Namespace::new(root_fs.clone());
2197            let ns2 = Namespace::new(root_fs.clone());
2198            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
2199            let mut context = LookupContext::default();
2200            let foo_dir =
2201                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();
2202
2203            let foofs = TmpFs::new_fs(locked, &kernel);
2204            foo_dir.mount(WhatToMount::Fs(foofs), MountpointFlags::empty()).unwrap();
2205
2206            // Trying to unlink from ns1 should fail.
2207            assert_eq!(
2208                ns1.root()
2209                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
2210                    .unwrap_err(),
2211                errno!(EBUSY),
2212            );
2213
2214            // But unlinking from ns2 should succeed.
2215            ns2.root()
2216                .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
2217                .expect("unlink failed");
2218
2219            // And it should no longer show up in ns1.
2220            assert_eq!(
2221                ns1.root()
2222                    .unlink(locked, &current_task, "foo".into(), UnlinkKind::Directory, false)
2223                    .unwrap_err(),
2224                errno!(ENOENT),
2225            );
2226        })
2227        .await;
2228    }
2229
2230    #[::fuchsia::test]
2231    async fn test_rename_mounted_directory() {
2232        spawn_kernel_and_run(async |locked, current_task| {
2233            let kernel = current_task.kernel();
2234            let root_fs = TmpFs::new_fs(locked, &kernel);
2235            let ns1 = Namespace::new(root_fs.clone());
2236            let ns2 = Namespace::new(root_fs.clone());
2237            let _foo_node = root_fs.root().create_dir(locked, &current_task, "foo".into()).unwrap();
2238            let _bar_node = root_fs.root().create_dir(locked, &current_task, "bar".into()).unwrap();
2239            let _baz_node = root_fs.root().create_dir(locked, &current_task, "baz".into()).unwrap();
2240            let mut context = LookupContext::default();
2241            let foo_dir =
2242                ns1.root().lookup_child(locked, &current_task, &mut context, "foo".into()).unwrap();
2243
2244            let foofs = TmpFs::new_fs(locked, &kernel);
2245            foo_dir.mount(WhatToMount::Fs(foofs), MountpointFlags::empty()).unwrap();
2246
2247            // Trying to rename over foo from ns1 should fail.
2248            let root = ns1.root();
2249            assert_eq!(
2250                NamespaceNode::rename(
2251                    locked,
2252                    &current_task,
2253                    &root,
2254                    "bar".into(),
2255                    &root,
2256                    "foo".into(),
2257                    RenameFlags::empty()
2258                )
2259                .unwrap_err(),
2260                errno!(EBUSY),
2261            );
2262            // Likewise the other way.
2263            assert_eq!(
2264                NamespaceNode::rename(
2265                    locked,
2266                    &current_task,
2267                    &root,
2268                    "foo".into(),
2269                    &root,
2270                    "bar".into(),
2271                    RenameFlags::empty()
2272                )
2273                .unwrap_err(),
2274                errno!(EBUSY),
2275            );
2276
2277            // But renaming from ns2 should succeed.
2278            let root = ns2.root();
2279
2280            // First rename the directory with the mount.
2281            NamespaceNode::rename(
2282                locked,
2283                &current_task,
2284                &root,
2285                "foo".into(),
2286                &root,
2287                "bar".into(),
2288                RenameFlags::empty(),
2289            )
2290            .expect("rename failed");
2291
2292            // Renaming over a directory with a mount should also work.
2293            NamespaceNode::rename(
2294                locked,
2295                &current_task,
2296                &root,
2297                "baz".into(),
2298                &root,
2299                "bar".into(),
2300                RenameFlags::empty(),
2301            )
2302            .expect("rename failed");
2303
2304            // "foo" and "baz" should no longer show up in ns1.
2305            assert_eq!(
2306                ns1.root()
2307                    .lookup_child(locked, &current_task, &mut context, "foo".into())
2308                    .unwrap_err(),
2309                errno!(ENOENT)
2310            );
2311            assert_eq!(
2312                ns1.root()
2313                    .lookup_child(locked, &current_task, &mut context, "baz".into())
2314                    .unwrap_err(),
2315                errno!(ENOENT)
2316            );
2317        })
2318        .await;
2319    }
2320
2321    /// Symlinks which need to be traversed across types (nodes and paths), as well as across
2322    /// owning directories, can be tricky to get right.
2323    #[::fuchsia::test]
2324    async fn test_lookup_with_symlink_chain() {
2325        spawn_kernel_and_run(async |locked, current_task| {
2326            // Set up the root filesystem
2327            let kernel = current_task.kernel();
2328            let root_fs = TmpFs::new_fs(locked, &kernel);
2329            let root_node = Arc::clone(root_fs.root());
2330            let _first_subdir_node = root_node
2331                .create_dir(locked, &current_task, "first_subdir".into())
2332                .expect("failed to mkdir dev");
2333            let _second_subdir_node = root_node
2334                .create_dir(locked, &current_task, "second_subdir".into())
2335                .expect("failed to mkdir dev");
2336
2337            // Set up two subdirectories under the root filesystem
2338            let first_subdir_fs = TmpFs::new_fs(locked, &kernel);
2339            let second_subdir_fs = TmpFs::new_fs(locked, &kernel);
2340
2341            let ns = Namespace::new(root_fs);
2342            let mut context = LookupContext::default();
2343            let first_subdir = ns
2344                .root()
2345                .lookup_child(locked, &current_task, &mut context, "first_subdir".into())
2346                .expect("failed to lookup first_subdir");
2347            first_subdir
2348                .mount(WhatToMount::Fs(first_subdir_fs), MountpointFlags::empty())
2349                .expect("failed to mount first_subdir fs node");
2350            let second_subdir = ns
2351                .root()
2352                .lookup_child(locked, &current_task, &mut context, "second_subdir".into())
2353                .expect("failed to lookup second_subdir");
2354            second_subdir
2355                .mount(WhatToMount::Fs(second_subdir_fs), MountpointFlags::empty())
2356                .expect("failed to mount second_subdir fs node");
2357
2358            // Create the symlink structure. To trigger potential symlink traversal bugs, we're going
2359            // for the following directory structure:
2360            // / (root)
2361            //     + first_subdir/
2362            //         - real_file
2363            //         - path_symlink (-> real_file)
2364            //     + second_subdir/
2365            //         - node_symlink (-> path_symlink)
2366            let real_file_node = first_subdir
2367                .create_node(
2368                    locked,
2369                    &current_task,
2370                    "real_file".into(),
2371                    mode!(IFREG, 0o777),
2372                    DeviceId::NONE,
2373                )
2374                .expect("failed to create real_file");
2375            first_subdir
2376                .create_symlink(locked, &current_task, "path_symlink".into(), "real_file".into())
2377                .expect("failed to create path_symlink");
2378
2379            let mut no_follow_lookup_context = LookupContext::new(SymlinkMode::NoFollow);
2380            let path_symlink_node = first_subdir
2381                .lookup_child(
2382                    locked,
2383                    &current_task,
2384                    &mut no_follow_lookup_context,
2385                    "path_symlink".into(),
2386                )
2387                .expect("Failed to lookup path_symlink");
2388
2389            // The second symlink needs to be of type SymlinkTarget::Node in order to trip the sensitive
2390            // code path. There's no easy method for creating this type of symlink target, so we'll need
2391            // to construct a node from scratch and insert it into the directory manually.
2392            let node_symlink_node = second_subdir.entry.node.fs().create_node_and_allocate_node_id(
2393                CallbackSymlinkNode::new(move || {
2394                    let node = path_symlink_node.clone();
2395                    Ok(SymlinkTarget::Node(node))
2396                }),
2397                FsNodeInfo::new(mode!(IFLNK, 0o777), current_task.current_fscred()),
2398            );
2399            second_subdir
2400                .entry
2401                .create_entry(
2402                    locked,
2403                    &current_task,
2404                    &MountInfo::detached(),
2405                    "node_symlink".into(),
2406                    move |_locked, _dir, _mount, _name| Ok(node_symlink_node),
2407                )
2408                .expect("failed to create node_symlink entry");
2409
2410            // Finally, exercise the lookup under test.
2411            let mut follow_lookup_context = LookupContext::new(SymlinkMode::Follow);
2412            let node_symlink_resolution = second_subdir
2413                .lookup_child(
2414                    locked,
2415                    &current_task,
2416                    &mut follow_lookup_context,
2417                    "node_symlink".into(),
2418                )
2419                .expect("lookup with symlink chain failed");
2420
2421            // The lookup resolution should have correctly followed the symlinks to the real_file node.
2422            assert!(node_symlink_resolution.entry.node.ino == real_file_node.entry.node.ino);
2423        })
2424        .await;
2425    }
2426}