starnix_core/vfs/
dir_entry.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::security;
6use crate::task::CurrentTask;
7use crate::vfs::{
8    CheckAccessReason, FileHandle, FileObject, FsNodeHandle, FsNodeLinkBehavior, FsStr, FsString,
9    MountInfo, Mounts, NamespaceNode, UnlinkKind, path,
10};
11use bitflags::bitflags;
12use fuchsia_rcu::{RcuOptionArc, RcuReadScope};
13use starnix_rcu::RcuString;
14use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, RwLock, RwLockWriteGuard};
15use starnix_uapi::auth::FsCred;
16use starnix_uapi::errors::{ENOENT, Errno};
17use starnix_uapi::file_mode::{Access, FileMode};
18use starnix_uapi::inotify_mask::InotifyMask;
19use starnix_uapi::open_flags::OpenFlags;
20use starnix_uapi::{NAME_MAX, RENAME_EXCHANGE, RENAME_NOREPLACE, RENAME_WHITEOUT, errno, error};
21use std::collections::BTreeMap;
22use std::collections::btree_map::Entry;
23use std::fmt;
24use std::ops::Deref;
25use std::sync::atomic::{AtomicU8, Ordering};
26use std::sync::{Arc, Weak};
27#[cfg(detect_lock_cycles)]
28use tracing_mutex::util;
29
30bitflags! {
31    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
32    pub struct RenameFlags: u32 {
33        // Exchange the entries.
34        const EXCHANGE = RENAME_EXCHANGE;
35
36        // Don't overwrite an existing DirEntry.
37        const NOREPLACE = RENAME_NOREPLACE;
38
39        // Create a "whiteout" object to replace the file.
40        const WHITEOUT = RENAME_WHITEOUT;
41
42        // Allow replacing any file with a directory. This is an internal flag used only
43        // internally inside Starnix for OverlayFS.
44        const REPLACE_ANY = 0x80000000;
45
46        // Internal flags that cannot be passed to `sys_rename()`
47        const INTERNAL = Self::REPLACE_ANY.bits();
48    }
49}
50
51pub trait DirEntryOps: Send + Sync + 'static {
52    /// Revalidate the [`DirEntry`], if needed.
53    ///
54    /// Most filesystems don't need to do any revalidations because they are "local"
55    /// and all changes to nodes go through the kernel. However some filesystems
56    /// allow changes to happen through other means (e.g. NFS, FUSE) and these
57    /// filesystems need a way to let the kernel know it may need to refresh its
58    /// cached metadata. This method provides that hook for such filesystems.
59    ///
60    /// For more details, see:
61    ///  - https://www.halolinux.us/kernel-reference/the-dentry-cache.html
62    ///  - https://www.kernel.org/doc/html/latest/filesystems/path-lookup.html#revalidation-and-automounts
63    ///  - https://lwn.net/Articles/649115/
64    ///  - https://www.infradead.org/~mchehab/kernel_docs/filesystems/path-walking.html
65    ///
66    /// Returns `Ok(valid)` where `valid` indicates if the `DirEntry` is still valid,
67    /// or an error.
68    fn revalidate(
69        &self,
70        _locked: &mut Locked<FileOpsCore>,
71        _: &CurrentTask,
72        _: &DirEntry,
73    ) -> Result<bool, Errno> {
74        Ok(true)
75    }
76}
77
78bitflags! {
79    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
80    pub struct DirEntryFlags: u8 {
81        /// Whether this directory entry has been removed from the tree.
82        const IS_DEAD = 1 << 0;
83
84        /// Whether the entry has filesystems mounted on top of it.
85        const HAS_MOUNTS = 1 << 1;
86    }
87}
88
89pub struct DefaultDirEntryOps;
90
91impl DirEntryOps for DefaultDirEntryOps {}
92
93/// An entry in a directory.
94///
95/// This structure assigns a name to an FsNode in a given file system. An
96/// FsNode might have multiple directory entries, for example if there are more
97/// than one hard link to the same FsNode. In those cases, each hard link will
98/// have a different parent and a different local_name because each hard link
99/// has its own DirEntry object.
100///
101/// A directory cannot have more than one hard link, which means there is a
102/// single DirEntry for each Directory FsNode. That invariant lets us store the
103/// children for a directory in the DirEntry rather than in the FsNode.
104pub struct DirEntry {
105    /// The FsNode referenced by this DirEntry.
106    ///
107    /// A given FsNode can be referenced by multiple DirEntry objects, for
108    /// example if there are multiple hard links to a given FsNode.
109    pub node: FsNodeHandle,
110
111    /// The [`DirEntryOps`] for this `DirEntry`.
112    ///
113    /// The `DirEntryOps` are implemented by the individual file systems to provide
114    /// specific behaviours for this `DirEntry`.
115    ops: Box<dyn DirEntryOps>,
116
117    /// The parent DirEntry.
118    ///
119    /// The DirEntry tree has strong references from child-to-parent and weak
120    /// references from parent-to-child. This design ensures that the parent
121    /// chain is always populated in the cache, but some children might be
122    /// missing from the cache.
123    parent: RcuOptionArc<DirEntry>,
124
125    /// The [`DirEntryFlags`] for this `DirEntry`.
126    flags: AtomicU8,
127
128    /// The name that this parent calls this child.
129    ///
130    /// This name might not be reflected in the full path in the namespace that
131    /// contains this DirEntry. For example, this DirEntry might be the root of
132    /// a chroot.
133    ///
134    /// Most callers that want to work with names for DirEntries should use the
135    /// NamespaceNodes.
136    local_name: RcuString,
137
138    /// A partial cache of the children of this DirEntry.
139    ///
140    /// DirEntries are added to this cache when they are looked up and removed
141    /// when they are no longer referenced.
142    ///
143    // FIXME(b/379929394): The lock ordering here assumes parent-to-child lock acquisition, which
144    // a number of algorithms in the DirEntry operations also assume. This assumption can be broken
145    // by the rename operation, which can move nodes around the hierarchy. See the referenced bug
146    // for more details, the current mitigations, and potentials for long-term solutions.
147    children: RwLock<DirEntryChildren>,
148}
149type DirEntryChildren = BTreeMap<FsString, Weak<DirEntry>>;
150
151pub type DirEntryHandle = Arc<DirEntry>;
152
153impl DirEntry {
154    #[allow(clippy::let_and_return)]
155    pub fn new_uncached(
156        node: FsNodeHandle,
157        parent: Option<DirEntryHandle>,
158        local_name: FsString,
159    ) -> DirEntryHandle {
160        let ops = node.create_dir_entry_ops();
161        let result = Arc::new(DirEntry {
162            node,
163            ops,
164            parent: RcuOptionArc::new(parent),
165            flags: Default::default(),
166            local_name: local_name.into(),
167            children: Default::default(),
168        });
169        #[cfg(any(test, debug_assertions))]
170        {
171            // Taking this lock tells the lock tracing system about the parent/child ordering
172            // relation.
173            let _l1 = result.children.read();
174        }
175        result
176    }
177
178    pub fn new(
179        node: FsNodeHandle,
180        parent: Option<DirEntryHandle>,
181        local_name: FsString,
182    ) -> DirEntryHandle {
183        let result = Self::new_uncached(node, parent, local_name);
184        result.node.fs().did_create_dir_entry(&result);
185        result
186    }
187
188    /// Returns a new DirEntry for the given `node` without parent. The entry has no local name and
189    /// is not cached.
190    pub fn new_unrooted(node: FsNodeHandle) -> DirEntryHandle {
191        Self::new_uncached(node, None, FsString::default())
192    }
193
194    /// Returns a new `DirEntry` that is ready marked as having been deleted.
195    pub fn new_deleted(
196        node: FsNodeHandle,
197        parent: Option<DirEntryHandle>,
198        local_name: FsString,
199    ) -> DirEntryHandle {
200        let entry = DirEntry::new_uncached(node, parent, local_name);
201        entry.raise_flags(DirEntryFlags::IS_DEAD);
202        entry
203    }
204
205    /// Returns a file handle to this entry, associated with an anonymous namespace.
206    pub fn open_anonymous<L>(
207        self: &DirEntryHandle,
208        locked: &mut Locked<L>,
209        current_task: &CurrentTask,
210        flags: OpenFlags,
211    ) -> Result<FileHandle, Errno>
212    where
213        L: LockEqualOrBefore<FileOpsCore>,
214    {
215        let ops = self.node.create_file_ops(locked, current_task, flags)?;
216        FileObject::new(
217            locked,
218            current_task,
219            ops,
220            NamespaceNode::new_anonymous(self.clone()),
221            flags,
222        )
223    }
224
225    /// Set the children of this DirEntry to the given `children`. This should only ever be called
226    /// when children is empty.
227    pub fn set_children(self: &DirEntryHandle, children: BTreeMap<FsString, DirEntryHandle>) {
228        let mut dir_entry_children = self.lock_children();
229        assert!(dir_entry_children.children.is_empty());
230        for (name, child) in children.into_iter() {
231            child.set_parent(self.clone());
232            dir_entry_children.children.insert(name, Arc::downgrade(&child));
233        }
234    }
235
236    fn lock_children<'a>(self: &'a DirEntryHandle) -> DirEntryLockedChildren<'a> {
237        DirEntryLockedChildren { entry: self, children: self.children.write() }
238    }
239
240    /// The parent DirEntry.
241    pub fn parent(&self) -> Option<DirEntryHandle> {
242        self.parent.to_option_arc()
243    }
244
245    /// Returns a reference to the parent DirEntry.
246    ///
247    /// The reference is only valid for the duration of the RCU read scope.
248    pub fn parent_ref<'a>(&'a self, scope: &'a RcuReadScope) -> Option<&'a DirEntry> {
249        self.parent.as_ref(scope)
250    }
251
252    /// Set the parent of this DirEntry.
253    pub fn set_parent(&self, parent: DirEntryHandle) {
254        self.parent.update(Some(parent));
255    }
256
257    /// The parent DirEntry object or this DirEntry if this entry is the root.
258    ///
259    /// Useful when traversing up the tree if you always want to find a parent
260    /// (e.g., for "..").
261    ///
262    /// Be aware that the root of one file system might be mounted as a child
263    /// in another file system. For that reason, consider walking the
264    /// NamespaceNode tree (which understands mounts) rather than the DirEntry
265    /// tree.
266    pub fn parent_or_self(self: &DirEntryHandle) -> DirEntryHandle {
267        self.parent().unwrap_or_else(|| self.clone())
268    }
269
270    /// The name that this parent calls this child.
271    ///
272    /// The reference is only valid for the duration of the RCU read scope.
273    pub fn local_name<'a>(&self, scope: &'a RcuReadScope) -> &'a FsStr {
274        self.local_name.read(scope)
275    }
276
277    /// Whether the given name has special semantics as a directory entry.
278    ///
279    /// Specifically, whether the name is empty (which means "self"), dot
280    /// (which also means "self"), or dot dot (which means "parent").
281    pub fn is_reserved_name(name: &FsStr) -> bool {
282        name.is_empty() || name == "." || name == ".."
283    }
284
285    /// Returns the flags of this DirEntry.
286    pub fn flags(&self) -> DirEntryFlags {
287        DirEntryFlags::from_bits_truncate(self.flags.load(Ordering::Acquire))
288    }
289
290    /// Raises the flags of this DirEntry.
291    ///
292    /// Returns the flags of this DirEntry before the flags were raised.
293    pub fn raise_flags(&self, flags: DirEntryFlags) -> DirEntryFlags {
294        let old_flags = self.flags.fetch_or(flags.bits(), Ordering::AcqRel);
295        DirEntryFlags::from_bits_truncate(old_flags)
296    }
297
298    /// Lowers the flags of this DirEntry.
299    ///
300    /// Returns the flags of this DirEntry before the flags were lowered.
301    pub fn lower_flags(&self, flags: DirEntryFlags) -> DirEntryFlags {
302        let old_flags = self.flags.fetch_and(!flags.bits(), Ordering::AcqRel);
303        DirEntryFlags::from_bits_truncate(old_flags)
304    }
305
306    /// Returns true if this DirEntry is dead.
307    pub fn is_dead(&self) -> bool {
308        self.flags().contains(DirEntryFlags::IS_DEAD)
309    }
310
311    /// Look up a directory entry with the given name as direct child of this
312    /// entry.
313    pub fn component_lookup<L>(
314        self: &DirEntryHandle,
315        locked: &mut Locked<L>,
316        current_task: &CurrentTask,
317        mount: &MountInfo,
318        name: &FsStr,
319    ) -> Result<DirEntryHandle, Errno>
320    where
321        L: LockEqualOrBefore<FileOpsCore>,
322    {
323        let (node, _) = self.get_or_create_child(
324            locked,
325            current_task,
326            mount,
327            name,
328            |locked, d, mount, name| d.lookup(locked, current_task, mount, name),
329        )?;
330        Ok(node)
331    }
332
333    /// Creates a new DirEntry
334    ///
335    /// The create_node_fn function is called to create the underlying FsNode
336    /// for the DirEntry.
337    ///
338    /// If the entry already exists, create_node_fn is not called, and EEXIST is
339    /// returned.
340    pub fn create_entry<L>(
341        self: &DirEntryHandle,
342        locked: &mut Locked<L>,
343        current_task: &CurrentTask,
344        mount: &MountInfo,
345        name: &FsStr,
346        create_node_fn: impl FnOnce(
347            &mut Locked<L>,
348            &FsNodeHandle,
349            &MountInfo,
350            &FsStr,
351        ) -> Result<FsNodeHandle, Errno>,
352    ) -> Result<DirEntryHandle, Errno>
353    where
354        L: LockEqualOrBefore<FileOpsCore>,
355    {
356        let (entry, exists) =
357            self.create_entry_internal(locked, current_task, mount, name, create_node_fn)?;
358        if exists {
359            return error!(EEXIST);
360        }
361        Ok(entry)
362    }
363
364    /// Creates a new DirEntry. Works just like create_entry, except if the entry already exists,
365    /// it is returned.
366    pub fn get_or_create_entry<L>(
367        self: &DirEntryHandle,
368        locked: &mut Locked<L>,
369        current_task: &CurrentTask,
370        mount: &MountInfo,
371        name: &FsStr,
372        create_node_fn: impl FnOnce(
373            &mut Locked<L>,
374            &FsNodeHandle,
375            &MountInfo,
376            &FsStr,
377        ) -> Result<FsNodeHandle, Errno>,
378    ) -> Result<DirEntryHandle, Errno>
379    where
380        L: LockEqualOrBefore<FileOpsCore>,
381    {
382        let (entry, _exists) =
383            self.create_entry_internal(locked, current_task, mount, name, create_node_fn)?;
384        Ok(entry)
385    }
386
387    fn create_entry_internal<L>(
388        self: &DirEntryHandle,
389        locked: &mut Locked<L>,
390        current_task: &CurrentTask,
391        mount: &MountInfo,
392        name: &FsStr,
393        create_node_fn: impl FnOnce(
394            &mut Locked<L>,
395            &FsNodeHandle,
396            &MountInfo,
397            &FsStr,
398        ) -> Result<FsNodeHandle, Errno>,
399    ) -> Result<(DirEntryHandle, bool), Errno>
400    where
401        L: LockEqualOrBefore<FileOpsCore>,
402    {
403        if DirEntry::is_reserved_name(name) {
404            return error!(EEXIST);
405        }
406        // TODO: Do we need to check name for embedded NUL characters?
407        if name.len() > NAME_MAX as usize {
408            return error!(ENAMETOOLONG);
409        }
410        if name.contains(&path::SEPARATOR) {
411            return error!(EINVAL);
412        }
413        let (entry, exists) =
414            self.get_or_create_child(locked, current_task, mount, name, create_node_fn)?;
415        if !exists {
416            // An entry was created. Update the ctime and mtime of this directory.
417            self.node.update_ctime_mtime();
418            entry.notify_creation();
419        }
420        Ok((entry, exists))
421    }
422
423    // This is marked as test-only because it sets the owner/group to root instead of the current
424    // user to save a bit of typing in tests, but this shouldn't happen silently in production.
425    #[cfg(test)]
426    pub fn create_dir<L>(
427        self: &DirEntryHandle,
428        locked: &mut starnix_sync::Locked<L>,
429        current_task: &CurrentTask,
430        name: &FsStr,
431    ) -> Result<DirEntryHandle, Errno>
432    where
433        L: LockEqualOrBefore<FileOpsCore>,
434    {
435        self.create_dir_for_testing(locked, current_task, name)
436    }
437
438    // This function is for testing because it sets the owner/group to root instead of the current
439    // user to save a bit of typing in tests, but this shouldn't happen silently in production.
440    pub fn create_dir_for_testing<L>(
441        self: &DirEntryHandle,
442        locked: &mut Locked<L>,
443        current_task: &CurrentTask,
444        name: &FsStr,
445    ) -> Result<DirEntryHandle, Errno>
446    where
447        L: LockEqualOrBefore<FileOpsCore>,
448    {
449        // TODO: apply_umask
450        self.create_entry(
451            locked,
452            current_task,
453            &MountInfo::detached(),
454            name,
455            |locked, dir, mount, name| {
456                dir.create_node(
457                    locked,
458                    current_task,
459                    mount,
460                    name,
461                    starnix_uapi::file_mode::mode!(IFDIR, 0o777),
462                    starnix_uapi::device_type::DeviceType::NONE,
463                    FsCred::root(),
464                )
465            },
466        )
467    }
468
469    /// Creates an anonymous file.
470    ///
471    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
472    ///
473    /// Used by O_TMPFILE.
474    pub fn create_tmpfile<L>(
475        self: &DirEntryHandle,
476        locked: &mut Locked<L>,
477        current_task: &CurrentTask,
478        mount: &MountInfo,
479        mode: FileMode,
480        owner: FsCred,
481        flags: OpenFlags,
482    ) -> Result<DirEntryHandle, Errno>
483    where
484        L: LockEqualOrBefore<FileOpsCore>,
485    {
486        // Only directories can have children.
487        if !self.node.is_dir() {
488            return error!(ENOTDIR);
489        }
490        assert!(mode.is_reg());
491
492        // From <https://man7.org/linux/man-pages/man2/open.2.html>:
493        //
494        //   Specifying O_EXCL in conjunction with O_TMPFILE prevents a
495        //   temporary file from being linked into the filesystem in
496        //   the above manner.  (Note that the meaning of O_EXCL in
497        //   this case is different from the meaning of O_EXCL
498        //   otherwise.)
499        let link_behavior = if flags.contains(OpenFlags::EXCL) {
500            FsNodeLinkBehavior::Disallowed
501        } else {
502            FsNodeLinkBehavior::Allowed
503        };
504
505        let node =
506            self.node.create_tmpfile(locked, current_task, mount, mode, owner, link_behavior)?;
507        let local_name = format!("#{}", node.ino).into();
508        Ok(DirEntry::new_deleted(node, Some(self.clone()), local_name))
509    }
510
511    pub fn unlink<L>(
512        self: &DirEntryHandle,
513        locked: &mut Locked<L>,
514        current_task: &CurrentTask,
515        mount: &MountInfo,
516        name: &FsStr,
517        kind: UnlinkKind,
518        must_be_directory: bool,
519    ) -> Result<(), Errno>
520    where
521        L: LockEqualOrBefore<FileOpsCore>,
522    {
523        assert!(!DirEntry::is_reserved_name(name));
524
525        // child_to_unlink *must* be dropped after self_children (even in the error paths).
526        let child_to_unlink;
527
528        let mut self_children = self.lock_children();
529        child_to_unlink = self_children.component_lookup(locked, current_task, mount, name)?;
530        child_to_unlink.require_no_mounts(mount)?;
531
532        // Check that this filesystem entry must be a directory. This can
533        // happen if the path terminates with a trailing slash.
534        //
535        // Example: If we're unlinking a symlink `/foo/bar/`, this would
536        // result in `ENOTDIR` because of the trailing slash, even if
537        // `UnlinkKind::NonDirectory` was used.
538        if must_be_directory && !child_to_unlink.node.is_dir() {
539            return error!(ENOTDIR);
540        }
541
542        match kind {
543            UnlinkKind::Directory => {
544                if !child_to_unlink.node.is_dir() {
545                    return error!(ENOTDIR);
546                }
547            }
548            UnlinkKind::NonDirectory => {
549                if child_to_unlink.node.is_dir() {
550                    return error!(EISDIR);
551                }
552            }
553        }
554
555        self.node.unlink(locked, current_task, mount, name, &child_to_unlink.node)?;
556        self_children.children.remove(name);
557
558        std::mem::drop(self_children);
559        child_to_unlink.destroy(&current_task.kernel().mounts);
560
561        Ok(())
562    }
563
564    /// Destroy this directory entry.
565    ///
566    /// Notice that this method takes `self` by value to destroy this reference.
567    fn destroy(self: DirEntryHandle, mounts: &Mounts) {
568        let was_already_dead =
569            self.raise_flags(DirEntryFlags::IS_DEAD).contains(DirEntryFlags::IS_DEAD);
570        if was_already_dead {
571            return;
572        }
573        let unmount =
574            self.lower_flags(DirEntryFlags::HAS_MOUNTS).contains(DirEntryFlags::HAS_MOUNTS);
575        self.node.fs().will_destroy_dir_entry(&self);
576        if unmount {
577            mounts.unmount(&self);
578        }
579        self.notify_deletion();
580    }
581
582    /// Returns whether this entry is a descendant of |other|.
583    pub fn is_descendant_of(self: &DirEntryHandle, other: &DirEntryHandle) -> bool {
584        let scope = RcuReadScope::new();
585        let mut current = self.deref();
586        loop {
587            if std::ptr::eq(current, other.deref()) {
588                // We found |other|.
589                return true;
590            }
591            if let Some(parent) = current.parent_ref(&scope) {
592                current = parent;
593            } else {
594                // We reached the root of the file system.
595                return false;
596            }
597        }
598    }
599
600    /// Rename the file with old_basename in old_parent to new_basename in
601    /// new_parent.
602    ///
603    /// old_parent and new_parent must belong to the same file system.
604    pub fn rename<L>(
605        locked: &mut Locked<L>,
606        current_task: &CurrentTask,
607        old_parent: &DirEntryHandle,
608        old_mount: &MountInfo,
609        old_basename: &FsStr,
610        new_parent: &DirEntryHandle,
611        new_mount: &MountInfo,
612        new_basename: &FsStr,
613        flags: RenameFlags,
614    ) -> Result<(), Errno>
615    where
616        L: LockEqualOrBefore<FileOpsCore>,
617    {
618        // The nodes we are touching must be part of the same mount.
619        if old_mount != new_mount {
620            return error!(EXDEV);
621        }
622        // The mounts are equals, choose one.
623        let mount = old_mount;
624
625        // If either the old_basename or the new_basename is a reserved name
626        // (e.g., "." or ".."), then we cannot do the rename.
627        if DirEntry::is_reserved_name(old_basename) || DirEntry::is_reserved_name(new_basename) {
628            if flags.contains(RenameFlags::NOREPLACE) {
629                return error!(EEXIST);
630            }
631            return error!(EBUSY);
632        }
633
634        // If the names and parents are the same, then there's nothing to do
635        // and we can report success.
636        if Arc::ptr_eq(&old_parent.node, &new_parent.node) && old_basename == new_basename {
637            return Ok(());
638        }
639
640        // This task must have write access to the old and new parent nodes.
641        old_parent.node.check_access(
642            locked,
643            current_task,
644            mount,
645            Access::WRITE,
646            CheckAccessReason::InternalPermissionChecks,
647            old_parent,
648        )?;
649        new_parent.node.check_access(
650            locked,
651            current_task,
652            mount,
653            Access::WRITE,
654            CheckAccessReason::InternalPermissionChecks,
655            new_parent,
656        )?;
657
658        // The mount check ensures that the nodes we're touching are part of the
659        // same file system. It doesn't matter where we grab the FileSystem reference from.
660        let fs = old_parent.node.fs();
661
662        // We need to hold these DirEntryHandles until after we drop all the
663        // locks so that we do not deadlock when we drop them.
664        let renamed;
665        let mut maybe_replaced = None;
666
667        {
668            // Before we take any locks, we need to take the rename mutex on
669            // the file system. This lock ensures that no other rename
670            // operations are happening in this file system while we're
671            // analyzing this rename operation.
672            //
673            // For example, we grab writer locks on both old_parent and
674            // new_parent. If there was another rename operation in flight with
675            // old_parent and new_parent reversed, then we could deadlock while
676            // trying to acquire these locks.
677            let _lock = fs.rename_mutex.lock();
678
679            // Compute the list of ancestors of new_parent to check whether new_parent is a
680            // descendant of the renamed node. This must be computed before taking any lock to
681            // prevent lock inversions.
682            //
683            // TODO: This walk is now lock-free, which means we don't need to materialize this
684            // list. We can just walk the tree and check for cycles.
685            let mut new_parent_ancestor_list = Vec::<DirEntryHandle>::new();
686            {
687                let mut current = Some(new_parent.clone());
688                while let Some(entry) = current {
689                    current = entry.parent();
690                    new_parent_ancestor_list.push(entry);
691                }
692            }
693
694            // We cannot simply grab the locks on old_parent and new_parent
695            // independently because old_parent and new_parent might be the
696            // same directory entry. Instead, we use the RenameGuard helper to
697            // grab the appropriate locks.
698            let mut state = RenameGuard::lock(old_parent, new_parent);
699
700            // Now that we know the old_parent child list cannot change, we
701            // establish the DirEntry that we are going to try to rename.
702            renamed =
703                state.old_parent().component_lookup(locked, current_task, mount, old_basename)?;
704
705            // Check whether the sticky bit on the old parent prevents us from
706            // removing this child.
707            old_parent.node.check_sticky_bit(current_task, &renamed.node)?;
708
709            // If new_parent is a descendant of renamed, the operation would
710            // create a cycle. That's disallowed.
711            if new_parent_ancestor_list.into_iter().any(|entry| Arc::ptr_eq(&entry, &renamed)) {
712                return error!(EINVAL);
713            }
714
715            // Check whether the renamed entry is a mountpoint.
716            // TODO: We should hold a read lock on the mount points for this
717            //       namespace to prevent the child from becoming a mount point
718            //       while this function is executing.
719            renamed.require_no_mounts(mount)?;
720
721            // We need to check if there is already a DirEntry with
722            // new_basename in new_parent. If so, there are additional checks
723            // we need to perform.
724            match state.new_parent().component_lookup(locked, current_task, mount, new_basename) {
725                Ok(replaced) => {
726                    // Set `maybe_replaced` now to ensure it gets dropped in the right order.
727                    let replaced = maybe_replaced.insert(replaced);
728
729                    if flags.contains(RenameFlags::NOREPLACE) {
730                        return error!(EEXIST);
731                    }
732
733                    // Sayeth https://man7.org/linux/man-pages/man2/rename.2.html:
734                    //
735                    // "If oldpath and newpath are existing hard links referring to the
736                    // same file, then rename() does nothing, and returns a success
737                    // status."
738                    if Arc::ptr_eq(&renamed.node, &replaced.node) {
739                        return Ok(());
740                    }
741
742                    // Sayeth https://man7.org/linux/man-pages/man2/rename.2.html:
743                    //
744                    // "oldpath can specify a directory.  In this case, newpath must"
745                    // either not exist, or it must specify an empty directory."
746                    if replaced.node.is_dir() {
747                        // Check whether the replaced entry is a mountpoint.
748                        // TODO: We should hold a read lock on the mount points for this
749                        //       namespace to prevent the child from becoming a mount point
750                        //       while this function is executing.
751                        replaced.require_no_mounts(mount)?;
752                    }
753
754                    if !flags.intersects(RenameFlags::EXCHANGE | RenameFlags::REPLACE_ANY) {
755                        if renamed.node.is_dir() && !replaced.node.is_dir() {
756                            return error!(ENOTDIR);
757                        } else if !renamed.node.is_dir() && replaced.node.is_dir() {
758                            return error!(EISDIR);
759                        }
760                    }
761                }
762                // It's fine for the lookup to fail to find a child.
763                Err(errno) if errno == ENOENT => {}
764                // However, other errors are fatal.
765                Err(e) => return Err(e),
766            }
767
768            security::check_fs_node_rename_access(
769                current_task,
770                &old_parent.node,
771                &renamed.node,
772                &new_parent.node,
773                maybe_replaced.as_ref().map(|dir_entry| dir_entry.node.deref().as_ref()),
774                old_basename,
775                new_basename,
776            )?;
777
778            // We've found all the errors that we know how to find. Ask the
779            // file system to actually execute the rename operation. Once the
780            // file system has executed the rename, we are no longer allowed to
781            // fail because we will not be able to return the system to a
782            // consistent state.
783
784            if flags.contains(RenameFlags::EXCHANGE) {
785                let replaced = maybe_replaced.as_ref().ok_or_else(|| errno!(ENOENT))?;
786                fs.exchange(
787                    current_task,
788                    &renamed.node,
789                    &old_parent.node,
790                    old_basename,
791                    &replaced.node,
792                    &new_parent.node,
793                    new_basename,
794                )?;
795            } else {
796                fs.rename(
797                    locked,
798                    current_task,
799                    &old_parent.node,
800                    old_basename,
801                    &new_parent.node,
802                    new_basename,
803                    &renamed.node,
804                    maybe_replaced.as_ref().map(|replaced| &replaced.node),
805                )?;
806            }
807
808            // We need to update the parent and local name for the DirEntry
809            // we are renaming to reflect its new parent and its new name.
810            renamed.set_parent(new_parent.clone());
811            renamed.local_name.update(new_basename.to_owned());
812
813            // Actually add the renamed child to the new_parent's child list.
814            // This operation implicitly removes the replaced child (if any)
815            // from the child list.
816            state.new_parent().children.insert(new_basename.into(), Arc::downgrade(&renamed));
817
818            #[cfg(detect_lock_cycles)]
819            unsafe {
820                // Lock ordering is enforced from parent-to-child, and therefore we need to
821                // reset the lock ordering constraints when we reorder the tree nodes.
822                util::reset_dependencies(renamed.children.raw());
823            }
824
825            if flags.contains(RenameFlags::EXCHANGE) {
826                // Reparent `replaced` when exchanging.
827                let replaced =
828                    maybe_replaced.as_ref().expect("replaced expected with RENAME_EXCHANGE");
829                replaced.set_parent(old_parent.clone());
830                replaced.local_name.update(old_basename.to_owned());
831                state.old_parent().children.insert(old_basename.into(), Arc::downgrade(replaced));
832
833                #[cfg(detect_lock_cycles)]
834                unsafe {
835                    // Lock ordering is enforced from parent-to-child, and therefore we need to
836                    // reset the lock ordering constraints when we reorder the tree nodes.
837                    util::reset_dependencies(replaced.children.raw());
838                }
839            } else {
840                // Remove the renamed child from the old_parent's child list.
841                state.old_parent().children.remove(old_basename);
842            }
843        };
844
845        fs.purge_old_entries();
846
847        if let Some(replaced) = maybe_replaced {
848            if !flags.contains(RenameFlags::EXCHANGE) {
849                replaced.destroy(&current_task.kernel().mounts);
850            }
851        }
852
853        // Renaming a file updates its ctime.
854        renamed.node.update_ctime();
855
856        let mode = renamed.node.info().mode;
857        let cookie = current_task.kernel().get_next_inotify_cookie();
858        old_parent.node.notify(InotifyMask::MOVE_FROM, cookie, old_basename, mode, false);
859        new_parent.node.notify(InotifyMask::MOVE_TO, cookie, new_basename, mode, false);
860        renamed.node.notify(InotifyMask::MOVE_SELF, 0, Default::default(), mode, false);
861
862        Ok(())
863    }
864
865    pub fn get_children<F, T>(&self, callback: F) -> T
866    where
867        F: FnOnce(&DirEntryChildren) -> T,
868    {
869        let children = self.children.read();
870        callback(&children)
871    }
872
873    /// Remove the child with the given name from the children cache.  The child must not have any
874    /// mounts.
875    pub fn remove_child(&self, name: &FsStr, mounts: &Mounts) {
876        let mut children = self.children.write();
877        let child = children.get(name).and_then(Weak::upgrade);
878        if let Some(child) = child {
879            children.remove(name);
880            std::mem::drop(children);
881            child.destroy(mounts);
882        }
883    }
884
885    fn get_or_create_child<L>(
886        self: &DirEntryHandle,
887        locked: &mut Locked<L>,
888        current_task: &CurrentTask,
889        mount: &MountInfo,
890        name: &FsStr,
891        create_fn: impl FnOnce(
892            &mut Locked<L>,
893            &FsNodeHandle,
894            &MountInfo,
895            &FsStr,
896        ) -> Result<FsNodeHandle, Errno>,
897    ) -> Result<(DirEntryHandle, bool), Errno>
898    where
899        L: LockEqualOrBefore<FileOpsCore>,
900    {
901        assert!(!DirEntry::is_reserved_name(name));
902        // Only directories can have children.
903        if !self.node.is_dir() {
904            return error!(ENOTDIR);
905        }
906        // The user must be able to search the directory (requires the EXEC permission)
907        self.node.check_access(
908            locked,
909            current_task,
910            mount,
911            Access::EXEC,
912            CheckAccessReason::InternalPermissionChecks,
913            self,
914        )?;
915
916        // Check if the child is already in children. In that case, we can
917        // simply return the child and we do not need to call init_fn.
918        let child = self.children.read().get(name).and_then(Weak::upgrade);
919        let (child, create_result) = if let Some(child) = child {
920            // Do not cache a child in a locked directory
921            if self.node.fail_if_locked(current_task).is_ok() {
922                child.node.fs().did_access_dir_entry(&child);
923            }
924            (child, CreationResult::Existed { create_fn })
925        } else {
926            let (child, create_result) = self.lock_children().get_or_create_child(
927                locked,
928                current_task,
929                mount,
930                name,
931                create_fn,
932            )?;
933            child.node.fs().purge_old_entries();
934            (child, create_result)
935        };
936
937        let (child, exists) = match create_result {
938            CreationResult::Created => (child, false),
939            CreationResult::Existed { create_fn } => {
940                if child.ops.revalidate(
941                    locked.cast_locked::<FileOpsCore>(),
942                    current_task,
943                    &child,
944                )? {
945                    (child, true)
946                } else {
947                    self.internal_remove_child(&child);
948                    child.destroy(&current_task.kernel().mounts);
949
950                    let (child, create_result) = self.lock_children().get_or_create_child(
951                        locked,
952                        current_task,
953                        mount,
954                        name,
955                        create_fn,
956                    )?;
957                    child.node.fs().purge_old_entries();
958                    (child, matches!(create_result, CreationResult::Existed { .. }))
959                }
960            }
961        };
962
963        Ok((child, exists))
964    }
965
966    // This function is only useful for tests and has some oddities.
967    //
968    // For example, not all the children might have been looked up yet, which
969    // means the returned vector could be missing some names.
970    //
971    // Also, the vector might have "extra" names that are in the process of
972    // being looked up. If the lookup fails, they'll be removed.
973    #[cfg(test)]
974    pub fn copy_child_names(&self) -> Vec<FsString> {
975        let scope = RcuReadScope::new();
976        self.children
977            .read()
978            .values()
979            .filter_map(|child| Weak::upgrade(child).map(|c| c.local_name.read(&scope).to_owned()))
980            .collect()
981    }
982
983    fn internal_remove_child(&self, child: &DirEntry) {
984        let mut children = self.children.write();
985        let scope = RcuReadScope::new();
986        let local_name = child.local_name.read(&scope);
987        if let Some(weak_child) = children.get(local_name) {
988            // If this entry is occupied, we need to check whether child is
989            // the current occupant. If so, we should remove the entry
990            // because the child no longer exists.
991            if std::ptr::eq(weak_child.as_ptr(), child) {
992                children.remove(local_name);
993            }
994        }
995    }
996
997    /// Notifies watchers on the current node and its parent about an event.
998    pub fn notify(&self, event_mask: InotifyMask) {
999        self.notify_watchers(event_mask, self.is_dead());
1000    }
1001
1002    /// Notifies watchers on the current node and its parent about an event.
1003    ///
1004    /// Used for FSNOTIFY_EVENT_INODE events, which ignore IN_EXCL_UNLINK.
1005    pub fn notify_ignoring_excl_unlink(&self, event_mask: InotifyMask) {
1006        // We pretend that this directory entry is not dead to ignore IN_EXCL_UNLINK.
1007        self.notify_watchers(event_mask, false);
1008    }
1009
1010    fn notify_watchers(&self, event_mask: InotifyMask, is_dead: bool) {
1011        let mode = self.node.info().mode;
1012        {
1013            let scope = RcuReadScope::new();
1014            if let Some(parent) = self.parent_ref(&scope) {
1015                let local_name = self.local_name.read(&scope);
1016                parent.node.notify(event_mask, 0, local_name, mode, is_dead);
1017            }
1018        }
1019        self.node.notify(event_mask, 0, Default::default(), mode, is_dead);
1020    }
1021
1022    /// Notifies parents about creation, and notifies current node about link_count change.
1023    fn notify_creation(&self) {
1024        let mode = self.node.info().mode;
1025        if Arc::strong_count(&self.node) > 1 {
1026            // Notify about link change only if there is already a hardlink.
1027            self.node.notify(InotifyMask::ATTRIB, 0, Default::default(), mode, false);
1028        }
1029        let scope = RcuReadScope::new();
1030        if let Some(parent) = self.parent_ref(&scope) {
1031            let local_name = self.local_name.read(&scope);
1032            parent.node.notify(InotifyMask::CREATE, 0, local_name, mode, false);
1033        }
1034    }
1035
1036    /// Notifies watchers on the current node about deletion if this is the
1037    /// last hardlink, and drops the DirEntryHandle kept by Inotify.
1038    /// Parent is also notified about deletion.
1039    fn notify_deletion(&self) {
1040        let mode = self.node.info().mode;
1041        if !mode.is_dir() {
1042            // Linux notifies link count change for non-directories.
1043            self.node.notify(InotifyMask::ATTRIB, 0, Default::default(), mode, false);
1044        }
1045
1046        let scope = RcuReadScope::new();
1047        if let Some(parent) = self.parent_ref(&scope) {
1048            let local_name = self.local_name.read(&scope);
1049            parent.node.notify(InotifyMask::DELETE, 0, local_name, mode, false);
1050        }
1051
1052        // This check is incorrect if there's another hard link to this FsNode that isn't in
1053        // memory at the moment.
1054        if Arc::strong_count(&self.node) == 1 {
1055            self.node.notify(InotifyMask::DELETE_SELF, 0, Default::default(), mode, false);
1056        }
1057    }
1058
1059    /// Returns true if this entry has mounts.
1060    pub fn has_mounts(&self) -> bool {
1061        self.flags().contains(DirEntryFlags::HAS_MOUNTS)
1062    }
1063
1064    /// Records whether or not the entry has mounts.
1065    pub fn set_has_mounts(&self, v: bool) {
1066        if v {
1067            self.raise_flags(DirEntryFlags::HAS_MOUNTS);
1068        } else {
1069            self.lower_flags(DirEntryFlags::HAS_MOUNTS);
1070        }
1071    }
1072
1073    /// Verifies this directory has nothing mounted on it.
1074    fn require_no_mounts(self: &Arc<Self>, parent_mount: &MountInfo) -> Result<(), Errno> {
1075        if self.has_mounts() {
1076            if let Some(mount) = parent_mount.as_ref() {
1077                if mount.read().has_submount(self) {
1078                    return error!(EBUSY);
1079                }
1080            }
1081        }
1082        Ok(())
1083    }
1084}
1085
1086struct DirEntryLockedChildren<'a> {
1087    entry: &'a DirEntryHandle,
1088    children: RwLockWriteGuard<'a, DirEntryChildren>,
1089}
1090
1091enum CreationResult<F> {
1092    Created,
1093    Existed { create_fn: F },
1094}
1095
1096impl<'a> DirEntryLockedChildren<'a> {
1097    fn component_lookup<L>(
1098        &mut self,
1099        locked: &mut Locked<L>,
1100        current_task: &CurrentTask,
1101        mount: &MountInfo,
1102        name: &FsStr,
1103    ) -> Result<DirEntryHandle, Errno>
1104    where
1105        L: LockEqualOrBefore<FileOpsCore>,
1106    {
1107        assert!(!DirEntry::is_reserved_name(name));
1108        let (node, _) =
1109            self.get_or_create_child(locked, current_task, mount, name, |_, _, _, _| {
1110                error!(ENOENT)
1111            })?;
1112        Ok(node)
1113    }
1114
1115    fn get_or_create_child<
1116        L,
1117        F: FnOnce(&mut Locked<L>, &FsNodeHandle, &MountInfo, &FsStr) -> Result<FsNodeHandle, Errno>,
1118    >(
1119        &mut self,
1120        locked: &mut Locked<L>,
1121        current_task: &CurrentTask,
1122        mount: &MountInfo,
1123        name: &FsStr,
1124        create_fn: F,
1125    ) -> Result<(DirEntryHandle, CreationResult<F>), Errno>
1126    where
1127        L: LockEqualOrBefore<FileOpsCore>,
1128    {
1129        let create_child = |locked: &mut Locked<L>, create_fn: F| {
1130            // Before creating the child, check for existence.
1131            let (node, create_result) =
1132                match self.entry.node.lookup(locked, current_task, mount, name) {
1133                    Ok(node) => (node, CreationResult::Existed { create_fn }),
1134                    Err(e) if e == ENOENT => {
1135                        (create_fn(locked, &self.entry.node, mount, name)?, CreationResult::Created)
1136                    }
1137                    Err(e) => return Err(e),
1138                };
1139
1140            assert!(
1141                node.info().mode & FileMode::IFMT != FileMode::EMPTY,
1142                "FsNode initialization did not populate the FileMode in FsNodeInfo."
1143            );
1144
1145            let entry = DirEntry::new(node, Some(self.entry.clone()), name.to_owned());
1146            Ok((entry, create_result))
1147        };
1148
1149        let (child, create_result) = match self.children.entry(name.to_owned()) {
1150            Entry::Vacant(entry) => {
1151                let (child, create_result) = create_child(locked, create_fn)?;
1152                // Do not cache a child in a locked directory
1153                if self.entry.node.fail_if_locked(current_task).is_ok() {
1154                    entry.insert(Arc::downgrade(&child));
1155                }
1156                (child, create_result)
1157            }
1158            Entry::Occupied(mut entry) => {
1159                // It's possible that the upgrade will succeed this time around because we dropped
1160                // the read lock before acquiring the write lock. Another thread might have
1161                // populated this entry while we were not holding any locks.
1162                if let Some(child) = Weak::upgrade(entry.get()) {
1163                    // Do not cache a child in a locked directory
1164                    if self.entry.node.fail_if_locked(current_task).is_ok() {
1165                        child.node.fs().did_access_dir_entry(&child);
1166                    }
1167                    return Ok((child, CreationResult::Existed { create_fn }));
1168                }
1169                let (child, create_result) = create_child(locked, create_fn)?;
1170                // Do not cache a child in a locked directory
1171                if self.entry.node.fail_if_locked(current_task).is_ok() {
1172                    entry.insert(Arc::downgrade(&child));
1173                }
1174                (child, create_result)
1175            }
1176        };
1177
1178        security::fs_node_init_with_dentry(locked, current_task, &child)?;
1179
1180        Ok((child, create_result))
1181    }
1182}
1183
1184impl fmt::Debug for DirEntry {
1185    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1186        let scope = RcuReadScope::new();
1187        let mut parents = vec![];
1188        let mut maybe_parent = self.parent_ref(&scope);
1189        while let Some(parent) = maybe_parent {
1190            parents.push(parent.local_name.read(&scope));
1191            maybe_parent = parent.parent_ref(&scope);
1192        }
1193        let mut builder = f.debug_struct("DirEntry");
1194        builder.field("id", &(self as *const DirEntry));
1195        builder.field("local_name", &self.local_name.read(&scope).to_owned());
1196        if !parents.is_empty() {
1197            builder.field("parents", &parents);
1198        }
1199        builder.finish()
1200    }
1201}
1202
1203struct RenameGuard<'a> {
1204    old_parent_guard: DirEntryLockedChildren<'a>,
1205    new_parent_guard: Option<DirEntryLockedChildren<'a>>,
1206}
1207
1208impl<'a> RenameGuard<'a> {
1209    fn lock(old_parent: &'a DirEntryHandle, new_parent: &'a DirEntryHandle) -> Self {
1210        if Arc::ptr_eq(old_parent, new_parent) {
1211            Self { old_parent_guard: old_parent.lock_children(), new_parent_guard: None }
1212        } else {
1213            // Following gVisor, these locks are taken in ancestor-to-descendant order.
1214            // Moreover, if the node are not comparable, they are taken from smallest inode to
1215            // biggest.
1216            if new_parent.is_descendant_of(old_parent)
1217                || (!old_parent.is_descendant_of(new_parent)
1218                    && old_parent.node.node_key() < new_parent.node.node_key())
1219            {
1220                let old_parent_guard = old_parent.lock_children();
1221                let new_parent_guard = new_parent.lock_children();
1222                Self { old_parent_guard, new_parent_guard: Some(new_parent_guard) }
1223            } else {
1224                let new_parent_guard = new_parent.lock_children();
1225                let old_parent_guard = old_parent.lock_children();
1226                Self { old_parent_guard, new_parent_guard: Some(new_parent_guard) }
1227            }
1228        }
1229    }
1230
1231    fn old_parent(&mut self) -> &mut DirEntryLockedChildren<'a> {
1232        &mut self.old_parent_guard
1233    }
1234
1235    fn new_parent(&mut self) -> &mut DirEntryLockedChildren<'a> {
1236        if let Some(new_guard) = self.new_parent_guard.as_mut() {
1237            new_guard
1238        } else {
1239            &mut self.old_parent_guard
1240        }
1241    }
1242}
1243
1244/// The Drop trait for DirEntry removes the entry from the child list of the
1245/// parent entry, which means we cannot drop DirEntry objects while holding a
1246/// lock on the parent's child list.
1247impl Drop for DirEntry {
1248    fn drop(&mut self) {
1249        let scope = RcuReadScope::new();
1250        let maybe_parent = self.parent_ref(&scope);
1251        self.parent.update(None);
1252        if let Some(parent) = maybe_parent {
1253            parent.internal_remove_child(self);
1254        }
1255    }
1256}