Skip to main content

starnix_core/vfs/
dir_entry.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::security;
6use crate::task::CurrentTask;
7use crate::vfs::{
8    CheckAccessReason, FileHandle, FileObject, FsNodeHandle, FsNodeLinkBehavior, FsStr, FsString,
9    MountInfo, Mounts, NamespaceNode, UnlinkKind, path,
10};
11use atomic_bitflags::atomic_bitflags;
12use bitflags::bitflags;
13use fuchsia_rcu::{RcuOptionArc, RcuReadScope};
14use starnix_rcu::RcuString;
15use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, RwLock, RwLockWriteGuard};
16use starnix_uapi::auth::FsCred;
17use starnix_uapi::errors::{ENOENT, Errno};
18use starnix_uapi::file_mode::{Access, FileMode};
19use starnix_uapi::inotify_mask::InotifyMask;
20use starnix_uapi::open_flags::OpenFlags;
21use starnix_uapi::{NAME_MAX, RENAME_EXCHANGE, RENAME_NOREPLACE, RENAME_WHITEOUT, errno, error};
22use std::collections::BTreeMap;
23use std::collections::btree_map::Entry;
24use std::fmt;
25use std::ops::Deref;
26use std::sync::atomic::Ordering;
27use std::sync::{Arc, Weak};
28#[cfg(detect_lock_cycles)]
29use tracing_mutex::util;
30
31bitflags! {
32    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
33    pub struct RenameFlags: u32 {
34        // Exchange the entries.
35        const EXCHANGE = RENAME_EXCHANGE;
36
37        // Don't overwrite an existing DirEntry.
38        const NOREPLACE = RENAME_NOREPLACE;
39
40        // Create a "whiteout" object to replace the file.
41        const WHITEOUT = RENAME_WHITEOUT;
42
43        // Allow replacing any file with a directory. This is an internal flag used only
44        // internally inside Starnix for OverlayFS.
45        const REPLACE_ANY = 0x80000000;
46
47        // Internal flags that cannot be passed to `sys_rename()`
48        const INTERNAL = Self::REPLACE_ANY.bits();
49    }
50}
51
52pub trait DirEntryOps: Send + Sync + 'static {
53    /// Revalidate the [`DirEntry`], if needed.
54    ///
55    /// Most filesystems don't need to do any revalidations because they are "local"
56    /// and all changes to nodes go through the kernel. However some filesystems
57    /// allow changes to happen through other means (e.g. NFS, FUSE) and these
58    /// filesystems need a way to let the kernel know it may need to refresh its
59    /// cached metadata. This method provides that hook for such filesystems.
60    ///
61    /// For more details, see:
62    ///  - https://www.halolinux.us/kernel-reference/the-dentry-cache.html
63    ///  - https://www.kernel.org/doc/html/latest/filesystems/path-lookup.html#revalidation-and-automounts
64    ///  - https://lwn.net/Articles/649115/
65    ///  - https://www.infradead.org/~mchehab/kernel_docs/filesystems/path-walking.html
66    ///
67    /// Returns `Ok(valid)` where `valid` indicates if the `DirEntry` is still valid,
68    /// or an error.
69    fn revalidate(
70        &self,
71        _locked: &mut Locked<FileOpsCore>,
72        _: &CurrentTask,
73        _: &DirEntry,
74    ) -> Result<bool, Errno> {
75        Ok(true)
76    }
77}
78
79atomic_bitflags! {
80    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
81    pub struct DirEntryFlags: u8 {
82        /// Whether this directory entry has been removed from the tree.
83        const IS_DEAD = 1 << 0;
84
85        /// Whether the entry has filesystems mounted on top of it.
86        const HAS_MOUNTS = 1 << 1;
87    }
88}
89
90pub struct DefaultDirEntryOps;
91
92impl DirEntryOps for DefaultDirEntryOps {}
93
94/// An entry in a directory.
95///
96/// This structure assigns a name to an FsNode in a given file system. An
97/// FsNode might have multiple directory entries, for example if there are more
98/// than one hard link to the same FsNode. In those cases, each hard link will
99/// have a different parent and a different local_name because each hard link
100/// has its own DirEntry object.
101///
102/// A directory cannot have more than one hard link, which means there is a
103/// single DirEntry for each Directory FsNode. That invariant lets us store the
104/// children for a directory in the DirEntry rather than in the FsNode.
105pub struct DirEntry {
106    /// The FsNode referenced by this DirEntry.
107    ///
108    /// A given FsNode can be referenced by multiple DirEntry objects, for
109    /// example if there are multiple hard links to a given FsNode.
110    pub node: FsNodeHandle,
111
112    /// The [`DirEntryOps`] for this `DirEntry`.
113    ///
114    /// The `DirEntryOps` are implemented by the individual file systems to provide
115    /// specific behaviours for this `DirEntry`.
116    ops: Box<dyn DirEntryOps>,
117
118    /// The parent DirEntry.
119    ///
120    /// The DirEntry tree has strong references from child-to-parent and weak
121    /// references from parent-to-child. This design ensures that the parent
122    /// chain is always populated in the cache, but some children might be
123    /// missing from the cache.
124    parent: RcuOptionArc<DirEntry>,
125
126    /// The [`DirEntryFlags`] for this `DirEntry`.
127    flags: AtomicDirEntryFlags,
128
129    /// The name that this parent calls this child.
130    ///
131    /// This name might not be reflected in the full path in the namespace that
132    /// contains this DirEntry. For example, this DirEntry might be the root of
133    /// a chroot.
134    ///
135    /// Most callers that want to work with names for DirEntries should use the
136    /// NamespaceNodes.
137    local_name: RcuString,
138
139    /// A partial cache of the children of this DirEntry.
140    ///
141    /// DirEntries are added to this cache when they are looked up and removed
142    /// when they are no longer referenced.
143    ///
144    // FIXME(b/379929394): The lock ordering here assumes parent-to-child lock acquisition, which
145    // a number of algorithms in the DirEntry operations also assume. This assumption can be broken
146    // by the rename operation, which can move nodes around the hierarchy. See the referenced bug
147    // for more details, the current mitigations, and potentials for long-term solutions.
148    children: RwLock<DirEntryChildren>,
149}
150type DirEntryChildren = BTreeMap<FsString, Weak<DirEntry>>;
151
152pub type DirEntryHandle = Arc<DirEntry>;
153
154impl DirEntry {
155    #[allow(clippy::let_and_return)]
156    pub fn new_uncached(
157        node: FsNodeHandle,
158        parent: Option<DirEntryHandle>,
159        local_name: FsString,
160    ) -> DirEntryHandle {
161        let ops = node.create_dir_entry_ops();
162        let result = Arc::new(DirEntry {
163            node,
164            ops,
165            parent: RcuOptionArc::new(parent),
166            flags: Default::default(),
167            local_name: local_name.into(),
168            children: Default::default(),
169        });
170        #[cfg(any(test, debug_assertions))]
171        {
172            // Taking this lock tells the lock tracing system about the parent/child ordering
173            // relation.
174            let _l1 = result.children.read();
175        }
176        result
177    }
178
179    pub fn new(
180        node: FsNodeHandle,
181        parent: Option<DirEntryHandle>,
182        local_name: FsString,
183    ) -> DirEntryHandle {
184        let result = Self::new_uncached(node, parent, local_name);
185        result.node.fs().did_create_dir_entry(&result);
186        result
187    }
188
189    /// Returns a new DirEntry for the given `node` without parent. The entry has no local name and
190    /// is not cached.
191    pub fn new_unrooted(node: FsNodeHandle) -> DirEntryHandle {
192        Self::new_uncached(node, None, FsString::default())
193    }
194
195    /// Returns a new `DirEntry` that is ready marked as having been deleted.
196    pub fn new_deleted(
197        node: FsNodeHandle,
198        parent: Option<DirEntryHandle>,
199        local_name: FsString,
200    ) -> DirEntryHandle {
201        let entry = DirEntry::new_uncached(node, parent, local_name);
202        entry.raise_flags(DirEntryFlags::IS_DEAD);
203        entry
204    }
205
206    /// Returns a file handle to this entry, associated with an anonymous namespace.
207    pub fn open_anonymous<L>(
208        self: &DirEntryHandle,
209        locked: &mut Locked<L>,
210        current_task: &CurrentTask,
211        flags: OpenFlags,
212    ) -> Result<FileHandle, Errno>
213    where
214        L: LockEqualOrBefore<FileOpsCore>,
215    {
216        let ops = self.node.create_file_ops(locked, current_task, flags)?;
217        FileObject::new(
218            locked,
219            current_task,
220            ops,
221            NamespaceNode::new_anonymous(self.clone()),
222            flags,
223        )
224    }
225
226    /// Set the children of this DirEntry to the given `children`. This should only ever be called
227    /// when children is empty.
228    pub fn set_children(self: &DirEntryHandle, children: BTreeMap<FsString, DirEntryHandle>) {
229        let mut dir_entry_children = self.lock_children();
230        assert!(dir_entry_children.children.is_empty());
231        for (name, child) in children.into_iter() {
232            child.set_parent(self.clone());
233            dir_entry_children.children.insert(name, Arc::downgrade(&child));
234        }
235    }
236
237    fn lock_children<'a>(self: &'a DirEntryHandle) -> DirEntryLockedChildren<'a> {
238        DirEntryLockedChildren { entry: self, children: self.children.write() }
239    }
240
241    /// The parent DirEntry.
242    pub fn parent(&self) -> Option<DirEntryHandle> {
243        self.parent.to_option_arc()
244    }
245
246    /// Returns a reference to the parent DirEntry.
247    ///
248    /// The reference is only valid for the duration of the RCU read scope.
249    pub fn parent_ref<'a>(&'a self, scope: &'a RcuReadScope) -> Option<&'a DirEntry> {
250        self.parent.as_ref(scope)
251    }
252
253    /// Set the parent of this DirEntry.
254    pub fn set_parent(&self, parent: DirEntryHandle) {
255        self.parent.update(Some(parent));
256    }
257
258    /// The parent DirEntry object or this DirEntry if this entry is the root.
259    ///
260    /// Useful when traversing up the tree if you always want to find a parent
261    /// (e.g., for "..").
262    ///
263    /// Be aware that the root of one file system might be mounted as a child
264    /// in another file system. For that reason, consider walking the
265    /// NamespaceNode tree (which understands mounts) rather than the DirEntry
266    /// tree.
267    pub fn parent_or_self(self: &DirEntryHandle) -> DirEntryHandle {
268        self.parent().unwrap_or_else(|| self.clone())
269    }
270
271    /// The name that this parent calls this child.
272    ///
273    /// The reference is only valid for the duration of the RCU read scope.
274    pub fn local_name<'a>(&self, scope: &'a RcuReadScope) -> &'a FsStr {
275        self.local_name.read(scope)
276    }
277
278    /// Whether the given name has special semantics as a directory entry.
279    ///
280    /// Specifically, whether the name is empty (which means "self"), dot
281    /// (which also means "self"), or dot dot (which means "parent").
282    pub fn is_reserved_name(name: &FsStr) -> bool {
283        name.is_empty() || name == "." || name == ".."
284    }
285
286    /// Returns the flags of this DirEntry.
287    pub fn flags(&self) -> DirEntryFlags {
288        self.flags.load(Ordering::Acquire)
289    }
290
291    /// Raises the flags of this DirEntry.
292    ///
293    /// Returns the flags of this DirEntry before the flags were raised.
294    pub fn raise_flags(&self, flags: DirEntryFlags) -> DirEntryFlags {
295        self.flags.fetch_or(flags, Ordering::AcqRel)
296    }
297
298    /// Lowers the flags of this DirEntry.
299    ///
300    /// Returns the flags of this DirEntry before the flags were lowered.
301    pub fn lower_flags(&self, flags: DirEntryFlags) -> DirEntryFlags {
302        self.flags.fetch_and(!flags, Ordering::AcqRel)
303    }
304
305    /// Returns true if this DirEntry is dead.
306    pub fn is_dead(&self) -> bool {
307        self.flags().contains(DirEntryFlags::IS_DEAD)
308    }
309
310    /// Look up a directory entry with the given name as direct child of this
311    /// entry.
312    pub fn component_lookup<L>(
313        self: &DirEntryHandle,
314        locked: &mut Locked<L>,
315        current_task: &CurrentTask,
316        mount: &MountInfo,
317        name: &FsStr,
318    ) -> Result<DirEntryHandle, Errno>
319    where
320        L: LockEqualOrBefore<FileOpsCore>,
321    {
322        let (node, _) = self.get_or_create_child(
323            locked,
324            current_task,
325            mount,
326            name,
327            |locked, d, mount, name| d.lookup(locked, current_task, mount, name),
328        )?;
329        Ok(node)
330    }
331
332    /// Creates a new DirEntry
333    ///
334    /// The create_node_fn function is called to create the underlying FsNode
335    /// for the DirEntry.
336    ///
337    /// If the entry already exists, create_node_fn is not called, and EEXIST is
338    /// returned.
339    pub fn create_entry<L>(
340        self: &DirEntryHandle,
341        locked: &mut Locked<L>,
342        current_task: &CurrentTask,
343        mount: &MountInfo,
344        name: &FsStr,
345        create_node_fn: impl FnOnce(
346            &mut Locked<L>,
347            &FsNodeHandle,
348            &MountInfo,
349            &FsStr,
350        ) -> Result<FsNodeHandle, Errno>,
351    ) -> Result<DirEntryHandle, Errno>
352    where
353        L: LockEqualOrBefore<FileOpsCore>,
354    {
355        let (entry, exists) =
356            self.create_entry_internal(locked, current_task, mount, name, create_node_fn)?;
357        if exists {
358            return error!(EEXIST);
359        }
360        Ok(entry)
361    }
362
363    /// Creates a new DirEntry. Works just like create_entry, except if the entry already exists,
364    /// it is returned.
365    pub fn get_or_create_entry<L>(
366        self: &DirEntryHandle,
367        locked: &mut Locked<L>,
368        current_task: &CurrentTask,
369        mount: &MountInfo,
370        name: &FsStr,
371        create_node_fn: impl FnOnce(
372            &mut Locked<L>,
373            &FsNodeHandle,
374            &MountInfo,
375            &FsStr,
376        ) -> Result<FsNodeHandle, Errno>,
377    ) -> Result<DirEntryHandle, Errno>
378    where
379        L: LockEqualOrBefore<FileOpsCore>,
380    {
381        let (entry, _exists) =
382            self.create_entry_internal(locked, current_task, mount, name, create_node_fn)?;
383        Ok(entry)
384    }
385
386    fn create_entry_internal<L>(
387        self: &DirEntryHandle,
388        locked: &mut Locked<L>,
389        current_task: &CurrentTask,
390        mount: &MountInfo,
391        name: &FsStr,
392        create_node_fn: impl FnOnce(
393            &mut Locked<L>,
394            &FsNodeHandle,
395            &MountInfo,
396            &FsStr,
397        ) -> Result<FsNodeHandle, Errno>,
398    ) -> Result<(DirEntryHandle, bool), Errno>
399    where
400        L: LockEqualOrBefore<FileOpsCore>,
401    {
402        if DirEntry::is_reserved_name(name) {
403            return error!(EEXIST);
404        }
405        // TODO: Do we need to check name for embedded NUL characters?
406        if name.len() > NAME_MAX as usize {
407            return error!(ENAMETOOLONG);
408        }
409        if name.contains(&path::SEPARATOR) {
410            return error!(EINVAL);
411        }
412        let (entry, exists) =
413            self.get_or_create_child(locked, current_task, mount, name, create_node_fn)?;
414        if !exists {
415            // An entry was created. Update the ctime and mtime of this directory.
416            self.node.update_ctime_mtime();
417            entry.notify_creation();
418        }
419        Ok((entry, exists))
420    }
421
422    // This is marked as test-only because it sets the owner/group to root instead of the current
423    // user to save a bit of typing in tests, but this shouldn't happen silently in production.
424    #[cfg(test)]
425    pub fn create_dir<L>(
426        self: &DirEntryHandle,
427        locked: &mut starnix_sync::Locked<L>,
428        current_task: &CurrentTask,
429        name: &FsStr,
430    ) -> Result<DirEntryHandle, Errno>
431    where
432        L: LockEqualOrBefore<FileOpsCore>,
433    {
434        self.create_dir_for_testing(locked, current_task, name)
435    }
436
437    // This function is for testing because it sets the owner/group to root instead of the current
438    // user to save a bit of typing in tests, but this shouldn't happen silently in production.
439    pub fn create_dir_for_testing<L>(
440        self: &DirEntryHandle,
441        locked: &mut Locked<L>,
442        current_task: &CurrentTask,
443        name: &FsStr,
444    ) -> Result<DirEntryHandle, Errno>
445    where
446        L: LockEqualOrBefore<FileOpsCore>,
447    {
448        // TODO: apply_umask
449        self.create_entry(
450            locked,
451            current_task,
452            &MountInfo::detached(),
453            name,
454            |locked, dir, mount, name| {
455                dir.create_node(
456                    locked,
457                    current_task,
458                    mount,
459                    name,
460                    starnix_uapi::file_mode::mode!(IFDIR, 0o777),
461                    starnix_uapi::device_type::DeviceType::NONE,
462                    FsCred::root(),
463                )
464            },
465        )
466    }
467
468    /// Creates an anonymous file.
469    ///
470    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
471    ///
472    /// Used by O_TMPFILE.
473    pub fn create_tmpfile<L>(
474        self: &DirEntryHandle,
475        locked: &mut Locked<L>,
476        current_task: &CurrentTask,
477        mount: &MountInfo,
478        mode: FileMode,
479        owner: FsCred,
480        flags: OpenFlags,
481    ) -> Result<DirEntryHandle, Errno>
482    where
483        L: LockEqualOrBefore<FileOpsCore>,
484    {
485        // Only directories can have children.
486        if !self.node.is_dir() {
487            return error!(ENOTDIR);
488        }
489        assert!(mode.is_reg());
490
491        // From <https://man7.org/linux/man-pages/man2/open.2.html>:
492        //
493        //   Specifying O_EXCL in conjunction with O_TMPFILE prevents a
494        //   temporary file from being linked into the filesystem in
495        //   the above manner.  (Note that the meaning of O_EXCL in
496        //   this case is different from the meaning of O_EXCL
497        //   otherwise.)
498        let link_behavior = if flags.contains(OpenFlags::EXCL) {
499            FsNodeLinkBehavior::Disallowed
500        } else {
501            FsNodeLinkBehavior::Allowed
502        };
503
504        let node =
505            self.node.create_tmpfile(locked, current_task, mount, mode, owner, link_behavior)?;
506        let local_name = format!("#{}", node.ino).into();
507        Ok(DirEntry::new_deleted(node, Some(self.clone()), local_name))
508    }
509
510    pub fn unlink<L>(
511        self: &DirEntryHandle,
512        locked: &mut Locked<L>,
513        current_task: &CurrentTask,
514        mount: &MountInfo,
515        name: &FsStr,
516        kind: UnlinkKind,
517        must_be_directory: bool,
518    ) -> Result<(), Errno>
519    where
520        L: LockEqualOrBefore<FileOpsCore>,
521    {
522        assert!(!DirEntry::is_reserved_name(name));
523
524        // child_to_unlink *must* be dropped after self_children (even in the error paths).
525        let child_to_unlink;
526
527        let mut self_children = self.lock_children();
528        child_to_unlink = self_children.component_lookup(locked, current_task, mount, name)?;
529        child_to_unlink.require_no_mounts(mount)?;
530
531        // Check that this filesystem entry must be a directory. This can
532        // happen if the path terminates with a trailing slash.
533        //
534        // Example: If we're unlinking a symlink `/foo/bar/`, this would
535        // result in `ENOTDIR` because of the trailing slash, even if
536        // `UnlinkKind::NonDirectory` was used.
537        if must_be_directory && !child_to_unlink.node.is_dir() {
538            return error!(ENOTDIR);
539        }
540
541        match kind {
542            UnlinkKind::Directory => {
543                if !child_to_unlink.node.is_dir() {
544                    return error!(ENOTDIR);
545                }
546            }
547            UnlinkKind::NonDirectory => {
548                if child_to_unlink.node.is_dir() {
549                    return error!(EISDIR);
550                }
551            }
552        }
553
554        self.node.unlink(locked, current_task, mount, name, &child_to_unlink.node)?;
555        self_children.children.remove(name);
556
557        std::mem::drop(self_children);
558        child_to_unlink.destroy(&current_task.kernel().mounts);
559
560        Ok(())
561    }
562
563    /// Destroy this directory entry.
564    ///
565    /// Notice that this method takes `self` by value to destroy this reference.
566    fn destroy(self: DirEntryHandle, mounts: &Mounts) {
567        let was_already_dead =
568            self.raise_flags(DirEntryFlags::IS_DEAD).contains(DirEntryFlags::IS_DEAD);
569        if was_already_dead {
570            return;
571        }
572        let unmount =
573            self.lower_flags(DirEntryFlags::HAS_MOUNTS).contains(DirEntryFlags::HAS_MOUNTS);
574        self.node.fs().will_destroy_dir_entry(&self);
575        if unmount {
576            mounts.unmount(&self);
577        }
578        self.notify_deletion();
579    }
580
581    /// Returns whether this entry is a descendant of |other|.
582    pub fn is_descendant_of(self: &DirEntryHandle, other: &DirEntryHandle) -> bool {
583        let scope = RcuReadScope::new();
584        let mut current = self.deref();
585        loop {
586            if std::ptr::eq(current, other.deref()) {
587                // We found |other|.
588                return true;
589            }
590            if let Some(parent) = current.parent_ref(&scope) {
591                current = parent;
592            } else {
593                // We reached the root of the file system.
594                return false;
595            }
596        }
597    }
598
599    /// Rename the file with old_basename in old_parent to new_basename in
600    /// new_parent.
601    ///
602    /// old_parent and new_parent must belong to the same file system.
603    pub fn rename<L>(
604        locked: &mut Locked<L>,
605        current_task: &CurrentTask,
606        old_parent: &DirEntryHandle,
607        old_mount: &MountInfo,
608        old_basename: &FsStr,
609        new_parent: &DirEntryHandle,
610        new_mount: &MountInfo,
611        new_basename: &FsStr,
612        flags: RenameFlags,
613    ) -> Result<(), Errno>
614    where
615        L: LockEqualOrBefore<FileOpsCore>,
616    {
617        // The nodes we are touching must be part of the same mount.
618        if old_mount != new_mount {
619            return error!(EXDEV);
620        }
621        // The mounts are equals, choose one.
622        let mount = old_mount;
623
624        // If either the old_basename or the new_basename is a reserved name
625        // (e.g., "." or ".."), then we cannot do the rename.
626        if DirEntry::is_reserved_name(old_basename) || DirEntry::is_reserved_name(new_basename) {
627            if flags.contains(RenameFlags::NOREPLACE) {
628                return error!(EEXIST);
629            }
630            return error!(EBUSY);
631        }
632
633        // If the names and parents are the same, then there's nothing to do
634        // and we can report success.
635        if Arc::ptr_eq(&old_parent.node, &new_parent.node) && old_basename == new_basename {
636            return Ok(());
637        }
638
639        // This task must have write access to the old and new parent nodes.
640        old_parent.node.check_access(
641            locked,
642            current_task,
643            mount,
644            Access::WRITE,
645            CheckAccessReason::InternalPermissionChecks,
646            old_parent,
647        )?;
648        new_parent.node.check_access(
649            locked,
650            current_task,
651            mount,
652            Access::WRITE,
653            CheckAccessReason::InternalPermissionChecks,
654            new_parent,
655        )?;
656
657        // The mount check ensures that the nodes we're touching are part of the
658        // same file system. It doesn't matter where we grab the FileSystem reference from.
659        let fs = old_parent.node.fs();
660
661        // We need to hold these DirEntryHandles until after we drop all the
662        // locks so that we do not deadlock when we drop them.
663        let renamed;
664        let mut maybe_replaced = None;
665
666        {
667            // Before we take any locks, we need to take the rename mutex on
668            // the file system. This lock ensures that no other rename
669            // operations are happening in this file system while we're
670            // analyzing this rename operation.
671            //
672            // For example, we grab writer locks on both old_parent and
673            // new_parent. If there was another rename operation in flight with
674            // old_parent and new_parent reversed, then we could deadlock while
675            // trying to acquire these locks.
676            let _lock = fs.rename_mutex.lock();
677
678            // Compute the list of ancestors of new_parent to check whether new_parent is a
679            // descendant of the renamed node. This must be computed before taking any lock to
680            // prevent lock inversions.
681            //
682            // TODO: This walk is now lock-free, which means we don't need to materialize this
683            // list. We can just walk the tree and check for cycles.
684            let mut new_parent_ancestor_list = Vec::<DirEntryHandle>::new();
685            {
686                let mut current = Some(new_parent.clone());
687                while let Some(entry) = current {
688                    current = entry.parent();
689                    new_parent_ancestor_list.push(entry);
690                }
691            }
692
693            // We cannot simply grab the locks on old_parent and new_parent
694            // independently because old_parent and new_parent might be the
695            // same directory entry. Instead, we use the RenameGuard helper to
696            // grab the appropriate locks.
697            let mut state = RenameGuard::lock(old_parent, new_parent);
698
699            // Now that we know the old_parent child list cannot change, we
700            // establish the DirEntry that we are going to try to rename.
701            renamed =
702                state.old_parent().component_lookup(locked, current_task, mount, old_basename)?;
703
704            // Check whether the sticky bit on the old parent prevents us from
705            // removing this child.
706            old_parent.node.check_sticky_bit(current_task, &renamed.node)?;
707
708            // If new_parent is a descendant of renamed, the operation would
709            // create a cycle. That's disallowed.
710            if new_parent_ancestor_list.into_iter().any(|entry| Arc::ptr_eq(&entry, &renamed)) {
711                return error!(EINVAL);
712            }
713
714            // Check whether the renamed entry is a mountpoint.
715            // TODO: We should hold a read lock on the mount points for this
716            //       namespace to prevent the child from becoming a mount point
717            //       while this function is executing.
718            renamed.require_no_mounts(mount)?;
719
720            // We need to check if there is already a DirEntry with
721            // new_basename in new_parent. If so, there are additional checks
722            // we need to perform.
723            match state.new_parent().component_lookup(locked, current_task, mount, new_basename) {
724                Ok(replaced) => {
725                    // Set `maybe_replaced` now to ensure it gets dropped in the right order.
726                    let replaced = maybe_replaced.insert(replaced);
727
728                    if flags.contains(RenameFlags::NOREPLACE) {
729                        return error!(EEXIST);
730                    }
731
732                    // Sayeth https://man7.org/linux/man-pages/man2/rename.2.html:
733                    //
734                    // "If oldpath and newpath are existing hard links referring to the
735                    // same file, then rename() does nothing, and returns a success
736                    // status."
737                    if Arc::ptr_eq(&renamed.node, &replaced.node) {
738                        return Ok(());
739                    }
740
741                    // Sayeth https://man7.org/linux/man-pages/man2/rename.2.html:
742                    //
743                    // "oldpath can specify a directory.  In this case, newpath must"
744                    // either not exist, or it must specify an empty directory."
745                    if replaced.node.is_dir() {
746                        // Check whether the replaced entry is a mountpoint.
747                        // TODO: We should hold a read lock on the mount points for this
748                        //       namespace to prevent the child from becoming a mount point
749                        //       while this function is executing.
750                        replaced.require_no_mounts(mount)?;
751                    }
752
753                    if !flags.intersects(RenameFlags::EXCHANGE | RenameFlags::REPLACE_ANY) {
754                        if renamed.node.is_dir() && !replaced.node.is_dir() {
755                            return error!(ENOTDIR);
756                        } else if !renamed.node.is_dir() && replaced.node.is_dir() {
757                            return error!(EISDIR);
758                        }
759                    }
760                }
761                // It's fine for the lookup to fail to find a child.
762                Err(errno) if errno == ENOENT => {}
763                // However, other errors are fatal.
764                Err(e) => return Err(e),
765            }
766
767            security::check_fs_node_rename_access(
768                current_task,
769                &old_parent.node,
770                &renamed.node,
771                &new_parent.node,
772                maybe_replaced.as_ref().map(|dir_entry| dir_entry.node.deref().as_ref()),
773                old_basename,
774                new_basename,
775            )?;
776
777            // We've found all the errors that we know how to find. Ask the
778            // file system to actually execute the rename operation. Once the
779            // file system has executed the rename, we are no longer allowed to
780            // fail because we will not be able to return the system to a
781            // consistent state.
782
783            if flags.contains(RenameFlags::EXCHANGE) {
784                let replaced = maybe_replaced.as_ref().ok_or_else(|| errno!(ENOENT))?;
785                fs.exchange(
786                    current_task,
787                    &renamed.node,
788                    &old_parent.node,
789                    old_basename,
790                    &replaced.node,
791                    &new_parent.node,
792                    new_basename,
793                )?;
794            } else {
795                fs.rename(
796                    locked,
797                    current_task,
798                    &old_parent.node,
799                    old_basename,
800                    &new_parent.node,
801                    new_basename,
802                    &renamed.node,
803                    maybe_replaced.as_ref().map(|replaced| &replaced.node),
804                )?;
805            }
806
807            // We need to update the parent and local name for the DirEntry
808            // we are renaming to reflect its new parent and its new name.
809            renamed.set_parent(new_parent.clone());
810            renamed.local_name.update(new_basename.to_owned());
811
812            // Actually add the renamed child to the new_parent's child list.
813            // This operation implicitly removes the replaced child (if any)
814            // from the child list.
815            state.new_parent().children.insert(new_basename.into(), Arc::downgrade(&renamed));
816
817            #[cfg(detect_lock_cycles)]
818            unsafe {
819                // Lock ordering is enforced from parent-to-child, and therefore we need to
820                // reset the lock ordering constraints when we reorder the tree nodes.
821                util::reset_dependencies(renamed.children.raw());
822            }
823
824            if flags.contains(RenameFlags::EXCHANGE) {
825                // Reparent `replaced` when exchanging.
826                let replaced =
827                    maybe_replaced.as_ref().expect("replaced expected with RENAME_EXCHANGE");
828                replaced.set_parent(old_parent.clone());
829                replaced.local_name.update(old_basename.to_owned());
830                state.old_parent().children.insert(old_basename.into(), Arc::downgrade(replaced));
831
832                #[cfg(detect_lock_cycles)]
833                unsafe {
834                    // Lock ordering is enforced from parent-to-child, and therefore we need to
835                    // reset the lock ordering constraints when we reorder the tree nodes.
836                    util::reset_dependencies(replaced.children.raw());
837                }
838            } else {
839                // Remove the renamed child from the old_parent's child list.
840                state.old_parent().children.remove(old_basename);
841            }
842        };
843
844        fs.purge_old_entries();
845
846        if let Some(replaced) = maybe_replaced {
847            if !flags.contains(RenameFlags::EXCHANGE) {
848                replaced.destroy(&current_task.kernel().mounts);
849            }
850        }
851
852        // Renaming a file updates its ctime.
853        renamed.node.update_ctime();
854
855        let mode = renamed.node.info().mode;
856        let cookie = current_task.kernel().get_next_inotify_cookie();
857        old_parent.node.notify(InotifyMask::MOVE_FROM, cookie, old_basename, mode, false);
858        new_parent.node.notify(InotifyMask::MOVE_TO, cookie, new_basename, mode, false);
859        renamed.node.notify(InotifyMask::MOVE_SELF, 0, Default::default(), mode, false);
860
861        Ok(())
862    }
863
864    pub fn get_children<F, T>(&self, callback: F) -> T
865    where
866        F: FnOnce(&DirEntryChildren) -> T,
867    {
868        let children = self.children.read();
869        callback(&children)
870    }
871
872    /// Remove the child with the given name from the children cache.  The child must not have any
873    /// mounts.
874    pub fn remove_child(&self, name: &FsStr, mounts: &Mounts) {
875        let mut children = self.children.write();
876        let child = children.get(name).and_then(Weak::upgrade);
877        if let Some(child) = child {
878            children.remove(name);
879            std::mem::drop(children);
880            child.destroy(mounts);
881        }
882    }
883
884    fn get_or_create_child<L>(
885        self: &DirEntryHandle,
886        locked: &mut Locked<L>,
887        current_task: &CurrentTask,
888        mount: &MountInfo,
889        name: &FsStr,
890        create_fn: impl FnOnce(
891            &mut Locked<L>,
892            &FsNodeHandle,
893            &MountInfo,
894            &FsStr,
895        ) -> Result<FsNodeHandle, Errno>,
896    ) -> Result<(DirEntryHandle, bool), Errno>
897    where
898        L: LockEqualOrBefore<FileOpsCore>,
899    {
900        assert!(!DirEntry::is_reserved_name(name));
901        // Only directories can have children.
902        if !self.node.is_dir() {
903            return error!(ENOTDIR);
904        }
905        // The user must be able to search the directory (requires the EXEC permission)
906        self.node.check_access(
907            locked,
908            current_task,
909            mount,
910            Access::EXEC,
911            CheckAccessReason::InternalPermissionChecks,
912            self,
913        )?;
914
915        // Check if the child is already in children. In that case, we can
916        // simply return the child and we do not need to call init_fn.
917        let child = self.children.read().get(name).and_then(Weak::upgrade);
918        let (child, create_result) = if let Some(child) = child {
919            // Do not cache a child in a locked directory
920            if self.node.fail_if_locked(current_task).is_ok() {
921                child.node.fs().did_access_dir_entry(&child);
922            }
923            (child, CreationResult::Existed { create_fn })
924        } else {
925            let (child, create_result) = self.lock_children().get_or_create_child(
926                locked,
927                current_task,
928                mount,
929                name,
930                create_fn,
931            )?;
932            child.node.fs().purge_old_entries();
933            (child, create_result)
934        };
935
936        let (child, exists) = match create_result {
937            CreationResult::Created => (child, false),
938            CreationResult::Existed { create_fn } => {
939                if child.ops.revalidate(
940                    locked.cast_locked::<FileOpsCore>(),
941                    current_task,
942                    &child,
943                )? {
944                    (child, true)
945                } else {
946                    self.internal_remove_child(&child);
947                    child.destroy(&current_task.kernel().mounts);
948
949                    let (child, create_result) = self.lock_children().get_or_create_child(
950                        locked,
951                        current_task,
952                        mount,
953                        name,
954                        create_fn,
955                    )?;
956                    child.node.fs().purge_old_entries();
957                    (child, matches!(create_result, CreationResult::Existed { .. }))
958                }
959            }
960        };
961
962        Ok((child, exists))
963    }
964
965    // This function is only useful for tests and has some oddities.
966    //
967    // For example, not all the children might have been looked up yet, which
968    // means the returned vector could be missing some names.
969    //
970    // Also, the vector might have "extra" names that are in the process of
971    // being looked up. If the lookup fails, they'll be removed.
972    #[cfg(test)]
973    pub fn copy_child_names(&self) -> Vec<FsString> {
974        let scope = RcuReadScope::new();
975        self.children
976            .read()
977            .values()
978            .filter_map(|child| Weak::upgrade(child).map(|c| c.local_name.read(&scope).to_owned()))
979            .collect()
980    }
981
982    fn internal_remove_child(&self, child: &DirEntry) {
983        let mut children = self.children.write();
984        let scope = RcuReadScope::new();
985        let local_name = child.local_name.read(&scope);
986        if let Some(weak_child) = children.get(local_name) {
987            // If this entry is occupied, we need to check whether child is
988            // the current occupant. If so, we should remove the entry
989            // because the child no longer exists.
990            if std::ptr::eq(weak_child.as_ptr(), child) {
991                children.remove(local_name);
992            }
993        }
994    }
995
996    /// Notifies watchers on the current node and its parent about an event.
997    pub fn notify(&self, event_mask: InotifyMask) {
998        self.notify_watchers(event_mask, self.is_dead());
999    }
1000
1001    /// Notifies watchers on the current node and its parent about an event.
1002    ///
1003    /// Used for FSNOTIFY_EVENT_INODE events, which ignore IN_EXCL_UNLINK.
1004    pub fn notify_ignoring_excl_unlink(&self, event_mask: InotifyMask) {
1005        // We pretend that this directory entry is not dead to ignore IN_EXCL_UNLINK.
1006        self.notify_watchers(event_mask, false);
1007    }
1008
1009    fn notify_watchers(&self, event_mask: InotifyMask, is_dead: bool) {
1010        let mode = self.node.info().mode;
1011        {
1012            let scope = RcuReadScope::new();
1013            if let Some(parent) = self.parent_ref(&scope) {
1014                let local_name = self.local_name.read(&scope);
1015                parent.node.notify(event_mask, 0, local_name, mode, is_dead);
1016            }
1017        }
1018        self.node.notify(event_mask, 0, Default::default(), mode, is_dead);
1019    }
1020
1021    /// Notifies parents about creation, and notifies current node about link_count change.
1022    fn notify_creation(&self) {
1023        let mode = self.node.info().mode;
1024        if Arc::strong_count(&self.node) > 1 {
1025            // Notify about link change only if there is already a hardlink.
1026            self.node.notify(InotifyMask::ATTRIB, 0, Default::default(), mode, false);
1027        }
1028        let scope = RcuReadScope::new();
1029        if let Some(parent) = self.parent_ref(&scope) {
1030            let local_name = self.local_name.read(&scope);
1031            parent.node.notify(InotifyMask::CREATE, 0, local_name, mode, false);
1032        }
1033    }
1034
1035    /// Notifies watchers on the current node about deletion if this is the
1036    /// last hardlink, and drops the DirEntryHandle kept by Inotify.
1037    /// Parent is also notified about deletion.
1038    fn notify_deletion(&self) {
1039        let mode = self.node.info().mode;
1040        if !mode.is_dir() {
1041            // Linux notifies link count change for non-directories.
1042            self.node.notify(InotifyMask::ATTRIB, 0, Default::default(), mode, false);
1043        }
1044
1045        let scope = RcuReadScope::new();
1046        if let Some(parent) = self.parent_ref(&scope) {
1047            let local_name = self.local_name.read(&scope);
1048            parent.node.notify(InotifyMask::DELETE, 0, local_name, mode, false);
1049        }
1050
1051        // This check is incorrect if there's another hard link to this FsNode that isn't in
1052        // memory at the moment.
1053        if Arc::strong_count(&self.node) == 1 {
1054            self.node.notify(InotifyMask::DELETE_SELF, 0, Default::default(), mode, false);
1055        }
1056    }
1057
1058    /// Returns true if this entry has mounts.
1059    pub fn has_mounts(&self) -> bool {
1060        self.flags().contains(DirEntryFlags::HAS_MOUNTS)
1061    }
1062
1063    /// Records whether or not the entry has mounts.
1064    pub fn set_has_mounts(&self, v: bool) {
1065        if v {
1066            self.raise_flags(DirEntryFlags::HAS_MOUNTS);
1067        } else {
1068            self.lower_flags(DirEntryFlags::HAS_MOUNTS);
1069        }
1070    }
1071
1072    /// Verifies this directory has nothing mounted on it.
1073    fn require_no_mounts(self: &Arc<Self>, parent_mount: &MountInfo) -> Result<(), Errno> {
1074        if self.has_mounts() {
1075            if let Some(mount) = parent_mount.as_ref() {
1076                if mount.read().has_submount(self) {
1077                    return error!(EBUSY);
1078                }
1079            }
1080        }
1081        Ok(())
1082    }
1083}
1084
1085struct DirEntryLockedChildren<'a> {
1086    entry: &'a DirEntryHandle,
1087    children: RwLockWriteGuard<'a, DirEntryChildren>,
1088}
1089
1090enum CreationResult<F> {
1091    Created,
1092    Existed { create_fn: F },
1093}
1094
1095impl<'a> DirEntryLockedChildren<'a> {
1096    fn component_lookup<L>(
1097        &mut self,
1098        locked: &mut Locked<L>,
1099        current_task: &CurrentTask,
1100        mount: &MountInfo,
1101        name: &FsStr,
1102    ) -> Result<DirEntryHandle, Errno>
1103    where
1104        L: LockEqualOrBefore<FileOpsCore>,
1105    {
1106        assert!(!DirEntry::is_reserved_name(name));
1107        let (node, _) =
1108            self.get_or_create_child(locked, current_task, mount, name, |_, _, _, _| {
1109                error!(ENOENT)
1110            })?;
1111        Ok(node)
1112    }
1113
1114    fn get_or_create_child<
1115        L,
1116        F: FnOnce(&mut Locked<L>, &FsNodeHandle, &MountInfo, &FsStr) -> Result<FsNodeHandle, Errno>,
1117    >(
1118        &mut self,
1119        locked: &mut Locked<L>,
1120        current_task: &CurrentTask,
1121        mount: &MountInfo,
1122        name: &FsStr,
1123        create_fn: F,
1124    ) -> Result<(DirEntryHandle, CreationResult<F>), Errno>
1125    where
1126        L: LockEqualOrBefore<FileOpsCore>,
1127    {
1128        let create_child = |locked: &mut Locked<L>, create_fn: F| {
1129            // Before creating the child, check for existence.
1130            let (node, create_result) =
1131                match self.entry.node.lookup(locked, current_task, mount, name) {
1132                    Ok(node) => (node, CreationResult::Existed { create_fn }),
1133                    Err(e) if e == ENOENT => {
1134                        (create_fn(locked, &self.entry.node, mount, name)?, CreationResult::Created)
1135                    }
1136                    Err(e) => return Err(e),
1137                };
1138
1139            assert!(
1140                node.info().mode & FileMode::IFMT != FileMode::EMPTY,
1141                "FsNode initialization did not populate the FileMode in FsNodeInfo."
1142            );
1143
1144            let entry = DirEntry::new(node, Some(self.entry.clone()), name.to_owned());
1145            Ok((entry, create_result))
1146        };
1147
1148        let (child, create_result) = match self.children.entry(name.to_owned()) {
1149            Entry::Vacant(entry) => {
1150                let (child, create_result) = create_child(locked, create_fn)?;
1151                // Do not cache a child in a locked directory
1152                if self.entry.node.fail_if_locked(current_task).is_ok() {
1153                    entry.insert(Arc::downgrade(&child));
1154                }
1155                (child, create_result)
1156            }
1157            Entry::Occupied(mut entry) => {
1158                // It's possible that the upgrade will succeed this time around because we dropped
1159                // the read lock before acquiring the write lock. Another thread might have
1160                // populated this entry while we were not holding any locks.
1161                if let Some(child) = Weak::upgrade(entry.get()) {
1162                    // Do not cache a child in a locked directory
1163                    if self.entry.node.fail_if_locked(current_task).is_ok() {
1164                        child.node.fs().did_access_dir_entry(&child);
1165                    }
1166                    return Ok((child, CreationResult::Existed { create_fn }));
1167                }
1168                let (child, create_result) = create_child(locked, create_fn)?;
1169                // Do not cache a child in a locked directory
1170                if self.entry.node.fail_if_locked(current_task).is_ok() {
1171                    entry.insert(Arc::downgrade(&child));
1172                }
1173                (child, create_result)
1174            }
1175        };
1176
1177        security::fs_node_init_with_dentry(locked, current_task, &child)?;
1178
1179        Ok((child, create_result))
1180    }
1181}
1182
1183impl fmt::Debug for DirEntry {
1184    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1185        let scope = RcuReadScope::new();
1186        let mut parents = vec![];
1187        let mut maybe_parent = self.parent_ref(&scope);
1188        while let Some(parent) = maybe_parent {
1189            parents.push(parent.local_name.read(&scope));
1190            maybe_parent = parent.parent_ref(&scope);
1191        }
1192        let mut builder = f.debug_struct("DirEntry");
1193        builder.field("id", &(self as *const DirEntry));
1194        builder.field("local_name", &self.local_name.read(&scope).to_owned());
1195        if !parents.is_empty() {
1196            builder.field("parents", &parents);
1197        }
1198        builder.finish()
1199    }
1200}
1201
1202struct RenameGuard<'a> {
1203    old_parent_guard: DirEntryLockedChildren<'a>,
1204    new_parent_guard: Option<DirEntryLockedChildren<'a>>,
1205}
1206
1207impl<'a> RenameGuard<'a> {
1208    fn lock(old_parent: &'a DirEntryHandle, new_parent: &'a DirEntryHandle) -> Self {
1209        if Arc::ptr_eq(old_parent, new_parent) {
1210            Self { old_parent_guard: old_parent.lock_children(), new_parent_guard: None }
1211        } else {
1212            // Following gVisor, these locks are taken in ancestor-to-descendant order.
1213            // Moreover, if the node are not comparable, they are taken from smallest inode to
1214            // biggest.
1215            if new_parent.is_descendant_of(old_parent)
1216                || (!old_parent.is_descendant_of(new_parent)
1217                    && old_parent.node.node_key() < new_parent.node.node_key())
1218            {
1219                let old_parent_guard = old_parent.lock_children();
1220                let new_parent_guard = new_parent.lock_children();
1221                Self { old_parent_guard, new_parent_guard: Some(new_parent_guard) }
1222            } else {
1223                let new_parent_guard = new_parent.lock_children();
1224                let old_parent_guard = old_parent.lock_children();
1225                Self { old_parent_guard, new_parent_guard: Some(new_parent_guard) }
1226            }
1227        }
1228    }
1229
1230    fn old_parent(&mut self) -> &mut DirEntryLockedChildren<'a> {
1231        &mut self.old_parent_guard
1232    }
1233
1234    fn new_parent(&mut self) -> &mut DirEntryLockedChildren<'a> {
1235        if let Some(new_guard) = self.new_parent_guard.as_mut() {
1236            new_guard
1237        } else {
1238            &mut self.old_parent_guard
1239        }
1240    }
1241}
1242
1243/// The Drop trait for DirEntry removes the entry from the child list of the
1244/// parent entry, which means we cannot drop DirEntry objects while holding a
1245/// lock on the parent's child list.
1246impl Drop for DirEntry {
1247    fn drop(&mut self) {
1248        let scope = RcuReadScope::new();
1249        let maybe_parent = self.parent_ref(&scope);
1250        self.parent.update(None);
1251        if let Some(parent) = maybe_parent {
1252            parent.internal_remove_child(self);
1253        }
1254    }
1255}