Skip to main content

starnix_core/vfs/
dir_entry.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::security;
6use crate::task::CurrentTask;
7use crate::vfs::{
8    CheckAccessReason, FileHandle, FileObject, FsLockDepType, FsNodeHandle, FsNodeLinkBehavior,
9    FsStr, FsString, LookupVec, MountInfo, Mounts, NamespaceNode, UnlinkKind, path,
10};
11use atomic_bitflags::atomic_bitflags;
12use bitflags::bitflags;
13use fuchsia_rcu::{RcuOptionArc, RcuReadScope};
14use starnix_rcu::RcuString;
15use starnix_sync::{
16    DirEntryChildrenLevel, DirEntryChildrenRecursiveLevel, DynamicLockDepRwLock, FileOpsCore,
17    FuseDirEntryChildrenLevel, LockDepWriteGuard, LockEqualOrBefore, Locked, allow_subclass,
18};
19use starnix_uapi::auth::FsCred;
20use starnix_uapi::errors::{ENOENT, Errno};
21use starnix_uapi::file_mode::{Access, FileMode};
22use starnix_uapi::inotify_mask::InotifyMask;
23use starnix_uapi::open_flags::OpenFlags;
24use starnix_uapi::{NAME_MAX, RENAME_EXCHANGE, RENAME_NOREPLACE, RENAME_WHITEOUT, errno, error};
25use std::collections::BTreeMap;
26use std::collections::btree_map::Entry;
27use std::fmt;
28use std::ops::Deref;
29use std::sync::atomic::Ordering;
30use std::sync::{Arc, Weak};
31#[cfg(detect_lock_cycles)]
32use tracing_mutex::util;
33
34bitflags! {
35    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
36    pub struct RenameFlags: u32 {
37        // Exchange the entries.
38        const EXCHANGE = RENAME_EXCHANGE;
39
40        // Don't overwrite an existing DirEntry.
41        const NOREPLACE = RENAME_NOREPLACE;
42
43        // Create a "whiteout" object to replace the file.
44        const WHITEOUT = RENAME_WHITEOUT;
45
46        // Allow replacing any file with a directory. This is an internal flag used only
47        // internally inside Starnix for OverlayFS.
48        const REPLACE_ANY = 1 << 31;
49
50        // Internal flags that cannot be passed to `sys_rename()`
51        const INTERNAL = Self::REPLACE_ANY.bits();
52    }
53}
54
55pub trait DirEntryOps: Send + Sync + 'static {
56    /// Revalidate the [`DirEntry`], if needed.
57    ///
58    /// Most filesystems don't need to do any revalidations because they are "local"
59    /// and all changes to nodes go through the kernel. However some filesystems
60    /// allow changes to happen through other means (e.g. NFS, FUSE) and these
61    /// filesystems need a way to let the kernel know it may need to refresh its
62    /// cached metadata. This method provides that hook for such filesystems.
63    ///
64    /// For more details, see:
65    ///  - https://www.halolinux.us/kernel-reference/the-dentry-cache.html
66    ///  - https://www.kernel.org/doc/html/latest/filesystems/path-lookup.html#revalidation-and-automounts
67    ///  - https://lwn.net/Articles/649115/
68    ///  - https://www.infradead.org/~mchehab/kernel_docs/filesystems/path-walking.html
69    ///
70    /// Returns `Ok(valid)` where `valid` indicates if the `DirEntry` is still valid,
71    /// or an error.
72    fn revalidate(
73        &self,
74        _locked: &mut Locked<FileOpsCore>,
75        _: &CurrentTask,
76        _: &DirEntry,
77    ) -> Result<bool, Errno> {
78        Ok(true)
79    }
80}
81
82atomic_bitflags! {
83    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
84    pub struct DirEntryFlags: u8 {
85        /// Whether this directory entry has been removed from the tree.
86        const IS_DEAD = 1 << 0;
87
88        /// Whether the entry has filesystems mounted on top of it.
89        const HAS_MOUNTS = 1 << 1;
90    }
91}
92
93pub struct DefaultDirEntryOps;
94
95impl DirEntryOps for DefaultDirEntryOps {}
96
97/// An entry in a directory.
98///
99/// This structure assigns a name to an FsNode in a given file system. An
100/// FsNode might have multiple directory entries, for example if there are more
101/// than one hard link to the same FsNode. In those cases, each hard link will
102/// have a different parent and a different local_name because each hard link
103/// has its own DirEntry object.
104///
105/// A directory cannot have more than one hard link, which means there is a
106/// single DirEntry for each Directory FsNode. That invariant lets us store the
107/// children for a directory in the DirEntry rather than in the FsNode.
108pub struct DirEntry {
109    /// The FsNode referenced by this DirEntry.
110    ///
111    /// A given FsNode can be referenced by multiple DirEntry objects, for
112    /// example if there are multiple hard links to a given FsNode.
113    pub node: FsNodeHandle,
114
115    /// The [`DirEntryOps`] for this `DirEntry`.
116    ///
117    /// The `DirEntryOps` are implemented by the individual file systems to provide
118    /// specific behaviours for this `DirEntry`.
119    ops: Box<dyn DirEntryOps>,
120
121    /// The parent DirEntry.
122    ///
123    /// The DirEntry tree has strong references from child-to-parent and weak
124    /// references from parent-to-child. This design ensures that the parent
125    /// chain is always populated in the cache, but some children might be
126    /// missing from the cache.
127    parent: RcuOptionArc<DirEntry>,
128
129    /// The [`DirEntryFlags`] for this `DirEntry`.
130    flags: AtomicDirEntryFlags,
131
132    /// The name that this parent calls this child.
133    ///
134    /// This name might not be reflected in the full path in the namespace that
135    /// contains this DirEntry. For example, this DirEntry might be the root of
136    /// a chroot.
137    ///
138    /// Most callers that want to work with names for DirEntries should use the
139    /// NamespaceNodes.
140    local_name: RcuString,
141
142    /// A partial cache of the children of this DirEntry.
143    ///
144    /// DirEntries are added to this cache when they are looked up and removed
145    /// when they are no longer referenced.
146    ///
147    // FIXME(b/379929394): The lock ordering here assumes parent-to-child lock acquisition, which
148    // a number of algorithms in the DirEntry operations also assume. This assumption can be broken
149    // by the rename operation, which can move nodes around the hierarchy. See the referenced bug
150    // for more details, the current mitigations, and potentials for long-term solutions.
151    children: DynamicLockDepRwLock<DirEntryChildren>,
152}
153type DirEntryChildren = BTreeMap<FsString, Weak<DirEntry>>;
154
155pub type DirEntryHandle = Arc<DirEntry>;
156
157impl DirEntry {
158    #[allow(clippy::let_and_return)]
159    pub fn new_uncached(
160        node: FsNodeHandle,
161        parent: Option<DirEntryHandle>,
162        local_name: FsString,
163    ) -> DirEntryHandle {
164        let ops = node.create_dir_entry_ops();
165        let fs_lockdep_type = node.fs().fs_lockdep_type();
166        let result = Arc::new(DirEntry {
167            node,
168            ops,
169            parent: RcuOptionArc::new(parent),
170            flags: Default::default(),
171            local_name: local_name.into(),
172            children: match fs_lockdep_type {
173                FsLockDepType::Normal => {
174                    DynamicLockDepRwLock::new::<DirEntryChildrenLevel>(Default::default())
175                }
176                FsLockDepType::Recursive => {
177                    DynamicLockDepRwLock::new::<DirEntryChildrenRecursiveLevel>(Default::default())
178                }
179                FsLockDepType::Fuse => {
180                    DynamicLockDepRwLock::new::<FuseDirEntryChildrenLevel>(Default::default())
181                }
182            },
183        });
184        #[cfg(any(test, debug_assertions))]
185        {
186            // Taking this lock tells the lock tracing system about the parent/child ordering
187            // relation.
188            let _token = allow_subclass();
189            let _l1 = result.children.read();
190        }
191        result
192    }
193
194    pub fn new(
195        node: FsNodeHandle,
196        parent: Option<DirEntryHandle>,
197        local_name: FsString,
198    ) -> DirEntryHandle {
199        let result = Self::new_uncached(node, parent, local_name);
200        result.node.fs().did_create_dir_entry(&result);
201        result
202    }
203
204    /// Returns a new DirEntry for the given `node` without parent. The entry has no local name and
205    /// is not cached.
206    pub fn new_unrooted(node: FsNodeHandle) -> DirEntryHandle {
207        Self::new_uncached(node, None, FsString::default())
208    }
209
210    /// Returns a new `DirEntry` that is ready marked as having been deleted.
211    pub fn new_deleted(
212        node: FsNodeHandle,
213        parent: Option<DirEntryHandle>,
214        local_name: FsString,
215    ) -> DirEntryHandle {
216        let entry = DirEntry::new_uncached(node, parent, local_name);
217        entry.raise_flags(DirEntryFlags::IS_DEAD);
218        entry
219    }
220
221    /// Returns a file handle to this entry, associated with an anonymous namespace.
222    pub fn open_anonymous<L>(
223        self: &DirEntryHandle,
224        locked: &mut Locked<L>,
225        current_task: &CurrentTask,
226        flags: OpenFlags,
227    ) -> Result<FileHandle, Errno>
228    where
229        L: LockEqualOrBefore<FileOpsCore>,
230    {
231        let ops = self.node.create_file_ops(locked, current_task, flags)?;
232        FileObject::new(
233            locked,
234            current_task,
235            ops,
236            NamespaceNode::new_anonymous(self.clone()),
237            flags,
238        )
239    }
240
241    /// Set the children of this DirEntry to the given `children`. This should only ever be called
242    /// when children is empty.
243    pub fn set_children(self: &DirEntryHandle, children: BTreeMap<FsString, DirEntryHandle>) {
244        let mut dir_entry_children = self.lock_children();
245        assert!(dir_entry_children.children.is_empty());
246        for (name, child) in children.into_iter() {
247            child.set_parent(self.clone());
248            dir_entry_children.children.insert(name, Arc::downgrade(&child));
249        }
250    }
251
252    fn lock_children<'a>(self: &'a DirEntryHandle) -> DirEntryLockedChildren<'a> {
253        DirEntryLockedChildren { entry: self, children: self.children.write() }
254    }
255
256    /// The parent DirEntry.
257    pub fn parent(&self) -> Option<DirEntryHandle> {
258        self.parent.to_option_arc()
259    }
260
261    /// Returns a reference to the parent DirEntry.
262    ///
263    /// The reference is only valid for the duration of the RCU read scope.
264    pub fn parent_ref<'a>(&'a self, scope: &'a RcuReadScope) -> Option<&'a DirEntry> {
265        self.parent.as_ref(scope)
266    }
267
268    /// Set the parent of this DirEntry.
269    pub fn set_parent(&self, parent: DirEntryHandle) {
270        self.parent.update(Some(parent));
271    }
272
273    /// The parent DirEntry object or this DirEntry if this entry is the root.
274    ///
275    /// Useful when traversing up the tree if you always want to find a parent
276    /// (e.g., for "..").
277    ///
278    /// Be aware that the root of one file system might be mounted as a child
279    /// in another file system. For that reason, consider walking the
280    /// NamespaceNode tree (which understands mounts) rather than the DirEntry
281    /// tree.
282    pub fn parent_or_self(self: &DirEntryHandle) -> DirEntryHandle {
283        self.parent().unwrap_or_else(|| self.clone())
284    }
285
286    /// The name that this parent calls this child.
287    ///
288    /// The reference is only valid for the duration of the RCU read scope.
289    pub fn local_name<'a>(&self, scope: &'a RcuReadScope) -> &'a FsStr {
290        self.local_name.read(scope)
291    }
292
293    /// Whether the given name has special semantics as a directory entry.
294    ///
295    /// Specifically, whether the name is empty (which means "self"), dot
296    /// (which also means "self"), or dot dot (which means "parent").
297    pub fn is_reserved_name(name: &FsStr) -> bool {
298        name.is_empty() || name == "." || name == ".."
299    }
300
301    /// Returns the flags of this DirEntry.
302    pub fn flags(&self) -> DirEntryFlags {
303        self.flags.load(Ordering::Acquire)
304    }
305
306    /// Raises the flags of this DirEntry.
307    ///
308    /// Returns the flags of this DirEntry before the flags were raised.
309    pub fn raise_flags(&self, flags: DirEntryFlags) -> DirEntryFlags {
310        self.flags.fetch_or(flags, Ordering::AcqRel)
311    }
312
313    /// Lowers the flags of this DirEntry.
314    ///
315    /// Returns the flags of this DirEntry before the flags were lowered.
316    pub fn lower_flags(&self, flags: DirEntryFlags) -> DirEntryFlags {
317        self.flags.fetch_and(!flags, Ordering::AcqRel)
318    }
319
320    /// Returns true if this DirEntry is dead.
321    pub fn is_dead(&self) -> bool {
322        self.flags().contains(DirEntryFlags::IS_DEAD)
323    }
324
325    /// Look up a directory entry with the given name as direct child of this
326    /// entry.
327    pub fn component_lookup<L>(
328        self: &DirEntryHandle,
329        locked: &mut Locked<L>,
330        current_task: &CurrentTask,
331        mount: &MountInfo,
332        name: &FsStr,
333    ) -> Result<DirEntryHandle, Errno>
334    where
335        L: LockEqualOrBefore<FileOpsCore>,
336    {
337        let (node, _) = self.get_or_create_child(
338            locked,
339            current_task,
340            mount,
341            name,
342            |locked, d, mount, name| d.lookup(locked, current_task, mount, name),
343        )?;
344        Ok(node)
345    }
346
347    pub fn get_children_pipelined<L>(
348        self: &DirEntryHandle,
349        locked: &mut Locked<L>,
350        current_task: &CurrentTask,
351        mount: &MountInfo,
352        names: &[&FsStr],
353    ) -> LookupVec<Result<DirEntryHandle, Errno>>
354    where
355        L: LockEqualOrBefore<FileOpsCore>,
356    {
357        let locked = locked.cast_locked::<FileOpsCore>();
358
359        let mut nodes = LookupVec::new();
360        let mut results = LookupVec::new();
361        let mut current_parent = self.clone();
362        for i in 0..names.len() {
363            let next_node = nodes.pop();
364            match current_parent.get_or_create_child(
365                locked,
366                current_task,
367                mount,
368                names[i],
369                |locked, parent_node, _mount, _name| {
370                    if let Some(node) = next_node {
371                        return node;
372                    }
373                    nodes = parent_node.ops().lookup_pipelined(
374                        locked,
375                        parent_node,
376                        current_task,
377                        &names[i..],
378                    );
379                    nodes.reverse();
380                    nodes.pop().unwrap()
381                },
382            ) {
383                Ok((entry, _)) => {
384                    results.push(Ok(entry.clone()));
385                    current_parent = entry;
386                }
387                Err(e) => {
388                    results.push(Err(e));
389                    break;
390                }
391            }
392        }
393        results
394    }
395
396    /// Creates a new DirEntry
397    ///
398    /// The create_node_fn function is called to create the underlying FsNode
399    /// for the DirEntry.
400    ///
401    /// If the entry already exists, create_node_fn is not called, and EEXIST is
402    /// returned.
403    pub fn create_entry<L>(
404        self: &DirEntryHandle,
405        locked: &mut Locked<L>,
406        current_task: &CurrentTask,
407        mount: &MountInfo,
408        name: &FsStr,
409        create_node_fn: impl FnOnce(
410            &mut Locked<L>,
411            &FsNodeHandle,
412            &MountInfo,
413            &FsStr,
414        ) -> Result<FsNodeHandle, Errno>,
415    ) -> Result<DirEntryHandle, Errno>
416    where
417        L: LockEqualOrBefore<FileOpsCore>,
418    {
419        let (entry, exists) =
420            self.create_entry_internal(locked, current_task, mount, name, create_node_fn)?;
421        if exists {
422            return error!(EEXIST);
423        }
424        Ok(entry)
425    }
426
427    /// Creates a new DirEntry. Works just like create_entry, except if the entry already exists,
428    /// it is returned.
429    pub fn get_or_create_entry<L>(
430        self: &DirEntryHandle,
431        locked: &mut Locked<L>,
432        current_task: &CurrentTask,
433        mount: &MountInfo,
434        name: &FsStr,
435        create_node_fn: impl FnOnce(
436            &mut Locked<L>,
437            &FsNodeHandle,
438            &MountInfo,
439            &FsStr,
440        ) -> Result<FsNodeHandle, Errno>,
441    ) -> Result<DirEntryHandle, Errno>
442    where
443        L: LockEqualOrBefore<FileOpsCore>,
444    {
445        let (entry, _exists) =
446            self.create_entry_internal(locked, current_task, mount, name, create_node_fn)?;
447        Ok(entry)
448    }
449
450    fn create_entry_internal<L>(
451        self: &DirEntryHandle,
452        locked: &mut Locked<L>,
453        current_task: &CurrentTask,
454        mount: &MountInfo,
455        name: &FsStr,
456        create_node_fn: impl FnOnce(
457            &mut Locked<L>,
458            &FsNodeHandle,
459            &MountInfo,
460            &FsStr,
461        ) -> Result<FsNodeHandle, Errno>,
462    ) -> Result<(DirEntryHandle, bool), Errno>
463    where
464        L: LockEqualOrBefore<FileOpsCore>,
465    {
466        if DirEntry::is_reserved_name(name) {
467            return error!(EEXIST);
468        }
469        // TODO: Do we need to check name for embedded NUL characters?
470        if name.len() > NAME_MAX as usize {
471            return error!(ENAMETOOLONG);
472        }
473        if name.contains(&path::SEPARATOR) {
474            return error!(EINVAL);
475        }
476        let (entry, exists) =
477            self.get_or_create_child(locked, current_task, mount, name, create_node_fn)?;
478        if !exists {
479            // An entry was created. Update the ctime and mtime of this directory.
480            self.node.update_ctime_mtime();
481            entry.notify_creation();
482        }
483        Ok((entry, exists))
484    }
485
486    // This is marked as test-only because it sets the owner/group to root instead of the current
487    // user to save a bit of typing in tests, but this shouldn't happen silently in production.
488    #[cfg(test)]
489    pub fn create_dir<L>(
490        self: &DirEntryHandle,
491        locked: &mut starnix_sync::Locked<L>,
492        current_task: &CurrentTask,
493        name: &FsStr,
494    ) -> Result<DirEntryHandle, Errno>
495    where
496        L: LockEqualOrBefore<FileOpsCore>,
497    {
498        self.create_dir_for_testing(locked, current_task, name)
499    }
500
501    // This function is for testing because it sets the owner/group to root instead of the current
502    // user to save a bit of typing in tests, but this shouldn't happen silently in production.
503    pub fn create_dir_for_testing<L>(
504        self: &DirEntryHandle,
505        locked: &mut Locked<L>,
506        current_task: &CurrentTask,
507        name: &FsStr,
508    ) -> Result<DirEntryHandle, Errno>
509    where
510        L: LockEqualOrBefore<FileOpsCore>,
511    {
512        // TODO: apply_umask
513        self.create_entry(
514            locked,
515            current_task,
516            &MountInfo::detached(),
517            name,
518            |locked, dir, mount, name| {
519                dir.create_node(
520                    locked,
521                    current_task,
522                    mount,
523                    name,
524                    starnix_uapi::file_mode::mode!(IFDIR, 0o777),
525                    starnix_uapi::device_id::DeviceId::NONE,
526                    FsCred::root(),
527                )
528            },
529        )
530    }
531
532    /// Creates an anonymous file.
533    ///
534    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
535    ///
536    /// Used by O_TMPFILE.
537    pub fn create_tmpfile<L>(
538        self: &DirEntryHandle,
539        locked: &mut Locked<L>,
540        current_task: &CurrentTask,
541        mount: &MountInfo,
542        mode: FileMode,
543        owner: FsCred,
544        flags: OpenFlags,
545    ) -> Result<DirEntryHandle, Errno>
546    where
547        L: LockEqualOrBefore<FileOpsCore>,
548    {
549        // Only directories can have children.
550        if !self.node.is_dir() {
551            return error!(ENOTDIR);
552        }
553        assert!(mode.is_reg());
554
555        // From <https://man7.org/linux/man-pages/man2/open.2.html>:
556        //
557        //   Specifying O_EXCL in conjunction with O_TMPFILE prevents a
558        //   temporary file from being linked into the filesystem in
559        //   the above manner.  (Note that the meaning of O_EXCL in
560        //   this case is different from the meaning of O_EXCL
561        //   otherwise.)
562        let link_behavior = if flags.contains(OpenFlags::EXCL) {
563            FsNodeLinkBehavior::Disallowed
564        } else {
565            FsNodeLinkBehavior::Allowed
566        };
567
568        let node =
569            self.node.create_tmpfile(locked, current_task, mount, mode, owner, link_behavior)?;
570        let local_name = format!("#{}", node.ino).into();
571        Ok(DirEntry::new_deleted(node, Some(self.clone()), local_name))
572    }
573
574    pub fn unlink<L>(
575        self: &DirEntryHandle,
576        locked: &mut Locked<L>,
577        current_task: &CurrentTask,
578        mount: &MountInfo,
579        name: &FsStr,
580        kind: UnlinkKind,
581        must_be_directory: bool,
582    ) -> Result<(), Errno>
583    where
584        L: LockEqualOrBefore<FileOpsCore>,
585    {
586        assert!(!DirEntry::is_reserved_name(name));
587
588        // child_to_unlink *must* be dropped after self_children (even in the error paths).
589        let child_to_unlink;
590
591        let mut self_children = self.lock_children();
592        child_to_unlink = self_children.component_lookup(locked, current_task, mount, name)?;
593        child_to_unlink.require_no_mounts(mount)?;
594
595        // Check that this filesystem entry must be a directory. This can
596        // happen if the path terminates with a trailing slash.
597        //
598        // Example: If we're unlinking a symlink `/foo/bar/`, this would
599        // result in `ENOTDIR` because of the trailing slash, even if
600        // `UnlinkKind::NonDirectory` was used.
601        if must_be_directory && !child_to_unlink.node.is_dir() {
602            return error!(ENOTDIR);
603        }
604
605        match kind {
606            UnlinkKind::Directory => {
607                if !child_to_unlink.node.is_dir() {
608                    return error!(ENOTDIR);
609                }
610            }
611            UnlinkKind::NonDirectory => {
612                if child_to_unlink.node.is_dir() {
613                    return error!(EISDIR);
614                }
615            }
616        }
617
618        self.node.unlink(locked, current_task, mount, name, &child_to_unlink.node)?;
619        self_children.children.remove(name);
620
621        std::mem::drop(self_children);
622        child_to_unlink.destroy(&current_task.kernel().mounts);
623
624        Ok(())
625    }
626
627    /// Destroy this directory entry.
628    ///
629    /// Notice that this method takes `self` by value to destroy this reference.
630    fn destroy(self: DirEntryHandle, mounts: &Mounts) {
631        let was_already_dead =
632            self.raise_flags(DirEntryFlags::IS_DEAD).contains(DirEntryFlags::IS_DEAD);
633        if was_already_dead {
634            return;
635        }
636        let unmount =
637            self.lower_flags(DirEntryFlags::HAS_MOUNTS).contains(DirEntryFlags::HAS_MOUNTS);
638        self.node.fs().will_destroy_dir_entry(&self);
639        if unmount {
640            mounts.unmount(&self);
641        }
642        self.notify_deletion();
643    }
644
645    /// Returns whether this entry is a descendant of |other|.
646    pub fn is_descendant_of(self: &DirEntryHandle, other: &DirEntryHandle) -> bool {
647        let scope = RcuReadScope::new();
648        let mut current = self.deref();
649        loop {
650            if std::ptr::eq(current, other.deref()) {
651                // We found |other|.
652                return true;
653            }
654            if let Some(parent) = current.parent_ref(&scope) {
655                current = parent;
656            } else {
657                // We reached the root of the file system.
658                return false;
659            }
660        }
661    }
662
663    /// Rename the file with old_basename in old_parent to new_basename in
664    /// new_parent.
665    ///
666    /// old_parent and new_parent must belong to the same file system.
667    pub fn rename<L>(
668        locked: &mut Locked<L>,
669        current_task: &CurrentTask,
670        old_parent: &DirEntryHandle,
671        old_mount: &MountInfo,
672        old_basename: &FsStr,
673        new_parent: &DirEntryHandle,
674        new_mount: &MountInfo,
675        new_basename: &FsStr,
676        flags: RenameFlags,
677    ) -> Result<(), Errno>
678    where
679        L: LockEqualOrBefore<FileOpsCore>,
680    {
681        // The nodes we are touching must be part of the same mount.
682        if old_mount != new_mount {
683            return error!(EXDEV);
684        }
685
686        // The mounts are equals, choose one.
687        let mount = old_mount;
688
689        // If either the old_basename or the new_basename is a reserved name
690        // (e.g., "." or ".."), then we cannot do the rename.
691        if DirEntry::is_reserved_name(old_basename) || DirEntry::is_reserved_name(new_basename) {
692            if flags.contains(RenameFlags::NOREPLACE) {
693                return error!(EEXIST);
694            }
695            return error!(EBUSY);
696        }
697
698        // If the names and parents are the same, then there's nothing to do
699        // and we can report success.
700        if Arc::ptr_eq(&old_parent.node, &new_parent.node) && old_basename == new_basename {
701            return Ok(());
702        }
703
704        // This task must have write access to the old and new parent nodes.
705        old_parent.node.check_access(
706            locked,
707            current_task,
708            mount,
709            Access::WRITE,
710            CheckAccessReason::InternalPermissionChecks,
711            old_parent,
712        )?;
713        new_parent.node.check_access(
714            locked,
715            current_task,
716            mount,
717            Access::WRITE,
718            CheckAccessReason::InternalPermissionChecks,
719            new_parent,
720        )?;
721
722        // The mount check ensures that the nodes we're touching are part of the
723        // same file system. It doesn't matter where we grab the FileSystem reference from.
724        let fs = old_parent.node.fs();
725
726        // We need to hold these DirEntryHandles until after we drop all the
727        // locks so that we do not deadlock when we drop them.
728        let renamed;
729        let mut maybe_replaced = None;
730
731        {
732            // Before we take any locks, we need to take the rename mutex on
733            // the file system. This lock ensures that no other rename
734            // operations are happening in this file system while we're
735            // analyzing this rename operation.
736            //
737            // For example, we grab writer locks on both old_parent and
738            // new_parent. If there was another rename operation in flight with
739            // old_parent and new_parent reversed, then we could deadlock while
740            // trying to acquire these locks.
741            let _lock = fs.rename_mutex.lock();
742
743            // Compute the list of ancestors of new_parent to check whether new_parent is a
744            // descendant of the renamed node. This must be computed before taking any lock to
745            // prevent lock inversions.
746            //
747            // TODO: This walk is now lock-free, which means we don't need to materialize this
748            // list. We can just walk the tree and check for cycles.
749            let mut new_parent_ancestor_list = Vec::<DirEntryHandle>::new();
750            {
751                let mut current = Some(new_parent.clone());
752                while let Some(entry) = current {
753                    current = entry.parent();
754                    new_parent_ancestor_list.push(entry);
755                }
756            }
757
758            // We cannot simply grab the locks on old_parent and new_parent
759            // independently because old_parent and new_parent might be the
760            // same directory entry. Instead, we use the RenameGuard helper to
761            // grab the appropriate locks.
762            let mut state = RenameGuard::lock(old_parent, new_parent);
763
764            // Now that we know the old_parent child list cannot change, we
765            // establish the DirEntry that we are going to try to rename.
766            renamed =
767                state.old_parent().component_lookup(locked, current_task, mount, old_basename)?;
768
769            // Check whether the sticky bit on the old parent prevents us from
770            // removing this child.
771            old_parent.node.check_sticky_bit(current_task, &renamed.node)?;
772
773            // If new_parent is a descendant of renamed, the operation would
774            // create a cycle. That's disallowed.
775            if new_parent_ancestor_list.into_iter().any(|entry| Arc::ptr_eq(&entry, &renamed)) {
776                return error!(EINVAL);
777            }
778
779            // Check whether the renamed entry is a mountpoint.
780            // TODO: We should hold a read lock on the mount points for this
781            //       namespace to prevent the child from becoming a mount point
782            //       while this function is executing.
783            renamed.require_no_mounts(mount)?;
784
785            // We need to check if there is already a DirEntry with
786            // new_basename in new_parent. If so, there are additional checks
787            // we need to perform.
788            match state.new_parent().component_lookup(locked, current_task, mount, new_basename) {
789                Ok(replaced) => {
790                    // Set `maybe_replaced` now to ensure it gets dropped in the right order.
791                    let replaced = maybe_replaced.insert(replaced);
792
793                    if flags.contains(RenameFlags::NOREPLACE) {
794                        return error!(EEXIST);
795                    }
796
797                    // Sayeth https://man7.org/linux/man-pages/man2/rename.2.html:
798                    //
799                    // "If oldpath and newpath are existing hard links referring to the
800                    // same file, then rename() does nothing, and returns a success
801                    // status."
802                    if Arc::ptr_eq(&renamed.node, &replaced.node) {
803                        return Ok(());
804                    }
805
806                    // Sayeth https://man7.org/linux/man-pages/man2/rename.2.html:
807                    //
808                    // "oldpath can specify a directory.  In this case, newpath must"
809                    // either not exist, or it must specify an empty directory."
810                    if replaced.node.is_dir() {
811                        // Check whether the replaced entry is a mountpoint.
812                        // TODO: We should hold a read lock on the mount points for this
813                        //       namespace to prevent the child from becoming a mount point
814                        //       while this function is executing.
815                        replaced.require_no_mounts(mount)?;
816                    }
817
818                    if !flags.intersects(RenameFlags::EXCHANGE | RenameFlags::REPLACE_ANY) {
819                        if renamed.node.is_dir() && !replaced.node.is_dir() {
820                            return error!(ENOTDIR);
821                        } else if !renamed.node.is_dir() && replaced.node.is_dir() {
822                            return error!(EISDIR);
823                        }
824                    }
825                }
826                // It's fine for the lookup to fail to find a child.
827                Err(errno) if errno == ENOENT => {}
828                // However, other errors are fatal.
829                Err(e) => return Err(e),
830            }
831
832            security::check_fs_node_rename_access(
833                current_task,
834                &old_parent.node,
835                &renamed.node,
836                &new_parent.node,
837                maybe_replaced.as_ref().map(|dir_entry| dir_entry.node.deref().as_ref()),
838                old_basename,
839                new_basename,
840            )?;
841
842            // We've found all the errors that we know how to find. Ask the
843            // file system to actually execute the rename operation. Once the
844            // file system has executed the rename, we are no longer allowed to
845            // fail because we will not be able to return the system to a
846            // consistent state.
847
848            if flags.contains(RenameFlags::EXCHANGE) {
849                let replaced = maybe_replaced.as_ref().ok_or_else(|| errno!(ENOENT))?;
850                fs.exchange(
851                    current_task,
852                    &renamed.node,
853                    &old_parent.node,
854                    old_basename,
855                    &replaced.node,
856                    &new_parent.node,
857                    new_basename,
858                )?;
859            } else {
860                fs.rename(
861                    locked,
862                    current_task,
863                    &old_parent.node,
864                    old_basename,
865                    &new_parent.node,
866                    new_basename,
867                    &renamed.node,
868                    maybe_replaced.as_ref().map(|replaced| &replaced.node),
869                )?;
870            }
871
872            // We need to update the parent and local name for the DirEntry
873            // we are renaming to reflect its new parent and its new name.
874            renamed.set_parent(new_parent.clone());
875            renamed.local_name.update(new_basename.to_owned());
876
877            // Actually add the renamed child to the new_parent's child list.
878            // This operation implicitly removes the replaced child (if any)
879            // from the child list.
880            state.new_parent().children.insert(new_basename.into(), Arc::downgrade(&renamed));
881
882            #[cfg(detect_lock_cycles)]
883            unsafe {
884                // Lock ordering is enforced from parent-to-child, and therefore we need to
885                // reset the lock ordering constraints when we reorder the tree nodes.
886                util::reset_dependencies(renamed.children.raw());
887            }
888
889            if flags.contains(RenameFlags::EXCHANGE) {
890                // Reparent `replaced` when exchanging.
891                let replaced =
892                    maybe_replaced.as_ref().expect("replaced expected with RENAME_EXCHANGE");
893                replaced.set_parent(old_parent.clone());
894                replaced.local_name.update(old_basename.to_owned());
895                state.old_parent().children.insert(old_basename.into(), Arc::downgrade(replaced));
896
897                #[cfg(detect_lock_cycles)]
898                unsafe {
899                    // Lock ordering is enforced from parent-to-child, and therefore we need to
900                    // reset the lock ordering constraints when we reorder the tree nodes.
901                    util::reset_dependencies(replaced.children.raw());
902                }
903            } else {
904                // Remove the renamed child from the old_parent's child list.
905                state.old_parent().children.remove(old_basename);
906            }
907        };
908
909        fs.purge_old_entries();
910
911        if let Some(replaced) = maybe_replaced {
912            if !flags.contains(RenameFlags::EXCHANGE) {
913                replaced.destroy(&current_task.kernel().mounts);
914            }
915        }
916
917        // Renaming a file updates its ctime.
918        renamed.node.update_ctime();
919
920        let mode = renamed.node.info().mode;
921        let cookie = current_task.kernel().get_next_inotify_cookie();
922        old_parent.node.notify(InotifyMask::MOVE_FROM, cookie, old_basename, mode, false);
923        new_parent.node.notify(InotifyMask::MOVE_TO, cookie, new_basename, mode, false);
924        renamed.node.notify(InotifyMask::MOVE_SELF, 0, Default::default(), mode, false);
925
926        Ok(())
927    }
928
929    pub fn get_children<F, T>(&self, callback: F) -> T
930    where
931        F: FnOnce(&DirEntryChildren) -> T,
932    {
933        let children = self.children.read();
934        callback(&children)
935    }
936
937    /// Remove the child with the given name from the children cache.  The child must not have any
938    /// mounts.
939    pub fn remove_child(&self, name: &FsStr, mounts: &Mounts) {
940        let mut children = self.children.write();
941        let child = children.get(name).and_then(Weak::upgrade);
942        if let Some(child) = child {
943            children.remove(name);
944            std::mem::drop(children);
945            child.destroy(mounts);
946        }
947    }
948
949    fn get_or_create_child<L>(
950        self: &DirEntryHandle,
951        locked: &mut Locked<L>,
952        current_task: &CurrentTask,
953        mount: &MountInfo,
954        name: &FsStr,
955        create_fn: impl FnOnce(
956            &mut Locked<L>,
957            &FsNodeHandle,
958            &MountInfo,
959            &FsStr,
960        ) -> Result<FsNodeHandle, Errno>,
961    ) -> Result<(DirEntryHandle, bool), Errno>
962    where
963        L: LockEqualOrBefore<FileOpsCore>,
964    {
965        assert!(!DirEntry::is_reserved_name(name));
966        // Only directories can have children.
967        if !self.node.is_dir() {
968            return error!(ENOTDIR);
969        }
970        // The user must be able to search the directory (requires the EXEC permission)
971        self.node.check_access(
972            locked,
973            current_task,
974            mount,
975            Access::EXEC,
976            CheckAccessReason::InternalPermissionChecks,
977            self,
978        )?;
979
980        // Check if the child is already in children. In that case, we can
981        // simply return the child and we do not need to call init_fn.
982        let child = self.children.read().get(name).and_then(Weak::upgrade);
983        let (child, create_result) = if let Some(child) = child {
984            // Do not cache a child in a locked directory
985            if self.node.fail_if_locked(current_task).is_ok() {
986                child.node.fs().did_access_dir_entry(&child);
987            }
988            (child, CreationResult::Existed { create_fn })
989        } else {
990            let (child, create_result) = self.lock_children().get_or_create_child(
991                locked,
992                current_task,
993                mount,
994                name,
995                create_fn,
996            )?;
997            child.node.fs().purge_old_entries();
998            (child, create_result)
999        };
1000
1001        let (child, exists) = match create_result {
1002            CreationResult::Created => (child, false),
1003            CreationResult::Existed { create_fn } => {
1004                if child.ops.revalidate(
1005                    locked.cast_locked::<FileOpsCore>(),
1006                    current_task,
1007                    &child,
1008                )? {
1009                    (child, true)
1010                } else {
1011                    self.internal_remove_child(&child);
1012                    child.destroy(&current_task.kernel().mounts);
1013
1014                    let (child, create_result) = self.lock_children().get_or_create_child(
1015                        locked,
1016                        current_task,
1017                        mount,
1018                        name,
1019                        create_fn,
1020                    )?;
1021                    child.node.fs().purge_old_entries();
1022                    (child, matches!(create_result, CreationResult::Existed { .. }))
1023                }
1024            }
1025        };
1026
1027        Ok((child, exists))
1028    }
1029
1030    // This function is only useful for tests and has some oddities.
1031    //
1032    // For example, not all the children might have been looked up yet, which
1033    // means the returned vector could be missing some names.
1034    //
1035    // Also, the vector might have "extra" names that are in the process of
1036    // being looked up. If the lookup fails, they'll be removed.
1037    #[cfg(test)]
1038    pub fn copy_child_names(&self) -> Vec<FsString> {
1039        let scope = RcuReadScope::new();
1040        self.children
1041            .read()
1042            .values()
1043            .filter_map(|child| Weak::upgrade(child).map(|c| c.local_name.read(&scope).to_owned()))
1044            .collect()
1045    }
1046
1047    fn internal_remove_child(&self, child: &DirEntry) {
1048        let mut children = self.children.write();
1049        let scope = RcuReadScope::new();
1050        let local_name = child.local_name.read(&scope);
1051        if let Some(weak_child) = children.get(local_name) {
1052            // If this entry is occupied, we need to check whether child is
1053            // the current occupant. If so, we should remove the entry
1054            // because the child no longer exists.
1055            if std::ptr::eq(weak_child.as_ptr(), child) {
1056                children.remove(local_name);
1057            }
1058        }
1059    }
1060
1061    /// Notifies watchers on the current node and its parent about an event.
1062    pub fn notify(&self, event_mask: InotifyMask) {
1063        self.notify_watchers(event_mask, self.is_dead());
1064    }
1065
1066    /// Notifies watchers on the current node and its parent about an event.
1067    ///
1068    /// Used for FSNOTIFY_EVENT_INODE events, which ignore IN_EXCL_UNLINK.
1069    pub fn notify_ignoring_excl_unlink(&self, event_mask: InotifyMask) {
1070        // We pretend that this directory entry is not dead to ignore IN_EXCL_UNLINK.
1071        self.notify_watchers(event_mask, false);
1072    }
1073
1074    fn notify_watchers(&self, event_mask: InotifyMask, is_dead: bool) {
1075        let mode = self.node.info().mode;
1076        {
1077            let scope = RcuReadScope::new();
1078            if let Some(parent) = self.parent_ref(&scope) {
1079                let local_name = self.local_name.read(&scope);
1080                parent.node.notify(event_mask, 0, local_name, mode, is_dead);
1081            }
1082        }
1083        self.node.notify(event_mask, 0, Default::default(), mode, is_dead);
1084    }
1085
1086    /// Notifies parents about creation, and notifies current node about link_count change.
1087    fn notify_creation(&self) {
1088        let mode = self.node.info().mode;
1089        if Arc::strong_count(&self.node) > 1 {
1090            // Notify about link change only if there is already a hardlink.
1091            self.node.notify(InotifyMask::ATTRIB, 0, Default::default(), mode, false);
1092        }
1093        let scope = RcuReadScope::new();
1094        if let Some(parent) = self.parent_ref(&scope) {
1095            let local_name = self.local_name.read(&scope);
1096            parent.node.notify(InotifyMask::CREATE, 0, local_name, mode, false);
1097        }
1098    }
1099
1100    /// Notifies watchers on the current node about deletion if this is the
1101    /// last hardlink, and drops the DirEntryHandle kept by Inotify.
1102    /// Parent is also notified about deletion.
1103    fn notify_deletion(&self) {
1104        let mode = self.node.info().mode;
1105        if !mode.is_dir() {
1106            // Linux notifies link count change for non-directories.
1107            self.node.notify(InotifyMask::ATTRIB, 0, Default::default(), mode, false);
1108        }
1109
1110        // This check is incorrect if there's another hard link to this FsNode that isn't in
1111        // memory at the moment.
1112        if Arc::strong_count(&self.node) == 1 {
1113            self.node.notify(InotifyMask::DELETE_SELF, 0, Default::default(), mode, false);
1114        }
1115
1116        let scope = RcuReadScope::new();
1117        if let Some(parent) = self.parent_ref(&scope) {
1118            let local_name = self.local_name.read(&scope);
1119            parent.node.notify(InotifyMask::DELETE, 0, local_name, mode, false);
1120        }
1121    }
1122
1123    /// Returns true if this entry has mounts.
1124    pub fn has_mounts(&self) -> bool {
1125        self.flags().contains(DirEntryFlags::HAS_MOUNTS)
1126    }
1127
1128    /// Records whether or not the entry has mounts.
1129    pub fn set_has_mounts(&self, v: bool) {
1130        if v {
1131            self.raise_flags(DirEntryFlags::HAS_MOUNTS);
1132        } else {
1133            self.lower_flags(DirEntryFlags::HAS_MOUNTS);
1134        }
1135    }
1136
1137    /// Verifies this directory has nothing mounted on it.
1138    fn require_no_mounts(self: &Arc<Self>, parent_mount: &MountInfo) -> Result<(), Errno> {
1139        if self.has_mounts() {
1140            if let Some(mount) = parent_mount.as_ref() {
1141                if mount.read().has_submount(self) {
1142                    return error!(EBUSY);
1143                }
1144            }
1145        }
1146        Ok(())
1147    }
1148}
1149
1150struct DirEntryLockedChildren<'a> {
1151    entry: &'a DirEntryHandle,
1152    children: LockDepWriteGuard<'a, DirEntryChildren>,
1153}
1154
1155enum CreationResult<F> {
1156    Created,
1157    Existed { create_fn: F },
1158}
1159
1160impl<'a> DirEntryLockedChildren<'a> {
1161    fn component_lookup<L>(
1162        &mut self,
1163        locked: &mut Locked<L>,
1164        current_task: &CurrentTask,
1165        mount: &MountInfo,
1166        name: &FsStr,
1167    ) -> Result<DirEntryHandle, Errno>
1168    where
1169        L: LockEqualOrBefore<FileOpsCore>,
1170    {
1171        assert!(!DirEntry::is_reserved_name(name));
1172        let (node, _) =
1173            self.get_or_create_child(locked, current_task, mount, name, |_, _, _, _| {
1174                error!(ENOENT)
1175            })?;
1176        Ok(node)
1177    }
1178
1179    fn get_or_create_child<
1180        L,
1181        F: FnOnce(&mut Locked<L>, &FsNodeHandle, &MountInfo, &FsStr) -> Result<FsNodeHandle, Errno>,
1182    >(
1183        &mut self,
1184        locked: &mut Locked<L>,
1185        current_task: &CurrentTask,
1186        mount: &MountInfo,
1187        name: &FsStr,
1188        create_fn: F,
1189    ) -> Result<(DirEntryHandle, CreationResult<F>), Errno>
1190    where
1191        L: LockEqualOrBefore<FileOpsCore>,
1192    {
1193        let create_child = |locked: &mut Locked<L>, create_fn: F| {
1194            // Before creating the child, check for existence.
1195            let (node, create_result) =
1196                match self.entry.node.lookup(locked, current_task, mount, name) {
1197                    Ok(node) => (node, CreationResult::Existed { create_fn }),
1198                    Err(e) if e == ENOENT => {
1199                        (create_fn(locked, &self.entry.node, mount, name)?, CreationResult::Created)
1200                    }
1201                    Err(e) => return Err(e),
1202                };
1203
1204            assert!(
1205                node.info().mode & FileMode::IFMT != FileMode::EMPTY,
1206                "FsNode initialization did not populate the FileMode in FsNodeInfo."
1207            );
1208
1209            let entry = DirEntry::new(node, Some(self.entry.clone()), name.to_owned());
1210
1211            if let Err(err) = security::fs_node_init_with_dentry(locked, current_task, &entry) {
1212                // Null out the `parent` reference from `entry` otherwise dropping `entry` will
1213                // attempt to remove itself from `parent`, triggering a deadlock with `self`.
1214                entry.parent.update(None);
1215                return Err(err);
1216            }
1217
1218            Ok((entry, create_result))
1219        };
1220
1221        let (child, create_result) = match self.children.entry(name.to_owned()) {
1222            Entry::Vacant(entry) => {
1223                let (child, create_result) = create_child(locked, create_fn)?;
1224                // Do not cache a child in a locked directory
1225                if self.entry.node.fail_if_locked(current_task).is_ok() {
1226                    entry.insert(Arc::downgrade(&child));
1227                }
1228                (child, create_result)
1229            }
1230            Entry::Occupied(mut entry) => {
1231                // It's possible that the upgrade will succeed this time around because we dropped
1232                // the read lock before acquiring the write lock. Another thread might have
1233                // populated this entry while we were not holding any locks.
1234                if let Some(child) = Weak::upgrade(entry.get()) {
1235                    // Do not cache a child in a locked directory
1236                    if self.entry.node.fail_if_locked(current_task).is_ok() {
1237                        child.node.fs().did_access_dir_entry(&child);
1238                    }
1239                    return Ok((child, CreationResult::Existed { create_fn }));
1240                }
1241                let (child, create_result) = create_child(locked, create_fn)?;
1242                // Do not cache a child in a locked directory
1243                if self.entry.node.fail_if_locked(current_task).is_ok() {
1244                    entry.insert(Arc::downgrade(&child));
1245                }
1246                (child, create_result)
1247            }
1248        };
1249
1250        Ok((child, create_result))
1251    }
1252}
1253
1254impl fmt::Debug for DirEntry {
1255    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1256        let scope = RcuReadScope::new();
1257        let mut parents = vec![];
1258        let mut maybe_parent = self.parent_ref(&scope);
1259        while let Some(parent) = maybe_parent {
1260            parents.push(parent.local_name.read(&scope));
1261            maybe_parent = parent.parent_ref(&scope);
1262        }
1263        let mut builder = f.debug_struct("DirEntry");
1264        builder.field("id", &(self as *const DirEntry));
1265        builder.field("local_name", &self.local_name.read(&scope).to_owned());
1266        if !parents.is_empty() {
1267            builder.field("parents", &parents);
1268        }
1269        builder.finish()
1270    }
1271}
1272
1273struct RenameGuard<'a> {
1274    old_parent_guard: DirEntryLockedChildren<'a>,
1275    new_parent_guard: Option<DirEntryLockedChildren<'a>>,
1276}
1277
1278impl<'a> RenameGuard<'a> {
1279    fn lock(old_parent: &'a DirEntryHandle, new_parent: &'a DirEntryHandle) -> Self {
1280        if Arc::ptr_eq(old_parent, new_parent) {
1281            Self { old_parent_guard: old_parent.lock_children(), new_parent_guard: None }
1282        } else {
1283            // Following gVisor, these locks are taken in ancestor-to-descendant order.
1284            // Moreover, if the node are not comparable, they are taken from smallest inode to
1285            // biggest.
1286            if new_parent.is_descendant_of(old_parent)
1287                || (!old_parent.is_descendant_of(new_parent)
1288                    && old_parent.node.node_key() < new_parent.node.node_key())
1289            {
1290                let old_parent_guard = old_parent.lock_children();
1291                let _token = allow_subclass();
1292                let new_parent_guard = new_parent.lock_children();
1293                Self { old_parent_guard, new_parent_guard: Some(new_parent_guard) }
1294            } else {
1295                let new_parent_guard = new_parent.lock_children();
1296                let _token = allow_subclass();
1297                let old_parent_guard = old_parent.lock_children();
1298                Self { old_parent_guard, new_parent_guard: Some(new_parent_guard) }
1299            }
1300        }
1301    }
1302
1303    fn old_parent(&mut self) -> &mut DirEntryLockedChildren<'a> {
1304        &mut self.old_parent_guard
1305    }
1306
1307    fn new_parent(&mut self) -> &mut DirEntryLockedChildren<'a> {
1308        if let Some(new_guard) = self.new_parent_guard.as_mut() {
1309            new_guard
1310        } else {
1311            &mut self.old_parent_guard
1312        }
1313    }
1314}
1315
1316/// The Drop trait for DirEntry removes the entry from the child list of the
1317/// parent entry, which means we cannot drop DirEntry objects while holding a
1318/// lock on the parent's child list.
1319impl Drop for DirEntry {
1320    fn drop(&mut self) {
1321        let maybe_parent = self.parent();
1322        self.parent.update(None);
1323        if let Some(parent) = maybe_parent {
1324            parent.internal_remove_child(self);
1325        }
1326    }
1327}