Skip to main content

starnix_core/vfs/
fs_node.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::device::DeviceMode;
6use crate::mm::PAGE_SIZE;
7use crate::security::{self, Auditable, PermissionFlags};
8use crate::signals::{SignalInfo, send_standard_signal};
9use crate::task::{CurrentTask, CurrentTaskAndLocked, WaitQueue, Waiter, register_delayed_release};
10use crate::time::utc;
11use crate::vfs::fsverity::FsVerityState;
12use crate::vfs::pipe::{Pipe, PipeHandle};
13use crate::vfs::rw_queue::{RwQueue, RwQueueReadGuard, RwQueueWriteGuard};
14use crate::vfs::socket::SocketHandle;
15use crate::vfs::{
16    DefaultDirEntryOps, DirEntryOps, FileObject, FileObjectState, FileOps, FileSystem,
17    FileSystemHandle, FileWriteGuardState, FsLockDepType, FsStr, FsString, MAX_LFS_FILESIZE,
18    MountInfo, NamespaceNode, OPathOps, RecordLockCommand, RecordLockOwner, RecordLocks,
19    WeakFileHandle, checked_add_offset_and_length, inotify,
20};
21use bitflags::bitflags;
22use fuchsia_runtime::UtcInstant;
23use linux_uapi::{XATTR_SECURITY_PREFIX, XATTR_SYSTEM_PREFIX, XATTR_TRUSTED_PREFIX};
24use once_cell::race::OnceBool;
25use smallvec::SmallVec;
26use starnix_crypt::EncryptionKeyId;
27use starnix_lifecycle::{ObjectReleaser, ReleaserAction};
28use starnix_logging::{log_error, track_stub};
29use starnix_sync::{
30    BeforeFsNodeAppend, DynamicLockDepRwLock, FileOpsCore, FsNodeAppend, FsNodeFlockInfoLock,
31    FsNodeFsVerityLock, FsNodeInfoLevel, FsNodeInfoRecursiveLevel, FsNodeWriteGuardStateLock,
32    FuseFsNodeInfoLevel, LockDepMutex, LockDepReadGuard, LockEqualOrBefore, Locked, Unlocked,
33    allow_subclass,
34};
35use starnix_types::ownership::{Releasable, ReleaseGuard};
36use starnix_types::time::{NANOS_PER_SECOND, timespec_from_time};
37use starnix_uapi::as_any::AsAny;
38use starnix_uapi::auth::{
39    CAP_CHOWN, CAP_DAC_OVERRIDE, CAP_DAC_READ_SEARCH, CAP_FOWNER, CAP_FSETID, CAP_MKNOD,
40    CAP_SYS_ADMIN, CAP_SYS_RESOURCE, Credentials, FsCred,
41};
42use starnix_uapi::device_id::DeviceId;
43use starnix_uapi::errors::{EACCES, ENOTSUP, EPERM, Errno};
44use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
45use starnix_uapi::inotify_mask::InotifyMask;
46use starnix_uapi::mount_flags::MountFlags;
47use starnix_uapi::open_flags::OpenFlags;
48use starnix_uapi::resource_limits::Resource;
49use starnix_uapi::seal_flags::SealFlags;
50use starnix_uapi::signals::SIGXFSZ;
51use starnix_uapi::{
52    FALLOC_FL_COLLAPSE_RANGE, FALLOC_FL_INSERT_RANGE, FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE,
53    FALLOC_FL_UNSHARE_RANGE, FALLOC_FL_ZERO_RANGE, LOCK_EX, LOCK_NB, LOCK_SH, LOCK_UN,
54    STATX__RESERVED, STATX_ATIME, STATX_ATTR_VERITY, STATX_BASIC_STATS, STATX_BLOCKS, STATX_CTIME,
55    STATX_GID, STATX_INO, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_UID, XATTR_USER_PREFIX,
56    errno, error, fsverity_descriptor, gid_t, ino_t, statx, statx_timestamp, timespec, uapi, uid_t,
57};
58use std::sync::atomic::Ordering;
59use std::sync::{Arc, OnceLock, Weak};
60use syncio::zxio_node_attr_has_t;
61
/// Whether a node may be linked into a directory.
///
/// The default is [`FsNodeLinkBehavior::Allowed`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum FsNodeLinkBehavior {
    /// The node may be linked into a directory. This is the default.
    #[default]
    Allowed,
    /// The node may not be linked into a directory.
    Disallowed,
}
73
/// Read guard on an `RwQueue<FsNodeAppend>`, such as [`FsNode::append_lock`].
pub type AppendLockGuard<'a> = RwQueueReadGuard<'a, FsNodeAppend>;
/// Write guard on an `RwQueue<FsNodeAppend>`, such as [`FsNode::append_lock`].
pub type AppendLockWriteGuard<'a> = RwQueueWriteGuard<'a, FsNodeAppend>;
76
bitflags! {
    /// Flags stored in [`FsNode::flags`].
    pub struct FsNodeFlags: u8 {
        /// NOTE(review): what "private" means for a node is not visible in this part of the
        /// file; confirm against the code that sets and reads this flag.
        const IS_PRIVATE = 1 << 0;
    }
}
82
/// A node in a [`FileSystem`].
///
/// Pairs an inode number with the [`FsNodeOps`] supplied by the owning file system and the
/// per-node state declared below (metadata, locks, fs-verity state and security state).
pub struct FsNode {
    /// The inode number for this FsNode.
    pub ino: ino_t,

    /// Flags for this node.
    pub flags: FsNodeFlags,

    /// The FsNodeOps for this FsNode.
    ///
    /// The FsNodeOps are implemented by the individual file systems to provide
    /// specific behaviors for this FsNode.
    ops: Box<dyn FsNodeOps>,

    /// The FileSystem that owns this FsNode's tree.
    ///
    /// Held weakly, so the node does not keep the file system alive.
    fs: Weak<FileSystem>,

    /// A RwLock to synchronize append operations for this node.
    ///
    /// FileObjects writing with O_APPEND should grab a write() lock on this
    /// field to ensure they operate sequentially. FileObjects writing without
    /// O_APPEND should grab read() lock so that they can operate in parallel.
    pub append_lock: RwQueue<FsNodeAppend>,

    /// Mutable information about this node.
    ///
    /// This data is used to populate the uapi::stat structure.
    info: DynamicLockDepRwLock<FsNodeInfo>,

    /// Data associated with an FsNode that is rarely needed.
    ///
    /// Stored behind a `OnceLock<Box<_>>`, so it is only populated once it is first set.
    rare_data: OnceLock<Box<FsNodeRareData>>,

    /// Tracks lock state for this file.
    pub write_guard_state: LockDepMutex<FileWriteGuardState, FsNodeWriteGuardStateLock>,

    /// Cached FsVerity state associated with this node.
    pub fsverity: LockDepMutex<FsVerityState, FsNodeFsVerityLock>,

    /// The security state associated with this node. Must always be acquired last
    /// relative to other `FsNode` locks.
    pub security_state: security::FsNodeState,
}
124
/// State that only some nodes need; it is the payload of `FsNode::rare_data`.
#[derive(Default)]
struct FsNodeRareData {
    /// The pipe located at this node, if any.
    ///
    /// Used if, and only if, the node has a mode of FileMode::IFIFO.
    fifo: OnceLock<PipeHandle>,

    /// The UNIX domain socket bound to this node, if any.
    bound_socket: OnceLock<SocketHandle>,

    /// State of the `flock`-style lock on this node (see `FileObject::flock`).
    ///
    /// No other lock on this object may be taken while this lock is held.
    flock_info: LockDepMutex<FlockInfo, FsNodeFlockInfoLock>,

    /// Records locks associated with this node.
    record_locks: RecordLocks,

    /// Whether this node can be linked into a directory.
    ///
    /// Only set for nodes created with `O_TMPFILE`.
    link_behavior: OnceLock<FsNodeLinkBehavior>,

    /// Inotify watchers on this node. See inotify(7).
    watchers: inotify::InotifyWatchers,
}
151
152impl FsNodeRareData {
153    fn ensure_fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
154        self.fifo.get_or_init(|| {
155            let default_pipe_capacity = (*PAGE_SIZE * 16) as usize;
156            let kernel = current_task.kernel();
157            let max_size = kernel.system_limits.pipe_max_size.load(Ordering::Relaxed);
158            let capacity = if default_pipe_capacity <= max_size
159                || security::is_task_capable_noaudit(current_task, CAP_SYS_RESOURCE)
160            {
161                default_pipe_capacity
162            } else {
163                max_size
164            };
165            Pipe::new(capacity)
166        })
167    }
168}
169
/// Marker type (it has no variants) that selects how an [`FsNode`] is released.
pub enum FsNodeReleaserAction {}
impl ReleaserAction<FsNode> for FsNodeReleaserAction {
    /// Hands the node to `register_delayed_release` rather than releasing it inline.
    fn release(fs_node: ReleaseGuard<FsNode>) {
        register_delayed_release(fs_node);
    }
}
/// An [`FsNode`] wrapped so that [`FsNodeReleaserAction`] is used to release it.
pub type FsNodeReleaser = ObjectReleaser<FsNode, FsNodeReleaserAction>;
/// Strong, shared reference to an [`FsNode`].
pub type FsNodeHandle = Arc<FsNodeReleaser>;
/// Weak counterpart of [`FsNodeHandle`].
pub type WeakFsNodeHandle = Weak<FsNodeReleaser>;
179
/// Mutable metadata about a node, stored in `FsNode::info`.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct FsNodeInfo {
    pub mode: FileMode,
    pub link_count: usize,
    pub uid: uid_t,
    pub gid: gid_t,
    pub rdev: DeviceId,
    pub size: usize,
    pub blksize: usize,
    pub blocks: usize,
    pub time_status_change: UtcInstant,
    pub time_access: UtcInstant,
    pub time_modify: UtcInstant,
    pub casefold: bool,

    /// If this node is fscrypt encrypted, stores the id of the user wrapping key used to
    /// encrypt it.
    pub wrapping_key_id: Option<[u8; 16]>,

    /// Used to indicate to filesystems that manage timestamps that an access has occurred and
    /// to update the node's atime.
    ///
    /// This only impacts accesses within Starnix. Most Fuchsia programs are not expected to
    /// maintain access times. If the file handle is transferred out of Starnix, there may be
    /// inconsistencies.
    pub pending_time_access_update: bool,
}
204
205impl FsNodeInfo {
206    pub fn new(mode: FileMode, owner: FsCred) -> Self {
207        let now = utc::utc_now();
208        Self {
209            mode,
210            link_count: if mode.is_dir() { 2 } else { 1 },
211            uid: owner.uid,
212            gid: owner.gid,
213            blksize: DEFAULT_BYTES_PER_BLOCK,
214            time_status_change: now,
215            time_access: now,
216            time_modify: now,
217            ..Default::default()
218        }
219    }
220
221    pub fn storage_size(&self) -> usize {
222        self.blksize.saturating_mul(self.blocks)
223    }
224
225    pub fn chmod(&mut self, mode: FileMode) {
226        self.mode = (self.mode & !FileMode::PERMISSIONS) | (mode & FileMode::PERMISSIONS);
227    }
228
229    pub fn chown(&mut self, owner: Option<uid_t>, group: Option<gid_t>) {
230        if let Some(owner) = owner {
231            self.uid = owner;
232        }
233        if let Some(group) = group {
234            self.gid = group;
235        }
236        // Clear the setuid and setgid bits if the file is executable and a regular file.
237        if self.mode.is_reg() {
238            self.mode &= !FileMode::ISUID;
239            self.clear_sgid_bit();
240        }
241    }
242
243    fn clear_sgid_bit(&mut self) {
244        // If the group execute bit is not set, the setgid bit actually indicates mandatory
245        // locking and should not be cleared.
246        if self.mode.intersects(FileMode::IXGRP) {
247            self.mode &= !FileMode::ISGID;
248        }
249    }
250
251    fn clear_suid_and_sgid_bits(&mut self) {
252        self.mode &= !FileMode::ISUID;
253        self.clear_sgid_bit();
254    }
255
256    pub fn cred(&self) -> FsCred {
257        FsCred { uid: self.uid, gid: self.gid }
258    }
259
260    pub fn apply_suid_and_sgid(&self, creds: &mut Credentials) {
261        if self.mode.contains(FileMode::ISUID) {
262            creds.euid = self.uid;
263        }
264
265        // See <https://man7.org/linux/man-pages/man7/inode.7.html>:
266        //
267        //   For an executable file, the set-group-ID bit causes the
268        //   effective group ID of a process that executes the file to change
269        //   as described in execve(2).  For a file that does not have the
270        //   group execution bit (S_IXGRP) set, the set-group-ID bit indicates
271        //   mandatory file/record locking.
272        if self.mode.contains(FileMode::ISGID | FileMode::IXGRP) {
273            creds.egid = self.gid;
274        }
275    }
276}
277
/// State of the `flock`-style lock on a node (see `FileObject::flock`).
#[derive(Default)]
struct FlockInfo {
    /// Whether the node is currently locked. The meaning of the different values is:
    /// - `None`: The node is not locked.
    /// - `Some(false)`: The node is locked non-exclusively.
    /// - `Some(true)`: The node is locked exclusively.
    locked_exclusive: Option<bool>,
    /// The FileObjects that hold the lock, referenced weakly.
    locking_handles: Vec<WeakFileHandle>,
    /// The queue used to notify tasks waiting on the lock.
    wait_queue: WaitQueue,
}
290
291impl FlockInfo {
292    /// Removes all file handle not holding `predicate` from the list of object holding the lock. If
293    /// this empties the list, unlocks the node and notifies all waiting processes.
294    pub fn retain<F>(&mut self, predicate: F)
295    where
296        F: Fn(&FileObject) -> bool,
297    {
298        if !self.locking_handles.is_empty() {
299            self.locking_handles
300                .retain(|w| if let Some(fh) = w.upgrade() { predicate(&fh) } else { false });
301            if self.locking_handles.is_empty() {
302                self.locked_exclusive = None;
303                self.wait_queue.notify_all();
304            }
305        }
306    }
307}
308
/// Default value, in bytes, of `FsNodeInfo::blksize` (assigned by `FsNodeInfo::new`).
///
/// NOTE(review): the previous comment said "`st_blksize` is measured in units of 512 bytes";
/// presumably that referred to the 512-byte unit of `st_blocks` — confirm.
pub const DEFAULT_BYTES_PER_BLOCK: usize = 512;
311
312pub struct FlockOperation {
313    operation: u32,
314}
315
316impl FlockOperation {
317    pub fn from_flags(operation: u32) -> Result<Self, Errno> {
318        if operation & !(LOCK_SH | LOCK_EX | LOCK_UN | LOCK_NB) != 0 {
319            return error!(EINVAL);
320        }
321        if [LOCK_SH, LOCK_EX, LOCK_UN].iter().filter(|&&o| operation & o == o).count() != 1 {
322            return error!(EINVAL);
323        }
324        Ok(Self { operation })
325    }
326
327    pub fn is_unlock(&self) -> bool {
328        self.operation & LOCK_UN > 0
329    }
330
331    pub fn is_lock_exclusive(&self) -> bool {
332        self.operation & LOCK_EX > 0
333    }
334
335    pub fn is_blocking(&self) -> bool {
336        self.operation & LOCK_NB == 0
337    }
338}
339
impl FileObject {
    /// Advisory locking.
    ///
    /// See flock(2).
    ///
    /// Applies `operation` to the flock state of the node behind this file object, waiting
    /// (unless the operation is non-blocking) until a conflicting lock is released.
    ///
    /// # Errors
    ///
    /// - `EBADF` if this file object was opened with `OpenFlags::PATH`.
    /// - `EAGAIN` if the lock cannot be taken right now and the operation is non-blocking.
    /// - Any error returned by `security::check_file_lock_access` or by the wait itself.
    pub fn flock(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        operation: FlockOperation,
    ) -> Result<(), Errno> {
        if self.flags().contains(OpenFlags::PATH) {
            return error!(EBADF);
        }
        security::check_file_lock_access(current_task, self)?;
        // Each iteration re-takes the FlockInfo lock and re-evaluates the state from scratch.
        loop {
            let mut flock_info = self.name.entry.node.ensure_rare_data().flock_info.lock();
            if operation.is_unlock() {
                // Drop this file object from the holders; `retain` unlocks the node and
                // notifies the waiters if it was the last one.
                flock_info.retain(|fh| !std::ptr::eq(fh, self));
                return Ok(());
            }
            // Operation is a locking operation.
            // 1. File is not locked
            if flock_info.locked_exclusive.is_none() {
                flock_info.locked_exclusive = Some(operation.is_lock_exclusive());
                flock_info.locking_handles.push(self.weak_handle.clone());
                return Ok(());
            }

            let file_lock_is_exclusive = flock_info.locked_exclusive == Some(true);
            // Holders are compared by address: this file object holds the lock if one of the
            // live handles points at `self`.
            let fd_has_lock = flock_info
                .locking_handles
                .iter()
                .find_map(|w| {
                    w.upgrade().and_then(|fh| {
                        if std::ptr::eq(&fh as &FileObject, self) { Some(()) } else { None }
                    })
                })
                .is_some();

            // 2. File is locked, but this fd already has a lock
            if fd_has_lock {
                if operation.is_lock_exclusive() == file_lock_is_exclusive {
                    // Correct lock is already held, return.
                    return Ok(());
                } else {
                    // Incorrect lock is held. Release the lock and loop back to try to reacquire
                    // it. flock doesn't guarantee atomic lock type switching.
                    flock_info.retain(|fh| !std::ptr::eq(fh, self));
                    continue;
                }
            }

            // 3. File is locked, and fd doesn't have a lock.
            if !file_lock_is_exclusive && !operation.is_lock_exclusive() {
                // The lock is not exclusive, let's grab it.
                flock_info.locking_handles.push(self.weak_handle.clone());
                return Ok(());
            }

            // 4. The operation cannot be done at this time.
            if !operation.is_blocking() {
                return error!(EAGAIN);
            }

            // Register a waiter to be notified when the lock is released. Release the lock on
            // FlockInfo, and wait. The waiter is registered before the lock is dropped.
            let waiter = Waiter::new();
            flock_info.wait_queue.wait_async(&waiter);
            std::mem::drop(flock_info);
            waiter.wait(locked, current_task)?;
        }
    }
}
413
// The inner mod is required because bitflags cannot pass the attribute through to the single
// variant, and attributes cannot be applied to macro invocations.
mod inner_flags {
    // Part of the code for the AT_STATX_SYNC_AS_STAT case that's produced by the macro triggers the
    // lint, but as a whole, the produced code is still correct.
    #![allow(clippy::bad_bit_mask)] // TODO(b/303500202) Remove once addressed in bitflags.
    use super::{bitflags, uapi};

    bitflags! {
        /// Bit flags whose values are the `uapi` `AT_*` constants listed below, plus
        /// `STATX_ATTR_VERITY`.
        ///
        /// NOTE(review): presumably these are the flags accepted by `statx`; confirm against
        /// the callers.
        #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub struct StatxFlags: u32 {
            const AT_SYMLINK_NOFOLLOW = uapi::AT_SYMLINK_NOFOLLOW;
            const AT_EMPTY_PATH = uapi::AT_EMPTY_PATH;
            const AT_NO_AUTOMOUNT = uapi::AT_NO_AUTOMOUNT;
            const AT_STATX_SYNC_AS_STAT = uapi::AT_STATX_SYNC_AS_STAT;
            const AT_STATX_FORCE_SYNC = uapi::AT_STATX_FORCE_SYNC;
            const AT_STATX_DONT_SYNC = uapi::AT_STATX_DONT_SYNC;
            const STATX_ATTR_VERITY = uapi::STATX_ATTR_VERITY;
        }
    }
}
435
436pub use inner_flags::StatxFlags;
437
/// Which kind of child an unlink operation targets.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum UnlinkKind {
    /// Unlink a directory.
    Directory,

    /// Unlink a non-directory.
    NonDirectory,
}
446
/// The result of reading a symlink (see `FsNodeOps::readlink`).
pub enum SymlinkTarget {
    /// The target expressed as a path string.
    Path(FsString),
    /// The target expressed directly as a namespace node.
    Node(NamespaceNode),
}
451
452#[derive(Clone, Copy, PartialEq, Eq)]
453pub enum XattrOp {
454    /// Set the value of the extended attribute regardless of whether it exists.
455    Set,
456    /// Create a new extended attribute. Fail if it already exists.
457    Create,
458    /// Replace the value of the extended attribute. Fail if it doesn't exist.
459    Replace,
460}
461
462impl XattrOp {
463    pub fn into_flags(self) -> u32 {
464        match self {
465            Self::Set => 0,
466            Self::Create => uapi::XATTR_CREATE,
467            Self::Replace => uapi::XATTR_REPLACE,
468        }
469    }
470}
471
/// Either a value, or the size required to contain it.
#[derive(Clone, Debug, PartialEq)]
pub enum ValueOrSize<T> {
    /// The value itself.
    Value(T),
    /// The size needed to hold the value.
    Size(usize),
}

impl<T> ValueOrSize<T> {
    /// Transforms a contained value with `f`; a `Size` is passed through unchanged and `f` is
    /// not called.
    pub fn map<F, U>(self, f: F) -> ValueOrSize<U>
    where
        F: FnOnce(T) -> U,
    {
        let value = match self {
            Self::Value(value) => value,
            Self::Size(size) => return ValueOrSize::Size(size),
        };
        ValueOrSize::Value(f(value))
    }

    /// Returns the contained value.
    ///
    /// # Panics
    ///
    /// Panics if `self` is a `Size`.
    #[cfg(test)]
    pub fn unwrap(self) -> T {
        let Self::Value(value) = self else {
            panic!("Unwrap ValueOrSize that is a Size");
        };
        value
    }
}

impl<T> From<T> for ValueOrSize<T> {
    /// Wraps `value` as [`ValueOrSize::Value`].
    fn from(value: T) -> Self {
        ValueOrSize::Value(value)
    }
}
504
505#[derive(Copy, Clone, Eq, PartialEq, Debug)]
506pub enum FallocMode {
507    Allocate { keep_size: bool },
508    PunchHole,
509    Collapse,
510    Zero { keep_size: bool },
511    InsertRange,
512    UnshareRange,
513}
514
515impl FallocMode {
516    pub fn from_bits(mode: u32) -> Option<Self> {
517        // `fallocate()` allows only the following values for `mode`.
518        if mode == 0 {
519            Some(Self::Allocate { keep_size: false })
520        } else if mode == FALLOC_FL_KEEP_SIZE {
521            Some(Self::Allocate { keep_size: true })
522        } else if mode == FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE {
523            Some(Self::PunchHole)
524        } else if mode == FALLOC_FL_COLLAPSE_RANGE {
525            Some(Self::Collapse)
526        } else if mode == FALLOC_FL_ZERO_RANGE {
527            Some(Self::Zero { keep_size: false })
528        } else if mode == FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE {
529            Some(Self::Zero { keep_size: true })
530        } else if mode == FALLOC_FL_INSERT_RANGE {
531            Some(Self::InsertRange)
532        } else if mode == FALLOC_FL_UNSHARE_RANGE {
533            Some(Self::UnshareRange)
534        } else {
535            None
536        }
537    }
538}
539
/// Why an access check is being performed; passed to `FsNodeOps::check_access`.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum CheckAccessReason {
    Access,
    Chdir,
    Chroot,
    Exec,
    // NOTE(review): `now` presumably distinguishes "set to the current time" from an explicit
    // timestamp — confirm against the callers.
    ChangeTimestamps { now: bool },
    InternalPermissionChecks,
}
549
/// A `SmallVec` with inline room for 8 elements; the return type of
/// `FsNodeOps::lookup_pipelined`.
pub type LookupVec<T> = SmallVec<[T; 8]>;
551
552pub trait FsNodeOps: Send + Sync + AsAny + 'static {
553    /// Delegate the access check to the node.
554    fn check_access(
555        &self,
556        _locked: &mut Locked<FileOpsCore>,
557        node: &FsNode,
558        current_task: &CurrentTask,
559        access: security::PermissionFlags,
560        info: &DynamicLockDepRwLock<FsNodeInfo>,
561        reason: CheckAccessReason,
562        audit_context: security::Auditable<'_>,
563    ) -> Result<(), Errno> {
564        node.default_check_access_impl(current_task, access, reason, info.read(), audit_context)
565    }
566
567    /// Build the [`DirEntryOps`] for a new [`DirEntry`] that will be associated
568    /// to this node.
569    fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
570        Box::new(DefaultDirEntryOps)
571    }
572
573    /// Build the `FileOps` for the file associated to this node.
574    ///
575    /// The returned FileOps will be used to create a FileObject, which might
576    /// be assigned an FdNumber.
577    fn create_file_ops(
578        &self,
579        locked: &mut Locked<FileOpsCore>,
580        node: &FsNode,
581        _current_task: &CurrentTask,
582        flags: OpenFlags,
583    ) -> Result<Box<dyn FileOps>, Errno>;
584
585    /// Find an existing child node and populate the child parameter. Return the node.
586    ///
587    /// The child parameter is an empty node. Operations other than initialize may panic before
588    /// initialize is called.
589    fn lookup(
590        &self,
591        _locked: &mut Locked<FileOpsCore>,
592        _node: &FsNode,
593        _current_task: &CurrentTask,
594        name: &FsStr,
595    ) -> Result<FsNodeHandle, Errno> {
596        // The default implementation here is suitable for filesystems that have permanent entries;
597        // entries that already exist will get found in the cache and shouldn't get this far.
598        error!(ENOENT, format!("looking for {name}"))
599    }
600
601    /// Returns whether this node supports pipelined lookups.
602    fn has_lookup_pipelined(&self) -> bool {
603        false
604    }
605
606    /// Find multiple children nodes in sequence.
607    ///
608    /// This can be used to pipeline lookups in filesystems that support it.
609    fn lookup_pipelined(
610        &self,
611        _locked: &mut Locked<FileOpsCore>,
612        _node: &FsNode,
613        _current_task: &CurrentTask,
614        _names: &[&FsStr],
615    ) -> LookupVec<Result<FsNodeHandle, Errno>> {
616        panic!("has_lookup_pipelined should be false");
617    }
618
619    /// Create and return the given child node.
620    ///
621    /// The mode field of the FsNodeInfo indicates what kind of child to
622    /// create.
623    ///
624    /// This function is never called with FileMode::IFDIR. The mkdir function
625    /// is used to create directories instead.
626    fn mknod(
627        &self,
628        locked: &mut Locked<FileOpsCore>,
629        _node: &FsNode,
630        _current_task: &CurrentTask,
631        _name: &FsStr,
632        _mode: FileMode,
633        _dev: DeviceId,
634        _owner: FsCred,
635    ) -> Result<FsNodeHandle, Errno>;
636
637    /// Create and return the given child node as a subdirectory.
638    fn mkdir(
639        &self,
640        locked: &mut Locked<FileOpsCore>,
641        _node: &FsNode,
642        _current_task: &CurrentTask,
643        _name: &FsStr,
644        _mode: FileMode,
645        _owner: FsCred,
646    ) -> Result<FsNodeHandle, Errno>;
647
648    /// Creates a symlink with the given `target` path.
649    fn create_symlink(
650        &self,
651        locked: &mut Locked<FileOpsCore>,
652        _node: &FsNode,
653        _current_task: &CurrentTask,
654        _name: &FsStr,
655        _target: &FsStr,
656        _owner: FsCred,
657    ) -> Result<FsNodeHandle, Errno>;
658
659    /// Creates an anonymous file.
660    ///
661    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
662    ///
663    /// Used by O_TMPFILE.
664    fn create_tmpfile(
665        &self,
666        _node: &FsNode,
667        _current_task: &CurrentTask,
668        _mode: FileMode,
669        _owner: FsCred,
670    ) -> Result<FsNodeHandle, Errno> {
671        error!(EOPNOTSUPP)
672    }
673
674    /// Reads the symlink from this node.
675    fn readlink(
676        &self,
677        _locked: &mut Locked<FileOpsCore>,
678        _node: &FsNode,
679        _current_task: &CurrentTask,
680    ) -> Result<SymlinkTarget, Errno> {
681        error!(EINVAL)
682    }
683
684    /// Create a hard link with the given name to the given child.
685    fn link(
686        &self,
687        _locked: &mut Locked<FileOpsCore>,
688        _node: &FsNode,
689        _current_task: &CurrentTask,
690        _name: &FsStr,
691        _child: &FsNodeHandle,
692    ) -> Result<(), Errno> {
693        error!(EPERM)
694    }
695
696    /// Remove the child with the given name, if the child exists.
697    ///
698    /// The UnlinkKind parameter indicates whether the caller intends to unlink
699    /// a directory or a non-directory child.
700    fn unlink(
701        &self,
702        locked: &mut Locked<FileOpsCore>,
703        _node: &FsNode,
704        _current_task: &CurrentTask,
705        _name: &FsStr,
706        _child: &FsNodeHandle,
707    ) -> Result<(), Errno>;
708
709    /// Acquire the necessary append lock for the operations that depend on them.
710    /// Should be done before calling `allocate` or `truncate` to avoid lock ordering issues.
711    fn append_lock_read<'a>(
712        &'a self,
713        locked: &'a mut Locked<BeforeFsNodeAppend>,
714        node: &'a FsNode,
715        current_task: &CurrentTask,
716    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FsNodeAppend>), Errno> {
717        return node.append_lock.read_and(locked, current_task);
718    }
719
720    /// Acquire the necessary append lock for operations that need exclusive access (e.g., write append).
721    fn append_lock_write<'a>(
722        &'a self,
723        locked: &'a mut Locked<BeforeFsNodeAppend>,
724        node: &'a FsNode,
725        current_task: &CurrentTask,
726    ) -> Result<(AppendLockWriteGuard<'a>, &'a mut Locked<FsNodeAppend>), Errno> {
727        return node.append_lock.write_and(locked, current_task);
728    }
729
730    /// Change the length of the file.
731    fn truncate(
732        &self,
733        _locked: &mut Locked<FileOpsCore>,
734        _guard: &AppendLockWriteGuard<'_>,
735        _node: &FsNode,
736        _current_task: &CurrentTask,
737        _length: u64,
738    ) -> Result<(), Errno> {
739        error!(EINVAL)
740    }
741
742    /// Manipulate allocated disk space for the file.
743    fn allocate(
744        &self,
745        _locked: &mut Locked<FileOpsCore>,
746        _guard: &AppendLockWriteGuard<'_>,
747        _node: &FsNode,
748        _current_task: &CurrentTask,
749        _mode: FallocMode,
750        _offset: u64,
751        _length: u64,
752    ) -> Result<(), Errno> {
753        error!(EINVAL)
754    }
755
756    /// Update the supplied info with initial state (e.g. size) for the node.
757    ///
758    /// FsNode calls this method when created, to allow the FsNodeOps to
759    /// set appropriate initial values in the FsNodeInfo.
760    fn initial_info(&self, _info: &mut FsNodeInfo) {}
761
762    /// Update node.info as needed.
763    ///
764    /// FsNode calls this method before converting the FsNodeInfo struct into
765    /// the uapi::stat struct to give the file system a chance to update this data
766    /// before it is used by clients.
767    ///
768    /// File systems that keep the FsNodeInfo up-to-date do not need to
769    /// override this function.
770    ///
771    /// Return a read guard for the updated information.
772    fn fetch_and_refresh_info<'a>(
773        &self,
774        _locked: &mut Locked<FileOpsCore>,
775        _node: &FsNode,
776        _current_task: &CurrentTask,
777        info: &'a DynamicLockDepRwLock<FsNodeInfo>,
778    ) -> Result<LockDepReadGuard<'a, FsNodeInfo>, Errno> {
779        Ok(info.read())
780    }
781
782    /// Syncs cached data to persistent storage.
783    fn sync(&self, _node: &FsNode, _current_task: &CurrentTask) -> Result<(), Errno> {
784        Ok(())
785    }
786
787    /// Update node attributes persistently.
788    fn update_attributes(
789        &self,
790        _locked: &mut Locked<FileOpsCore>,
791        _node: &FsNode,
792        _current_task: &CurrentTask,
793        _info: &FsNodeInfo,
794        _has: zxio_node_attr_has_t,
795    ) -> Result<(), Errno> {
796        Ok(())
797    }
798
799    /// Get an extended attribute on the node.
800    ///
801    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
802    /// instead return the size of the attribute, and can return an ERANGE error if max_size is not
803    /// 0, and lesser than the required size.
804    fn get_xattr(
805        &self,
806        _locked: &mut Locked<FileOpsCore>,
807        _node: &FsNode,
808        _current_task: &CurrentTask,
809        _name: &FsStr,
810        _max_size: usize,
811    ) -> Result<ValueOrSize<FsString>, Errno> {
812        error!(ENOTSUP)
813    }
814
815    /// Set an extended attribute on the node.
816    fn set_xattr(
817        &self,
818        _locked: &mut Locked<FileOpsCore>,
819        _node: &FsNode,
820        _current_task: &CurrentTask,
821        _name: &FsStr,
822        _value: &FsStr,
823        _op: XattrOp,
824    ) -> Result<(), Errno> {
825        error!(ENOTSUP)
826    }
827
828    fn remove_xattr(
829        &self,
830        _locked: &mut Locked<FileOpsCore>,
831        _node: &FsNode,
832        _current_task: &CurrentTask,
833        _name: &FsStr,
834    ) -> Result<(), Errno> {
835        error!(ENOTSUP)
836    }
837
838    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
839    /// instead return the size of the 0 separated string needed to represent the value, and can
840    /// return an ERANGE error if max_size is not 0, and lesser than the required size.
841    fn list_xattrs(
842        &self,
843        _locked: &mut Locked<FileOpsCore>,
844        _node: &FsNode,
845        _current_task: &CurrentTask,
846        _max_size: usize,
847    ) -> Result<ValueOrSize<Vec<FsString>>, Errno> {
848        error!(ENOTSUP)
849    }
850
851    /// Called when the FsNode is freed by the Kernel.
852    fn forget(
853        self: Box<Self>,
854        _locked: &mut Locked<FileOpsCore>,
855        _current_task: &CurrentTask,
856        _info: FsNodeInfo,
857    ) -> Result<(), Errno> {
858        Ok(())
859    }
860
861    ////////////////////
862    // FS-Verity operations
863
864    /// Marks that FS-Verity is being built. Writes fsverity descriptor and merkle tree, the latter
865    /// computed by the filesystem.
866    /// This should ensure there are no writable file handles. Returns EEXIST if the file was
867    /// already fsverity-enabled. Returns EBUSY if this ioctl was already running on this file.
868    fn enable_fsverity(
869        &self,
870        _locked: &mut Locked<FileOpsCore>,
871        _node: &FsNode,
872        _current_task: &CurrentTask,
873        _descriptor: &fsverity_descriptor,
874    ) -> Result<(), Errno> {
875        error!(ENOTSUP)
876    }
877
878    /// Read fsverity descriptor, if the node is fsverity-enabled. Else returns ENODATA.
879    fn get_fsverity_descriptor(&self, _log_blocksize: u8) -> Result<fsverity_descriptor, Errno> {
880        error!(ENOTSUP)
881    }
882
    /// The key used to identify this node in the file system's node cache.
    ///
    /// For many file systems, this will be the same as the inode number. However, some file
    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
    ///
    /// The default implementation returns the node's inode number (`node.ino`).
    fn node_key(&self, node: &FsNode) -> ino_t {
        node.ino
    }
890
891    /// Returns the size of the file.
892    fn get_size(
893        &self,
894        locked: &mut Locked<FileOpsCore>,
895        node: &FsNode,
896        current_task: &CurrentTask,
897    ) -> Result<usize, Errno> {
898        let info = node.fetch_and_refresh_info(locked, current_task)?;
899        Ok(info.size.try_into().map_err(|_| errno!(EINVAL))?)
900    }
901}
902
903impl<T> From<T> for Box<dyn FsNodeOps>
904where
905    T: FsNodeOps,
906{
907    fn from(ops: T) -> Box<dyn FsNodeOps> {
908        Box::new(ops)
909    }
910}
911
/// Implements [`FsNodeOps`] methods in a way that makes sense for symlinks.
/// You must implement [`FsNodeOps::readlink`].
///
/// Expands to the directory-only stubs from [`fs_node_impl_not_dir`] plus a `create_file_ops`
/// that asserts the node is a symlink and then panics, since symlink nodes cannot be opened.
#[macro_export]
macro_rules! fs_node_impl_symlink {
    () => {
        $crate::vfs::fs_node_impl_not_dir!();

        fn create_file_ops(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            // Fully qualified so that the expansion does not depend on the call site's imports.
            _current_task: &$crate::task::CurrentTask,
            _flags: starnix_uapi::open_flags::OpenFlags,
        ) -> Result<Box<dyn $crate::vfs::FileOps>, starnix_uapi::errors::Errno> {
            assert!(node.is_lnk());
            unreachable!("Symlink nodes cannot be opened.");
        }
    };
}
931
/// Implements the mutating directory methods of [`FsNodeOps`] for a read-only directory.
///
/// The generated `check_access` rejects any request that includes write access with `EROFS`
/// and otherwise defers to `FsNode::default_check_access_impl`. The generated `mkdir`, `mknod`,
/// `create_symlink`, `link` and `unlink` always fail with `EROFS`.
#[macro_export]
macro_rules! fs_node_impl_dir_readonly {
    () => {
        fn check_access(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            current_task: &$crate::task::CurrentTask,
            permission_flags: $crate::security::PermissionFlags,
            info: &starnix_sync::DynamicLockDepRwLock<$crate::vfs::FsNodeInfo>,
            reason: $crate::vfs::CheckAccessReason,
            audit_context: $crate::security::Auditable<'_>,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            let access = permission_flags.as_access();
            if access.contains(starnix_uapi::file_mode::Access::WRITE) {
                return starnix_uapi::error!(
                    EROFS,
                    format!("check_access failed: read-only directory")
                );
            }
            node.default_check_access_impl(
                current_task,
                permission_flags,
                reason,
                info.read(),
                audit_context,
            )
        }

        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mkdir failed: {:?}", name))
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_id::DeviceId,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mknod failed: {:?}", name))
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("symlink failed: {:?}", name))
        }

        fn link(
            &self,
            // Fully qualified like every other method here, so that the expansion does not
            // require `Locked` and `FileOpsCore` to be imported at the call site.
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("link failed: {:?}", name))
        }

        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("unlink failed: {:?}", name))
        }
    };
}
1021
/// Trait that objects can implement if they need to handle extended attribute storage. Allows
/// delegating extended attribute operations in [`FsNodeOps`] to another object.
///
/// Compared to the corresponding [`FsNodeOps`] methods, these take neither the node, the current
/// task, nor a size limit, and they return plain values rather than `ValueOrSize`.
///
/// See [`fs_node_impl_xattr_delegate`] for usage details.
pub trait XattrStorage {
    /// Delegate for [`FsNodeOps::get_xattr`].
    ///
    /// Returns the value stored under `name`.
    fn get_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<FsString, Errno>;

    /// Delegate for [`FsNodeOps::set_xattr`].
    ///
    /// Stores `value` under `name`; `op` is forwarded unchanged from the caller.
    fn set_xattr(
        &self,
        locked: &mut Locked<FileOpsCore>,
        name: &FsStr,
        value: &FsStr,
        op: XattrOp,
    ) -> Result<(), Errno>;

    /// Delegate for [`FsNodeOps::remove_xattr`].
    fn remove_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<(), Errno>;

    /// Delegate for [`FsNodeOps::list_xattrs`].
    fn list_xattrs(&self, locked: &mut Locked<FileOpsCore>) -> Result<Vec<FsString>, Errno>;
}
1045
/// Implements extended attribute ops for [`FsNodeOps`] by delegating to another object which
/// implements the [`XattrStorage`] trait or a similar interface. For example:
///
/// ```
/// struct Xattrs {}
///
/// impl XattrStorage for Xattrs {
///     // implement XattrStorage
/// }
///
/// struct Node {
///     xattrs: Xattrs
/// }
///
/// impl FsNodeOps for Node {
///     // Delegate extended attribute ops in FsNodeOps to self.xattrs
///     fs_node_impl_xattr_delegate!(self, self.xattrs);
///
///     // add other FsNodeOps impls here
/// }
/// ```
///
/// The generated methods ignore the node, the current task and the requested size; values
/// returned by the delegate are converted into `ValueOrSize` with `.into()`.
#[macro_export]
macro_rules! fs_node_impl_xattr_delegate {
    ($self:ident, $delegate:expr) => {
        // `FsNode` and `CurrentTask` are spelled with `$crate` paths so that the expansion does
        // not depend on what the call site has imported.
        fn get_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _size: usize,
        ) -> Result<$crate::vfs::ValueOrSize<$crate::vfs::FsString>, starnix_uapi::errors::Errno> {
            Ok($delegate.get_xattr(locked, name)?.into())
        }

        fn set_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            value: &$crate::vfs::FsStr,
            op: $crate::vfs::XattrOp,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.set_xattr(locked, name, value, op)
        }

        fn remove_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.remove_xattr(locked, name)
        }

        fn list_xattrs(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _size: usize,
        ) -> Result<$crate::vfs::ValueOrSize<Vec<$crate::vfs::FsString>>, starnix_uapi::errors::Errno> {
            Ok($delegate.list_xattrs(locked)?.into())
        }
    };
}
1114
/// Stubs out [`FsNodeOps`] methods that only apply to directories.
///
/// The generated `lookup`, `mknod`, `mkdir`, `create_symlink` and `unlink` ignore their
/// arguments and always fail with `ENOTDIR`.
#[macro_export]
macro_rules! fs_node_impl_not_dir {
    () => {
        fn lookup(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_id::DeviceId,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }
    };
}
1178
/// Describes how a timestamp should be updated.
// NOTE(review): the variants presumably mirror `UTIME_NOW`, `UTIME_OMIT` and an explicit
// timestamp as accepted by utimensat(2) — confirm against the callers.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TimeUpdateType {
    /// Presumably: set the timestamp to the current time.
    Now,
    /// Presumably: leave the timestamp unchanged.
    Omit,
    /// Carries an explicit UTC instant, presumably the value to set the timestamp to.
    Time(UtcInstant),
}
1185
1186// Public re-export of macros allows them to be used like regular rust items.
1187pub use fs_node_impl_dir_readonly;
1188pub use fs_node_impl_not_dir;
1189pub use fs_node_impl_symlink;
1190pub use fs_node_impl_xattr_delegate;
1191
/// [`FsNodeOps`] for nodes that must never reach `create_file_ops`.
///
/// Directory-only operations are stubbed out with `fs_node_impl_not_dir!`, and
/// `create_file_ops` panics.
// NOTE(review): presumably used for device, FIFO and socket nodes, which `FsNode::open`
// dispatches on before falling back to `create_file_ops` — confirm against the callers.
pub struct SpecialNode;

impl FsNodeOps for SpecialNode {
    fs_node_impl_not_dir!();

    /// Never expected to be called; panics if it is.
    fn create_file_ops(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _node: &FsNode,
        _current_task: &CurrentTask,
        _flags: OpenFlags,
    ) -> Result<Box<dyn FileOps>, Errno> {
        unreachable!("Special nodes cannot be opened.");
    }
}
1207
1208impl FsNode {
    /// Returns true if the `fs_node` is private to the `Kernel`/`FileSystem`, in which
    /// case both MAC and DAC checks should be skipped.
    ///
    /// This is determined solely by the `FsNodeFlags::IS_PRIVATE` flag of the node.
    pub fn is_private(&self) -> bool {
        self.flags.contains(FsNodeFlags::IS_PRIVATE)
    }
1214
1215    /// Create a node without inserting it into the FileSystem node cache.
1216    ///
1217    /// This is usually not what you want!
1218    /// Only use if you're also using get_or_create_node, like ext4.
1219    pub fn new_uncached(
1220        ino: ino_t,
1221        ops: impl Into<Box<dyn FsNodeOps>>,
1222        fs: &FileSystemHandle,
1223        info: FsNodeInfo,
1224        flags: FsNodeFlags,
1225    ) -> FsNodeHandle {
1226        let ops = ops.into();
1227        FsNodeHandle::new(Self::new_internal(ino, ops, Arc::downgrade(fs), info, flags).into())
1228    }
1229
1230    fn new_internal(
1231        ino: ino_t,
1232        ops: Box<dyn FsNodeOps>,
1233        fs: Weak<FileSystem>,
1234        info: FsNodeInfo,
1235        flags: FsNodeFlags,
1236    ) -> Self {
1237        // Allow the FsNodeOps to populate initial info.
1238        let mut info = info;
1239        ops.initial_info(&mut info);
1240
1241        let fs_lockdep_type =
1242            fs.upgrade().map(|fs| fs.fs_lockdep_type()).unwrap_or(FsLockDepType::Normal);
1243        let info_lock = match fs_lockdep_type {
1244            FsLockDepType::Normal => DynamicLockDepRwLock::new::<FsNodeInfoLevel>(info),
1245            FsLockDepType::Fuse => DynamicLockDepRwLock::new::<FuseFsNodeInfoLevel>(info),
1246            FsLockDepType::Recursive => DynamicLockDepRwLock::new::<FsNodeInfoRecursiveLevel>(info),
1247        };
1248
1249        // The linter will fail in non test mode as it will not see the lock check.
1250        #[allow(clippy::let_and_return)]
1251        {
1252            let result = Self {
1253                ino,
1254                flags,
1255                ops,
1256                fs,
1257                info: info_lock,
1258                append_lock: Default::default(),
1259                rare_data: Default::default(),
1260                write_guard_state: Default::default(),
1261                fsverity: LockDepMutex::new(FsVerityState::None),
1262                security_state: Default::default(),
1263            };
1264            #[cfg(any(test, debug_assertions))]
1265            {
1266                #[allow(
1267                    clippy::undocumented_unsafe_blocks,
1268                    reason = "Force documented unsafe blocks in Starnix"
1269                )]
1270                let locked = unsafe { Unlocked::new() };
1271                let _l1 = result.append_lock.read_for_lock_ordering(locked);
1272                let _l2 = result.info.read();
1273                let _l3 = result.write_guard_state.lock();
1274                let _l4 = result.fsverity.lock();
1275            }
1276            result
1277        }
1278    }
1279
    /// Returns a strong handle to the `FileSystem` this node belongs to.
    ///
    /// # Panics
    ///
    /// Panics if the weak reference to the file system can no longer be upgraded.
    pub fn fs(&self) -> FileSystemHandle {
        self.fs.upgrade().expect("FileSystem did not live long enough")
    }
1283
1284    pub fn ops(&self) -> &dyn FsNodeOps {
1285        self.ops.as_ref()
1286    }
1287
1288    /// Returns an error if this node is encrypted and locked. Does not require
1289    /// fetch_and_refresh_info because FS_IOC_SET_ENCRYPTION_POLICY updates info and once a node is
1290    /// encrypted, it remains encrypted forever.
1291    pub fn fail_if_locked(
1292        &self,
1293        _current_task: &CurrentTask,
1294        node_info: &FsNodeInfo,
1295    ) -> Result<(), Errno> {
1296        if let Some(wrapping_key_id) = node_info.wrapping_key_id {
1297            let crypt_service = self.fs().crypt_service().ok_or_else(|| errno!(ENOKEY))?;
1298            if !crypt_service.contains_key(EncryptionKeyId::from(wrapping_key_id)) {
1299                return error!(ENOKEY);
1300            }
1301        }
1302        Ok(())
1303    }
1304
1305    /// Returns the `FsNode`'s `FsNodeOps` as a `&T`, or `None` if the downcast fails.
1306    pub fn downcast_ops<T>(&self) -> Option<&T>
1307    where
1308        T: 'static,
1309    {
1310        self.ops().as_any().downcast_ref::<T>()
1311    }
1312
1313    pub fn on_file_closed(&self, file: &FileObjectState) {
1314        if let Some(rare_data) = self.rare_data.get() {
1315            let mut flock_info = rare_data.flock_info.lock();
1316            // This function will drop the flock from `file` because the `WeakFileHandle` for
1317            // `file` will no longer upgrade to an `FileHandle`.
1318            flock_info.retain(|_| true);
1319        }
1320        self.record_lock_release(RecordLockOwner::FileObject(file.id));
1321    }
1322
1323    pub fn record_lock(
1324        &self,
1325        locked: &mut Locked<Unlocked>,
1326        current_task: &CurrentTask,
1327        file: &FileObject,
1328        cmd: RecordLockCommand,
1329        flock: uapi::flock,
1330    ) -> Result<Option<uapi::flock>, Errno> {
1331        self.ensure_rare_data().record_locks.lock(locked, current_task, file, cmd, flock)
1332    }
1333
1334    /// Release all record locks acquired by the given owner.
1335    pub fn record_lock_release(&self, owner: RecordLockOwner) {
1336        if let Some(rare_data) = self.rare_data.get() {
1337            rare_data.record_locks.release_locks(owner);
1338        }
1339    }
1340
    /// Creates the `DirEntryOps` for this node by delegating to its `FsNodeOps`.
    pub fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
        self.ops().create_dir_entry_ops()
    }
1344
1345    pub fn create_file_ops<L>(
1346        &self,
1347        locked: &mut Locked<L>,
1348        current_task: &CurrentTask,
1349        flags: OpenFlags,
1350    ) -> Result<Box<dyn FileOps>, Errno>
1351    where
1352        L: LockEqualOrBefore<FileOpsCore>,
1353    {
1354        let locked = locked.cast_locked::<FileOpsCore>();
1355        self.ops().create_file_ops(locked, self, current_task, flags)
1356    }
1357
1358    pub fn open(
1359        &self,
1360        locked: &mut Locked<Unlocked>,
1361        current_task: &CurrentTask,
1362        namespace_node: &NamespaceNode,
1363        flags: OpenFlags,
1364        access_check: AccessCheck,
1365    ) -> Result<Box<dyn FileOps>, Errno> {
1366        // If O_PATH is set, there is no need to create a real FileOps because
1367        // most file operations are disabled.
1368        if flags.contains(OpenFlags::PATH) {
1369            return Ok(Box::new(OPathOps::new()));
1370        }
1371
1372        let access = access_check.resolve(flags);
1373        if access.is_nontrivial() {
1374            if flags.contains(OpenFlags::NOATIME) {
1375                self.check_o_noatime_allowed(current_task)?;
1376            }
1377
1378            // `flags` doesn't contain any information about the EXEC permission. Instead the syscalls
1379            // used to execute a file (`sys_execve` and `sys_execveat`) call `open()` with the EXEC
1380            // permission request in `access`.
1381            let mut permission_flags = PermissionFlags::from(access);
1382
1383            // The `APPEND` flag exists only in `flags`, to modify the behaviour of
1384            // `PermissionFlags::WRITE`
1385            if flags.contains(OpenFlags::APPEND) {
1386                permission_flags |= security::PermissionFlags::APPEND;
1387            }
1388
1389            // TODO: https://fxbug.dev/455782510 - Remove this once non-open() checks are fully
1390            // enforced.
1391            permission_flags |= security::PermissionFlags::FOR_OPEN;
1392
1393            self.check_access(
1394                locked,
1395                current_task,
1396                &namespace_node.mount,
1397                permission_flags,
1398                CheckAccessReason::InternalPermissionChecks,
1399                namespace_node,
1400            )?;
1401        }
1402
1403        let (mode, rdev) = {
1404            // Don't hold the info lock while calling into open_device or self.ops().
1405            // TODO: The mode and rdev are immutable and shouldn't require a lock to read.
1406            let info = self.info();
1407            (info.mode, info.rdev)
1408        };
1409
1410        match mode & FileMode::IFMT {
1411            FileMode::IFCHR => {
1412                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1413                    return error!(EACCES);
1414                }
1415                current_task.kernel().open_device(
1416                    locked,
1417                    current_task,
1418                    namespace_node,
1419                    flags,
1420                    rdev,
1421                    DeviceMode::Char,
1422                )
1423            }
1424            FileMode::IFBLK => {
1425                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1426                    return error!(EACCES);
1427                }
1428                current_task.kernel().open_device(
1429                    locked,
1430                    current_task,
1431                    namespace_node,
1432                    flags,
1433                    rdev,
1434                    DeviceMode::Block,
1435                )
1436            }
1437            FileMode::IFIFO => Pipe::open(locked, current_task, self.fifo(current_task), flags),
1438            // UNIX domain sockets can't be opened.
1439            FileMode::IFSOCK => error!(ENXIO),
1440            _ => self.create_file_ops(locked, current_task, flags),
1441        }
1442    }
1443
1444    pub fn lookup<L>(
1445        &self,
1446        locked: &mut Locked<L>,
1447        current_task: &CurrentTask,
1448        mount: &MountInfo,
1449        name: &FsStr,
1450    ) -> Result<FsNodeHandle, Errno>
1451    where
1452        L: LockEqualOrBefore<FileOpsCore>,
1453    {
1454        self.check_access(
1455            locked,
1456            current_task,
1457            mount,
1458            Access::EXEC,
1459            CheckAccessReason::InternalPermissionChecks,
1460            &[Auditable::Name(name), std::panic::Location::caller().into()],
1461        )?;
1462        let locked = locked.cast_locked::<FileOpsCore>();
1463        self.ops().lookup(locked, self, current_task, name)
1464    }
1465
1466    pub fn create_node<L>(
1467        &self,
1468        locked: &mut Locked<L>,
1469        current_task: &CurrentTask,
1470        mount: &MountInfo,
1471        name: &FsStr,
1472        mut mode: FileMode,
1473        dev: DeviceId,
1474        mut owner: FsCred,
1475    ) -> Result<FsNodeHandle, Errno>
1476    where
1477        L: LockEqualOrBefore<FileOpsCore>,
1478    {
1479        assert!(
1480            !matches!(mode.fmt(), FileMode::EMPTY | FileMode::IFLNK),
1481            "create_node with missing or symlink node type"
1482        );
1483
1484        self.check_access(
1485            locked,
1486            current_task,
1487            mount,
1488            Access::WRITE,
1489            CheckAccessReason::InternalPermissionChecks,
1490            security::Auditable::Name(name),
1491        )?;
1492
1493        if mode.is_dir() {
1494            // Even though the man page for mknod(2) says that mknod "cannot be used to create
1495            // directories" in starnix the mkdir syscall (`sys_mkdirat`) ends up calling
1496            // create_node.
1497            security::check_fs_node_mkdir_access(current_task, self, mode, name)?;
1498        } else {
1499            // https://man7.org/linux/man-pages/man2/mknod.2.html says on error EPERM:
1500            //
1501            //   mode requested creation of something other than a regular
1502            //   file, FIFO (named pipe), or UNIX domain socket, and the
1503            //   caller is not privileged (Linux: does not have the
1504            //   CAP_MKNOD capability); also returned if the filesystem
1505            //   containing pathname does not support the type of node
1506            //   requested.
1507            match mode.fmt() {
1508                FileMode::IFREG | FileMode::IFIFO | FileMode::IFSOCK => (),
1509                FileMode::IFCHR if dev == DeviceId::NONE => (),
1510                _ => security::check_task_capable(current_task, CAP_MKNOD)?,
1511            }
1512
1513            if mode.is_reg() {
1514                security::check_fs_node_create_access(current_task, self, mode, name)?;
1515            } else {
1516                security::check_fs_node_mknod_access(current_task, self, mode, name, dev)?;
1517            }
1518        }
1519
1520        // Propagate sticky bit(s) from parent directory to the child.
1521        self.update_metadata_for_child(current_task, &mut mode, &mut owner);
1522
1523        // Delegate to the `ops` implementation to actually create the node.
1524        let locked = locked.cast_locked::<FileOpsCore>();
1525        let new_node = if mode.is_dir() {
1526            self.ops().mkdir(locked, self, current_task, name, mode, owner)?
1527        } else {
1528            self.ops().mknod(locked, self, current_task, name, mode, dev, owner)?
1529        };
1530
1531        // Allow the LSM to apply a security label to the new node.
1532        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;
1533
1534        Ok(new_node)
1535    }
1536
1537    pub fn create_symlink<L>(
1538        &self,
1539        locked: &mut Locked<L>,
1540        current_task: &CurrentTask,
1541        mount: &MountInfo,
1542        name: &FsStr,
1543        target: &FsStr,
1544        owner: FsCred,
1545    ) -> Result<FsNodeHandle, Errno>
1546    where
1547        L: LockEqualOrBefore<FileOpsCore>,
1548    {
1549        self.check_access(
1550            locked,
1551            current_task,
1552            mount,
1553            Access::WRITE,
1554            CheckAccessReason::InternalPermissionChecks,
1555            security::Auditable::Name(name),
1556        )?;
1557        security::check_fs_node_symlink_access(current_task, self, name, target)?;
1558
1559        let locked = locked.cast_locked::<FileOpsCore>();
1560        let new_node =
1561            self.ops().create_symlink(locked, self, current_task, name, target, owner)?;
1562
1563        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;
1564
1565        Ok(new_node)
1566    }
1567
1568    /// Requests that the LSM initialise a security label for the `new_node`, and optionally provide
1569    /// an extended attribute to write to the file to persist it.  If no LSM is enabled, no extended
1570    /// attribute returned, or if the filesystem does not support extended attributes, then the call
1571    /// returns success. All other failure modes return an `Errno` that should be early-returned.
1572    fn init_new_node_security_on_create<L>(
1573        &self,
1574        locked: &mut Locked<L>,
1575        current_task: &CurrentTask,
1576        new_node: &FsNode,
1577        name: &FsStr,
1578    ) -> Result<(), Errno>
1579    where
1580        L: LockEqualOrBefore<FileOpsCore>,
1581    {
1582        let locked = locked.cast_locked::<FileOpsCore>();
1583        security::fs_node_init_on_create(current_task, &new_node, self, name)?
1584            .map(|xattr| {
1585                match new_node.ops().set_xattr(
1586                    locked,
1587                    &new_node,
1588                    current_task,
1589                    xattr.name,
1590                    xattr.value.as_slice().into(),
1591                    XattrOp::Create,
1592                ) {
1593                    Err(e) => {
1594                        if e.code == ENOTSUP {
1595                            // This should only occur if a task has an "fscreate" context set, and
1596                            // creates a new file in a filesystem that does not support xattrs.
1597                            Ok(())
1598                        } else {
1599                            Err(e)
1600                        }
1601                    }
1602                    result => result,
1603                }
1604            })
1605            .unwrap_or_else(|| Ok(()))
1606    }
1607
1608    pub fn create_tmpfile<L>(
1609        &self,
1610        locked: &mut Locked<L>,
1611        current_task: &CurrentTask,
1612        mount: &MountInfo,
1613        mut mode: FileMode,
1614        mut owner: FsCred,
1615        link_behavior: FsNodeLinkBehavior,
1616    ) -> Result<FsNodeHandle, Errno>
1617    where
1618        L: LockEqualOrBefore<FileOpsCore>,
1619    {
1620        self.check_access(
1621            locked,
1622            current_task,
1623            mount,
1624            Access::WRITE,
1625            CheckAccessReason::InternalPermissionChecks,
1626            security::Auditable::Location(std::panic::Location::caller()),
1627        )?;
1628        self.update_metadata_for_child(current_task, &mut mode, &mut owner);
1629        let node = self.ops().create_tmpfile(self, current_task, mode, owner)?;
1630        self.init_new_node_security_on_create(locked, current_task, &node, "".into())?;
1631        if link_behavior == FsNodeLinkBehavior::Disallowed {
1632            node.ensure_rare_data().link_behavior.set(link_behavior).unwrap();
1633        }
1634        Ok(node)
1635    }
1636
1637    // This method does not attempt to update the atime of the node.
1638    // Use `NamespaceNode::readlink` which checks the mount flags and updates the atime accordingly.
1639    pub fn readlink<L>(
1640        &self,
1641        locked: &mut Locked<L>,
1642        current_task: &CurrentTask,
1643    ) -> Result<SymlinkTarget, Errno>
1644    where
1645        L: LockEqualOrBefore<FileOpsCore>,
1646    {
1647        // TODO: 378864856 - Is there a permission check here other than security checks?
1648        security::check_fs_node_read_link_access(current_task, self)?;
1649        self.ops().readlink(locked.cast_locked::<FileOpsCore>(), self, current_task)
1650    }
1651
1652    pub fn link<L>(
1653        &self,
1654        locked: &mut Locked<L>,
1655        current_task: &CurrentTask,
1656        mount: &MountInfo,
1657        name: &FsStr,
1658        child: &FsNodeHandle,
1659    ) -> Result<FsNodeHandle, Errno>
1660    where
1661        L: LockEqualOrBefore<FileOpsCore>,
1662    {
1663        self.check_access(
1664            locked,
1665            current_task,
1666            mount,
1667            Access::WRITE,
1668            CheckAccessReason::InternalPermissionChecks,
1669            security::Auditable::Location(std::panic::Location::caller()),
1670        )?;
1671
1672        if child.is_dir() {
1673            return error!(EPERM);
1674        }
1675
1676        if let Some(child_rare_data) = child.rare_data.get() {
1677            if matches!(child_rare_data.link_behavior.get(), Some(FsNodeLinkBehavior::Disallowed)) {
1678                return error!(ENOENT);
1679            }
1680        }
1681
1682        // Check that `current_task` has permission to create the hard link.
1683        //
1684        // See description of /proc/sys/fs/protected_hardlinks in
1685        // https://man7.org/linux/man-pages/man5/proc.5.html for details of the security
1686        // vulnerabilities.
1687        //
1688        let (child_uid, mode) = {
1689            let info = child.info();
1690            (info.uid, info.mode)
1691        };
1692        // Check that the the filesystem UID of the calling process (`current_task`) is the same as
1693        // the UID of the existing file. The check can be bypassed if the calling process has
1694        // `CAP_FOWNER` capability.
1695        if child_uid != current_task.current_creds().fsuid
1696            && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
1697        {
1698            // If current_task is not the user of the existing file, it needs to have read and write
1699            // access to the existing file.
1700            child
1701                .check_access(
1702                    locked,
1703                    current_task,
1704                    mount,
1705                    Access::READ | Access::WRITE,
1706                    CheckAccessReason::InternalPermissionChecks,
1707                    security::Auditable::Name(name),
1708                )
1709                .map_err(|e| {
1710                    // `check_access(..)` returns EACCES when the access rights doesn't match - change
1711                    // it to EPERM to match Linux standards.
1712                    if e == EACCES { errno!(EPERM) } else { e }
1713                })?;
1714            // There are also security issues that may arise when users link to setuid, setgid, or
1715            // special files.
1716            if mode.contains(FileMode::ISGID | FileMode::IXGRP) {
1717                return error!(EPERM);
1718            };
1719            if mode.contains(FileMode::ISUID) {
1720                return error!(EPERM);
1721            };
1722            if !mode.contains(FileMode::IFREG) {
1723                return error!(EPERM);
1724            };
1725        }
1726
1727        security::check_fs_node_link_access(current_task, self, child)?;
1728
1729        let locked = locked.cast_locked::<FileOpsCore>();
1730        self.ops().link(locked, self, current_task, name, child)?;
1731        Ok(child.clone())
1732    }
1733
1734    pub fn unlink<L>(
1735        &self,
1736        locked: &mut Locked<L>,
1737        current_task: &CurrentTask,
1738        mount: &MountInfo,
1739        name: &FsStr,
1740        child: &FsNodeHandle,
1741    ) -> Result<(), Errno>
1742    where
1743        L: LockEqualOrBefore<FileOpsCore>,
1744    {
1745        // The user must be able to search and write to the directory.
1746        self.check_access(
1747            locked,
1748            current_task,
1749            mount,
1750            Access::EXEC | Access::WRITE,
1751            CheckAccessReason::InternalPermissionChecks,
1752            security::Auditable::Name(name),
1753        )?;
1754        {
1755            let parent_info = self.info();
1756            // Safe because we acquire the parent directory lock first, and then the child lock
1757            // inside check_sticky_bit. This parent -> child acquisition follows the
1758            // hierarchical lock ordering.
1759            let _token = allow_subclass();
1760            self.check_sticky_bit(current_task, child, &parent_info)?;
1761        }
1762        if child.is_dir() {
1763            security::check_fs_node_rmdir_access(current_task, self, child, name)?;
1764        } else {
1765            security::check_fs_node_unlink_access(current_task, self, child, name)?;
1766        }
1767        let locked = locked.cast_locked::<FileOpsCore>();
1768        self.ops().unlink(locked, self, current_task, name, child)?;
1769        self.update_ctime_mtime();
1770        Ok(())
1771    }
1772
1773    pub fn truncate<L>(
1774        &self,
1775        locked: &mut Locked<L>,
1776        current_task: &CurrentTask,
1777        mount: &MountInfo,
1778        length: u64,
1779    ) -> Result<(), Errno>
1780    where
1781        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1782    {
1783        let mut locked = locked.cast_locked::<BeforeFsNodeAppend>();
1784        if self.is_dir() {
1785            return error!(EISDIR);
1786        }
1787        self.check_access(
1788            &mut locked,
1789            current_task,
1790            mount,
1791            Access::WRITE,
1792            CheckAccessReason::InternalPermissionChecks,
1793            security::Auditable::Location(std::panic::Location::caller()),
1794        )?;
1795
1796        let (guard, locked) = self.ops().append_lock_write(&mut locked, self, current_task)?;
1797        self.truncate_locked(locked, &guard, current_task, length)
1798    }
1799
1800    /// Avoid calling this method directly. You probably want to call `FileObject::ftruncate()`
1801    /// which will also perform all file-descriptor based verifications.
1802    pub fn ftruncate<L>(
1803        &self,
1804        locked: &mut Locked<L>,
1805        current_task: &CurrentTask,
1806        length: u64,
1807    ) -> Result<(), Errno>
1808    where
1809        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1810    {
1811        let locked = locked.cast_locked::<BeforeFsNodeAppend>();
1812
1813        if self.is_dir() {
1814            // When truncating a file descriptor, if the descriptor references a directory,
1815            // return EINVAL. This is different from the truncate() syscall which returns EISDIR.
1816            //
1817            // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#ERRORS
1818            return error!(EINVAL);
1819        }
1820
1821        // For ftruncate, we do not need to check that the file node is writable.
1822        //
1823        // The file object that calls this method must verify that the file was opened
1824        // with write permissions.
1825        //
1826        // This matters because a file could be opened with O_CREAT + O_RDWR + 0444 mode.
1827        // The file descriptor returned from such an operation can be truncated, even
1828        // though the file was created with a read-only mode.
1829        //
1830        // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#DESCRIPTION
1831        // which says:
1832        //
1833        // "With ftruncate(), the file must be open for writing; with truncate(),
1834        // the file must be writable."
1835
1836        let (guard, locked) = self.ops().append_lock_write(locked, self, current_task)?;
1837        self.truncate_locked(locked, &guard, current_task, length)
1838    }
1839
1840    // Called by `truncate` and `ftruncate` above.
1841    pub fn truncate_locked<L>(
1842        &self,
1843        locked: &mut Locked<L>,
1844        guard: &AppendLockWriteGuard<'_>,
1845        current_task: &CurrentTask,
1846        length: u64,
1847    ) -> Result<(), Errno>
1848    where
1849        L: LockEqualOrBefore<FileOpsCore>,
1850    {
1851        let locked = locked.cast_locked::<FileOpsCore>();
1852        if length > MAX_LFS_FILESIZE as u64 {
1853            return error!(EINVAL);
1854        }
1855        if length > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1856            send_standard_signal(locked, current_task, SignalInfo::kernel(SIGXFSZ));
1857            return error!(EFBIG);
1858        }
1859        self.clear_suid_and_sgid_bits(locked, current_task)?;
1860
1861        self.ops().truncate(locked, guard, self, current_task, length)?;
1862        self.update_ctime_mtime();
1863        Ok(())
1864    }
1865
1866    /// Avoid calling this method directly. You probably want to call `FileObject::fallocate()`
1867    /// which will also perform additional verifications.
1868    pub fn fallocate<L>(
1869        &self,
1870        locked: &mut Locked<L>,
1871        current_task: &CurrentTask,
1872        mode: FallocMode,
1873        offset: u64,
1874        length: u64,
1875    ) -> Result<(), Errno>
1876    where
1877        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1878    {
1879        let mut locked = locked.cast_locked::<BeforeFsNodeAppend>();
1880        let (guard, locked) = self.ops().append_lock_write(&mut locked, self, current_task)?;
1881        self.fallocate_locked(locked, &guard, current_task, mode, offset, length)
1882    }
1883
1884    pub fn fallocate_locked<L>(
1885        &self,
1886        locked: &mut Locked<L>,
1887        guard: &AppendLockWriteGuard<'_>,
1888        current_task: &CurrentTask,
1889        mode: FallocMode,
1890        offset: u64,
1891        length: u64,
1892    ) -> Result<(), Errno>
1893    where
1894        L: LockEqualOrBefore<FileOpsCore>,
1895    {
1896        let locked = locked.cast_locked::<FileOpsCore>();
1897        let allocate_size = checked_add_offset_and_length(offset as usize, length as usize)
1898            .map_err(|_| errno!(EFBIG))? as u64;
1899        if allocate_size > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1900            send_standard_signal(locked, current_task, SignalInfo::kernel(SIGXFSZ));
1901            return error!(EFBIG);
1902        }
1903
1904        self.clear_suid_and_sgid_bits(locked, current_task)?;
1905
1906        self.ops().allocate(locked, guard, self, current_task, mode, offset, length)?;
1907        self.update_ctime_mtime();
1908        Ok(())
1909    }
1910
1911    fn update_metadata_for_child(
1912        &self,
1913        current_task: &CurrentTask,
1914        mode: &mut FileMode,
1915        owner: &mut FsCred,
1916    ) {
1917        // The setgid bit on a directory causes the gid to be inherited by new children and the
1918        // setgid bit to be inherited by new child directories. See SetgidDirTest in gvisor.
1919        {
1920            let self_info = self.info();
1921            if self_info.mode.contains(FileMode::ISGID) {
1922                owner.gid = self_info.gid;
1923                if mode.is_dir() {
1924                    *mode |= FileMode::ISGID;
1925                }
1926            }
1927        }
1928
1929        if !mode.is_dir() {
1930            // https://man7.org/linux/man-pages/man7/inode.7.html says:
1931            //
1932            //   For an executable file, the set-group-ID bit causes the
1933            //   effective group ID of a process that executes the file to change
1934            //   as described in execve(2).
1935            //
1936            // We need to check whether the current task has permission to create such a file.
1937            // See a similar check in `FsNode::chmod`.
1938            let current_creds = current_task.current_creds();
1939            if owner.gid != current_creds.fsgid
1940                && !current_creds.is_in_group(owner.gid)
1941                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
1942            {
1943                *mode &= !FileMode::ISGID;
1944            }
1945        }
1946    }
1947
1948    /// Checks if O_NOATIME is allowed,
1949    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
1950        // Per open(2),
1951        //
1952        //   O_NOATIME (since Linux 2.6.8)
1953        //      ...
1954        //
1955        //      This flag can be employed only if one of the following
1956        //      conditions is true:
1957        //
1958        //      *  The effective UID of the process matches the owner UID
1959        //         of the file.
1960        //
1961        //      *  The calling process has the CAP_FOWNER capability in
1962        //         its user namespace and the owner UID of the file has a
1963        //         mapping in the namespace.
1964        if current_task.current_creds().fsuid != self.info().uid {
1965            security::check_task_capable(current_task, CAP_FOWNER)?;
1966        }
1967        Ok(())
1968    }
1969
1970    pub fn default_check_access_impl(
1971        &self,
1972        current_task: &CurrentTask,
1973        permission_flags: security::PermissionFlags,
1974        reason: CheckAccessReason,
1975        info: LockDepReadGuard<'_, FsNodeInfo>,
1976        audit_context: Auditable<'_>,
1977    ) -> Result<(), Errno> {
1978        let (node_uid, node_gid, mode) = (info.uid, info.gid, info.mode);
1979        std::mem::drop(info);
1980        if let CheckAccessReason::ChangeTimestamps { now } = reason {
1981            // To set the timestamps to the current time the caller must either have write access to
1982            // the file, be the file owner, or hold the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
1983            // To set the timestamps to other values the caller must either be the file owner or hold
1984            // the CAP_FOWNER capability.
1985            if current_task.current_creds().fsuid == node_uid {
1986                return Ok(());
1987            }
1988            if now {
1989                if security::is_task_capable_noaudit(current_task, CAP_FOWNER) {
1990                    return Ok(());
1991                }
1992            } else {
1993                security::check_task_capable(current_task, CAP_FOWNER)?;
1994                return Ok(());
1995            }
1996        }
1997        check_access(self, current_task, permission_flags, node_uid, node_gid, mode)?;
1998        security::fs_node_permission(current_task, self, permission_flags, audit_context)
1999    }
2000
2001    /// Check whether the node can be accessed in the current context with the specified access
2002    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
2003    /// owner or is in the file's group.
2004    pub fn check_access<'a, L>(
2005        &self,
2006        locked: &mut Locked<L>,
2007        current_task: &CurrentTask,
2008        mount: &MountInfo,
2009        access: impl Into<security::PermissionFlags>,
2010        reason: CheckAccessReason,
2011        audit_context: impl Into<security::Auditable<'a>>,
2012    ) -> Result<(), Errno>
2013    where
2014        L: LockEqualOrBefore<FileOpsCore>,
2015    {
2016        let mut permission_flags = access.into();
2017        if permission_flags.contains(security::PermissionFlags::WRITE)
2018            && !self.info().mode.is_special()
2019        {
2020            mount.check_readonly_filesystem()?;
2021        }
2022        if permission_flags.contains(security::PermissionFlags::EXEC) && !self.is_dir() {
2023            mount.check_noexec_filesystem()?;
2024        }
2025        if reason == CheckAccessReason::Access {
2026            permission_flags |= PermissionFlags::ACCESS;
2027        }
2028        self.ops().check_access(
2029            locked.cast_locked::<FileOpsCore>(),
2030            self,
2031            current_task,
2032            permission_flags,
2033            &self.info,
2034            reason,
2035            audit_context.into(),
2036        )
2037    }
2038
2039    /// Check whether the stick bit, `S_ISVTX`, forbids the `current_task` from removing the given
2040    /// `child`. If this node has `S_ISVTX`, then either the child must be owned by the `fsuid` of
2041    /// `current_task` or `current_task` must have `CAP_FOWNER`.
2042    pub fn check_sticky_bit(
2043        &self,
2044        current_task: &CurrentTask,
2045        child: &FsNodeHandle,
2046        info: &FsNodeInfo,
2047    ) -> Result<(), Errno> {
2048        if info.mode.contains(FileMode::ISVTX)
2049            && child.info().uid != current_task.current_creds().fsuid
2050        {
2051            security::check_task_capable(current_task, CAP_FOWNER)?;
2052        }
2053        Ok(())
2054    }
2055
2056    pub fn fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
2057        assert!(self.is_fifo());
2058        self.ensure_rare_data().ensure_fifo(current_task)
2059    }
2060
2061    /// Returns the UNIX domain socket bound to this node, if any.
2062    pub fn bound_socket(&self) -> Option<&SocketHandle> {
2063        if let Some(rare_data) = self.rare_data.get() { rare_data.bound_socket.get() } else { None }
2064    }
2065
    /// Register the provided socket as the UNIX domain socket bound to this node.
    ///
    /// The socket is stored in the node's rare data, which is created on demand.
    ///
    /// # Panics
    ///
    /// It is a fatal error to call this method again if it has already been called on this node:
    /// the assertion below fails when a socket is already set.
    pub fn set_bound_socket(&self, socket: SocketHandle) {
        assert!(self.ensure_rare_data().bound_socket.set(socket).is_ok());
    }
2072
2073    pub fn update_attributes<L, F>(
2074        &self,
2075        locked: &mut Locked<L>,
2076        current_task: &CurrentTask,
2077        mutator: F,
2078    ) -> Result<(), Errno>
2079    where
2080        L: LockEqualOrBefore<FileOpsCore>,
2081        F: FnOnce(&mut FsNodeInfo) -> Result<(), Errno>,
2082    {
2083        let mut info = self.info.write();
2084        let mut new_info = info.clone();
2085        mutator(&mut new_info)?;
2086
2087        let new_access = new_info.mode.user_access()
2088            | new_info.mode.group_access()
2089            | new_info.mode.other_access();
2090
2091        if new_access.intersects(Access::EXEC) {
2092            let write_guard_state = self.write_guard_state.lock();
2093            if let Ok(seals) = write_guard_state.get_seals() {
2094                if seals.contains(SealFlags::NO_EXEC) {
2095                    return error!(EPERM);
2096                }
2097            }
2098        }
2099
2100        // `mutator`s should not update the attribute change time, which is managed by this API.
2101        assert_eq!(info.time_status_change, new_info.time_status_change);
2102        if *info == new_info {
2103            return Ok(());
2104        }
2105        new_info.time_status_change = utc::utc_now();
2106
2107        let mut has = zxio_node_attr_has_t { ..Default::default() };
2108        has.modification_time = info.time_modify != new_info.time_modify;
2109        has.access_time = info.time_access != new_info.time_access;
2110        has.mode = info.mode != new_info.mode;
2111        has.uid = info.uid != new_info.uid;
2112        has.gid = info.gid != new_info.gid;
2113        has.rdev = info.rdev != new_info.rdev;
2114        has.casefold = info.casefold != new_info.casefold;
2115        has.wrapping_key_id = info.wrapping_key_id != new_info.wrapping_key_id;
2116
2117        security::check_fs_node_setattr_access(current_task, &self, &has)?;
2118
2119        // Call `update_attributes(..)` to persist the changes for the following fields.
2120        if has.modification_time
2121            || has.access_time
2122            || has.mode
2123            || has.uid
2124            || has.gid
2125            || has.rdev
2126            || has.casefold
2127            || has.wrapping_key_id
2128        {
2129            let locked = locked.cast_locked::<FileOpsCore>();
2130            self.ops().update_attributes(locked, self, current_task, &new_info, has)?;
2131        }
2132
2133        *info = new_info;
2134        Ok(())
2135    }
2136
2137    /// Set the permissions on this FsNode to the given values.
2138    ///
2139    /// Does not change the IFMT of the node.
2140    pub fn chmod<L>(
2141        &self,
2142        locked: &mut Locked<L>,
2143        current_task: &CurrentTask,
2144        mount: &MountInfo,
2145        mut mode: FileMode,
2146    ) -> Result<(), Errno>
2147    where
2148        L: LockEqualOrBefore<FileOpsCore>,
2149    {
2150        mount.check_readonly_filesystem()?;
2151        self.update_attributes(locked, current_task, |info| {
2152            let current_creds = current_task.current_creds();
2153            if info.uid != current_creds.euid {
2154                security::check_task_capable(current_task, CAP_FOWNER)?;
2155            } else if info.gid != current_creds.egid
2156                && !current_creds.is_in_group(info.gid)
2157                && mode.intersects(FileMode::ISGID)
2158                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
2159            {
2160                mode &= !FileMode::ISGID;
2161            }
2162            info.chmod(mode);
2163            Ok(())
2164        })
2165    }
2166
2167    /// Sets the owner and/or group on this FsNode.
2168    pub fn chown<L>(
2169        &self,
2170        locked: &mut Locked<L>,
2171        current_task: &CurrentTask,
2172        mount: &MountInfo,
2173        owner: Option<uid_t>,
2174        group: Option<gid_t>,
2175    ) -> Result<(), Errno>
2176    where
2177        L: LockEqualOrBefore<FileOpsCore>,
2178    {
2179        mount.check_readonly_filesystem()?;
2180        self.update_attributes(locked, current_task, |info| {
2181            if security::is_task_capable_noaudit(current_task, CAP_CHOWN) {
2182                info.chown(owner, group);
2183                return Ok(());
2184            }
2185
2186            // Nobody can change the owner.
2187            if let Some(uid) = owner {
2188                if info.uid != uid {
2189                    return error!(EPERM);
2190                }
2191            }
2192
2193            let (euid, is_in_group) = {
2194                let current_creds = current_task.current_creds();
2195                (current_creds.euid, group.map(|gid| current_creds.is_in_group(gid)))
2196            };
2197
2198            // The owner can change the group.
2199            if info.uid == euid {
2200                // To a group that it belongs.
2201                if let Some(is_in_group) = is_in_group {
2202                    if !is_in_group {
2203                        return error!(EPERM);
2204                    }
2205                }
2206                info.chown(None, group);
2207                return Ok(());
2208            }
2209
2210            // Any other user can call chown(file, -1, -1)
2211            if owner.is_some() || group.is_some() {
2212                return error!(EPERM);
2213            }
2214
2215            // But not on set-user-ID or set-group-ID files.
2216            // If we were to chown them, they would drop the set-ID bit.
2217            if info.mode.is_reg()
2218                && (info.mode.contains(FileMode::ISUID)
2219                    || info.mode.contains(FileMode::ISGID | FileMode::IXGRP))
2220            {
2221                return error!(EPERM);
2222            }
2223
2224            info.chown(None, None);
2225            Ok(())
2226        })
2227    }
2228
2229    /// Forcefully change the owner and group of this node.
2230    ///
2231    /// # Safety
2232    ///
2233    /// This function skips all the security checks and just updates the owner and group. Also, does
2234    /// not check if the filesystem is read-only and does not update the attribute change time.
2235    ///
2236    /// This function is used to set the owner and group of /proc/pid to the credentials of the
2237    /// current task. Please consider carefully whether you want to use this function for another
2238    /// purpose.
2239    pub unsafe fn force_chown(&self, creds: FsCred) {
2240        self.update_info(|info| {
2241            info.chown(Some(creds.uid), Some(creds.gid));
2242        });
2243    }
2244
2245    /// Whether this node is a regular file.
2246    pub fn is_reg(&self) -> bool {
2247        self.info().mode.is_reg()
2248    }
2249
2250    /// Whether this node is a directory.
2251    pub fn is_dir(&self) -> bool {
2252        self.info().mode.is_dir()
2253    }
2254
2255    /// Whether this node is a socket.
2256    pub fn is_sock(&self) -> bool {
2257        self.info().mode.is_sock()
2258    }
2259
2260    /// Whether this node is a FIFO.
2261    pub fn is_fifo(&self) -> bool {
2262        self.info().mode.is_fifo()
2263    }
2264
2265    /// Whether this node is a symbolic link.
2266    pub fn is_lnk(&self) -> bool {
2267        self.info().mode.is_lnk()
2268    }
2269
2270    pub fn dev(&self) -> DeviceId {
2271        self.fs().dev_id
2272    }
2273
2274    pub fn stat<L>(
2275        &self,
2276        locked: &mut Locked<L>,
2277        current_task: &CurrentTask,
2278    ) -> Result<uapi::stat, Errno>
2279    where
2280        L: LockEqualOrBefore<FileOpsCore>,
2281    {
2282        security::check_fs_node_getattr_access(current_task, self)?;
2283
2284        let info = self.fetch_and_refresh_info(locked, current_task)?;
2285
2286        let time_to_kernel_timespec_pair = |t| {
2287            let timespec { tv_sec, tv_nsec } = timespec_from_time(t);
2288            let time = tv_sec.try_into().map_err(|_| errno!(EINVAL))?;
2289            let time_nsec = tv_nsec.try_into().map_err(|_| errno!(EINVAL))?;
2290            Ok((time, time_nsec))
2291        };
2292
2293        let (st_atime, st_atime_nsec) = time_to_kernel_timespec_pair(info.time_access)?;
2294        let (st_mtime, st_mtime_nsec) = time_to_kernel_timespec_pair(info.time_modify)?;
2295        let (st_ctime, st_ctime_nsec) = time_to_kernel_timespec_pair(info.time_status_change)?;
2296
2297        Ok(uapi::stat {
2298            st_dev: self.dev().bits(),
2299            st_ino: self.ino,
2300            st_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2301            st_mode: info.mode.bits(),
2302            st_uid: info.uid,
2303            st_gid: info.gid,
2304            st_rdev: info.rdev.bits(),
2305            st_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2306            st_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2307            st_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2308            st_atime,
2309            st_atime_nsec,
2310            st_mtime,
2311            st_mtime_nsec,
2312            st_ctime,
2313            st_ctime_nsec,
2314            ..Default::default()
2315        })
2316    }
2317
2318    /// Returns the current size of the file.  This is inherently racy, so any caller that
2319    /// might want to use the value returned should hold their own locks if necessary.  For
2320    /// example, if using the value here to implement append (which is the case at the time
2321    /// of writing this comment), locks must be held to prevent the file size being changed
2322    /// concurrently.
2323    // TODO(https://fxbug.dev/454730248): This is probably the wrong way to implement O_APPEND.
2324    pub fn get_size<L>(
2325        &self,
2326        locked: &mut Locked<L>,
2327        current_task: &CurrentTask,
2328    ) -> Result<usize, Errno>
2329    where
2330        L: LockEqualOrBefore<FileOpsCore>,
2331    {
2332        self.ops().get_size(locked.cast_locked::<FileOpsCore>(), self, current_task)
2333    }
2334
2335    fn statx_timestamp_from_time(time: UtcInstant) -> statx_timestamp {
2336        let nanos = time.into_nanos();
2337        statx_timestamp {
2338            tv_sec: nanos / NANOS_PER_SECOND,
2339            tv_nsec: (nanos % NANOS_PER_SECOND) as u32,
2340            ..Default::default()
2341        }
2342    }
2343
2344    pub fn statx<L>(
2345        &self,
2346        locked: &mut Locked<L>,
2347        current_task: &CurrentTask,
2348        flags: StatxFlags,
2349        mask: u32,
2350    ) -> Result<statx, Errno>
2351    where
2352        L: LockEqualOrBefore<FileOpsCore>,
2353    {
2354        security::check_fs_node_getattr_access(current_task, self)?;
2355
2356        // Ignore mask for now and fill in all of the fields.
2357        let info = if flags.contains(StatxFlags::AT_STATX_DONT_SYNC) {
2358            self.info()
2359        } else {
2360            self.fetch_and_refresh_info(locked, current_task)?
2361        };
2362        if mask & STATX__RESERVED == STATX__RESERVED {
2363            return error!(EINVAL);
2364        }
2365
2366        track_stub!(TODO("https://fxbug.dev/302594110"), "statx attributes");
2367        let stx_mnt_id = 0;
2368        let mut stx_attributes = 0;
2369        let stx_attributes_mask = STATX_ATTR_VERITY as u64;
2370
2371        if matches!(*self.fsverity.lock(), FsVerityState::FsVerity) {
2372            stx_attributes |= STATX_ATTR_VERITY as u64;
2373        }
2374
2375        Ok(statx {
2376            stx_mask: STATX_NLINK
2377                | STATX_UID
2378                | STATX_GID
2379                | STATX_ATIME
2380                | STATX_MTIME
2381                | STATX_CTIME
2382                | STATX_INO
2383                | STATX_SIZE
2384                | STATX_BLOCKS
2385                | STATX_BASIC_STATS,
2386            stx_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2387            stx_attributes,
2388            stx_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2389            stx_uid: info.uid,
2390            stx_gid: info.gid,
2391            stx_mode: info.mode.bits().try_into().map_err(|_| errno!(EINVAL))?,
2392            stx_ino: self.ino,
2393            stx_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2394            stx_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2395            stx_attributes_mask,
2396            stx_ctime: Self::statx_timestamp_from_time(info.time_status_change),
2397            stx_mtime: Self::statx_timestamp_from_time(info.time_modify),
2398            stx_atime: Self::statx_timestamp_from_time(info.time_access),
2399
2400            stx_rdev_major: info.rdev.major(),
2401            stx_rdev_minor: info.rdev.minor(),
2402
2403            stx_dev_major: self.fs().dev_id.major(),
2404            stx_dev_minor: self.fs().dev_id.minor(),
2405            stx_mnt_id,
2406            ..Default::default()
2407        })
2408    }
2409
2410    /// Checks whether `current_task` has capabilities required for the specified `access` to the
2411    /// extended attribute `name`.
2412    fn check_xattr_access<L>(
2413        &self,
2414        locked: &mut Locked<L>,
2415        current_task: &CurrentTask,
2416        mount: &MountInfo,
2417        name: &FsStr,
2418        access: Access,
2419    ) -> Result<(), Errno>
2420    where
2421        L: LockEqualOrBefore<FileOpsCore>,
2422    {
2423        assert!(access == Access::READ || access == Access::WRITE);
2424
2425        let enodata_if_read =
2426            |e: Errno| if access == Access::READ && e.code == EPERM { errno!(ENODATA) } else { e };
2427
2428        // man xattr(7) describes the different access checks applied to each extended attribute
2429        // namespace.
2430        if name.starts_with(XATTR_USER_PREFIX.to_bytes()) {
2431            {
2432                let info = self.info();
2433                if !info.mode.is_reg() && !info.mode.is_dir() {
2434                    return Err(enodata_if_read(errno!(EPERM)));
2435                }
2436            }
2437
2438            // TODO: https://fxbug.dev/460734830 - Perform capability check(s) if file has sticky
2439            // bit set.
2440
2441            self.check_access(
2442                locked,
2443                current_task,
2444                mount,
2445                access,
2446                CheckAccessReason::InternalPermissionChecks,
2447                security::Auditable::Name(name),
2448            )?;
2449        } else if name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()) {
2450            // Trusted extended attributes require `CAP_SYS_ADMIN` to read or write.
2451            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2452        } else if name.starts_with(XATTR_SYSTEM_PREFIX.to_bytes()) {
2453            // System extended attributes have attribute-specific access policy.
2454            // TODO: https://fxbug.dev/460734830 -  Revise how system extended attributes are
2455            // access-controlled.
2456            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2457        } else if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2458            if access == Access::WRITE {
2459                // Writes require `CAP_SYS_ADMIN`, unless the LSM owning `name` specifies to skip.
2460                if !security::fs_node_xattr_skipcap(current_task, name) {
2461                    security::check_task_capable(current_task, CAP_SYS_ADMIN)
2462                        .map_err(enodata_if_read)?;
2463                }
2464            }
2465        } else {
2466            panic!("Unknown extended attribute prefix: {}", name);
2467        }
2468        Ok(())
2469    }
2470
2471    pub fn get_xattr<L>(
2472        &self,
2473        locked: &mut Locked<L>,
2474        current_task: &CurrentTask,
2475        mount: &MountInfo,
2476        name: &FsStr,
2477        max_size: usize,
2478    ) -> Result<ValueOrSize<FsString>, Errno>
2479    where
2480        L: LockEqualOrBefore<FileOpsCore>,
2481    {
2482        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2483        self.check_xattr_access(locked, current_task, mount, name, Access::READ)?;
2484
2485        // LSM access checks must be performed after discretionary checks.
2486        security::check_fs_node_getxattr_access(current_task, self, name)?;
2487
2488        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2489            // If the attribute is in the security.* domain then allow the LSM to handle the
2490            // request, or to delegate to `FsNodeOps::get_xattr()`.
2491            security::fs_node_getsecurity(locked, current_task, self, name, max_size)
2492        } else {
2493            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2494            self.ops().get_xattr(
2495                locked.cast_locked::<FileOpsCore>(),
2496                self,
2497                current_task,
2498                name,
2499                max_size,
2500            )
2501        }
2502    }
2503
2504    pub fn set_xattr<L>(
2505        &self,
2506        locked: &mut Locked<L>,
2507        current_task: &CurrentTask,
2508        mount: &MountInfo,
2509        name: &FsStr,
2510        value: &FsStr,
2511        op: XattrOp,
2512    ) -> Result<(), Errno>
2513    where
2514        L: LockEqualOrBefore<FileOpsCore>,
2515    {
2516        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2517        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2518
2519        // LSM access checks must be performed after discretionary checks.
2520        security::check_fs_node_setxattr_access(current_task, self, name, value, op)?;
2521
2522        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2523            // If the attribute is in the security.* domain then allow the LSM to handle the
2524            // request, or to delegate to `FsNodeOps::set_xattr()`.
2525            security::fs_node_setsecurity(locked, current_task, self, name, value, op)
2526        } else {
2527            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2528            self.ops().set_xattr(
2529                locked.cast_locked::<FileOpsCore>(),
2530                self,
2531                current_task,
2532                name,
2533                value,
2534                op,
2535            )
2536        }
2537    }
2538
2539    pub fn remove_xattr<L>(
2540        &self,
2541        locked: &mut Locked<L>,
2542        current_task: &CurrentTask,
2543        mount: &MountInfo,
2544        name: &FsStr,
2545    ) -> Result<(), Errno>
2546    where
2547        L: LockEqualOrBefore<FileOpsCore>,
2548    {
2549        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2550        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2551
2552        // LSM access checks must be performed after discretionary checks.
2553        security::check_fs_node_removexattr_access(current_task, self, name)?;
2554        self.ops().remove_xattr(locked.cast_locked::<FileOpsCore>(), self, current_task, name)
2555    }
2556
2557    pub fn list_xattrs<L>(
2558        &self,
2559        locked: &mut Locked<L>,
2560        current_task: &CurrentTask,
2561        max_size: usize,
2562    ) -> Result<ValueOrSize<Vec<FsString>>, Errno>
2563    where
2564        L: LockEqualOrBefore<FileOpsCore>,
2565    {
2566        security::check_fs_node_listxattr_access(current_task, self)?;
2567        Ok(self
2568            .ops()
2569            .list_xattrs(locked.cast_locked::<FileOpsCore>(), self, current_task, max_size)?
2570            .map(|mut v| {
2571                // Extended attributes may be listed even if the caller would not be able to read
2572                // (or modify) the attribute's value.
2573                // trusted.* attributes are only accessible with CAP_SYS_ADMIN and are omitted by
2574                // `listxattr()` unless the caller holds CAP_SYS_ADMIN.
2575                if !security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN) {
2576                    v.retain(|name| !name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()));
2577                }
2578                v
2579            }))
2580    }
2581
    /// Returns current `FsNodeInfo`.
    ///
    /// This takes the read side of the `info` lock and returns the guard. Nothing is fetched
    /// from the node's ops here; see `fetch_and_refresh_info()` for that.
    pub fn info(&self) -> LockDepReadGuard<'_, FsNodeInfo> {
        self.info.read()
    }
2586
    /// Returns a reference to the `info` lock itself.
    ///
    /// No lock is acquired by this call; the caller decides how and when to lock.
    ///
    /// This should ONLY be used by `RenameGuard` to perform ordered write locking on independent
    /// nodes.
    pub(super) fn info_lock(&self) -> &DynamicLockDepRwLock<FsNodeInfo> {
        &self.info
    }
2594
2595    /// Refreshes the `FsNodeInfo` if necessary and returns a read guard.
2596    pub fn fetch_and_refresh_info<L>(
2597        &self,
2598        locked: &mut Locked<L>,
2599        current_task: &CurrentTask,
2600    ) -> Result<LockDepReadGuard<'_, FsNodeInfo>, Errno>
2601    where
2602        L: LockEqualOrBefore<FileOpsCore>,
2603    {
2604        self.ops().fetch_and_refresh_info(
2605            locked.cast_locked::<FileOpsCore>(),
2606            self,
2607            current_task,
2608            &self.info,
2609        )
2610    }
2611
2612    pub fn update_info<F, T>(&self, mutator: F) -> T
2613    where
2614        F: FnOnce(&mut FsNodeInfo) -> T,
2615    {
2616        let mut info = self.info.write();
2617        mutator(&mut info)
2618    }
2619
2620    /// Clear the SUID and SGID bits unless the `current_task` has `CAP_FSETID`
2621    pub fn clear_suid_and_sgid_bits<L>(
2622        &self,
2623        locked: &mut Locked<L>,
2624        current_task: &CurrentTask,
2625    ) -> Result<(), Errno>
2626    where
2627        L: LockEqualOrBefore<FileOpsCore>,
2628    {
2629        if !security::is_task_capable_noaudit(current_task, CAP_FSETID) {
2630            self.update_attributes(locked, current_task, |info| {
2631                info.clear_suid_and_sgid_bits();
2632                Ok(())
2633            })?;
2634        }
2635        Ok(())
2636    }
2637
2638    /// Update the ctime and mtime of a file to now.
2639    pub fn update_ctime_mtime(&self) {
2640        if self.fs().manages_timestamps() {
2641            return;
2642        }
2643        self.update_info(|info| {
2644            let now = utc::utc_now();
2645            info.time_status_change = now;
2646            info.time_modify = now;
2647        });
2648    }
2649
2650    /// Update the ctime of a file to now.
2651    pub fn update_ctime(&self) {
2652        if self.fs().manages_timestamps() {
2653            return;
2654        }
2655        self.update_info(|info| {
2656            let now = utc::utc_now();
2657            info.time_status_change = now;
2658        });
2659    }
2660
2661    /// Update the atime and mtime if the `current_task` has write access, is the file owner, or
2662    /// holds either the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
2663    pub fn update_atime_mtime<L>(
2664        &self,
2665        locked: &mut Locked<L>,
2666        current_task: &CurrentTask,
2667        mount: &MountInfo,
2668        atime: TimeUpdateType,
2669        mtime: TimeUpdateType,
2670    ) -> Result<(), Errno>
2671    where
2672        L: LockEqualOrBefore<FileOpsCore>,
2673    {
2674        // If the filesystem is read-only, this always fail.
2675        mount.check_readonly_filesystem()?;
2676
2677        let now = matches!((atime, mtime), (TimeUpdateType::Now, TimeUpdateType::Now));
2678        self.check_access(
2679            locked,
2680            current_task,
2681            mount,
2682            Access::WRITE,
2683            CheckAccessReason::ChangeTimestamps { now },
2684            security::Auditable::Location(std::panic::Location::caller()),
2685        )?;
2686
2687        if !matches!((atime, mtime), (TimeUpdateType::Omit, TimeUpdateType::Omit)) {
2688            // This function is called by `utimes(..)` which will update the access and
2689            // modification time. We need to call `update_attributes()` to update the mtime of
2690            // filesystems that manages file timestamps.
2691            self.update_attributes(locked, current_task, |info| {
2692                let now = utc::utc_now();
2693                let get_time = |time: TimeUpdateType| match time {
2694                    TimeUpdateType::Now => Some(now),
2695                    TimeUpdateType::Time(t) => Some(t),
2696                    TimeUpdateType::Omit => None,
2697                };
2698                if let Some(time) = get_time(atime) {
2699                    info.time_access = time;
2700                }
2701                if let Some(time) = get_time(mtime) {
2702                    info.time_modify = time;
2703                }
2704                Ok(())
2705            })?;
2706        }
2707        Ok(())
2708    }
2709
    /// The key used to identify this node in the file system's node cache.
    ///
    /// For many file systems, this will be the same as the inode number. However, some file
    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
    ///
    /// The value is supplied by this node's ops, which are passed the node itself.
    pub fn node_key(&self) -> ino_t {
        self.ops().node_key(self)
    }
2717
    /// Returns this node's `FsNodeRareData`, creating it on first use.
    ///
    /// If the rare data has not been initialized yet, a boxed `FsNodeRareData::default()` is
    /// installed first.
    fn ensure_rare_data(&self) -> &FsNodeRareData {
        self.rare_data.get_or_init(|| Box::new(FsNodeRareData::default()))
    }
2721
    /// Returns the set of watchers for this node.
    ///
    /// Only call this function if you require this node to actually store a list of watchers. If
    /// you just wish to notify any watchers that might exist, please use `notify` instead.
    pub fn ensure_watchers(&self) -> &inotify::InotifyWatchers {
        // Initializes the node's rare data if it does not exist yet.
        &self.ensure_rare_data().watchers
    }
2729
2730    /// Notify the watchers of the given event.
2731    pub fn notify(
2732        &self,
2733        event_mask: InotifyMask,
2734        cookie: u32,
2735        name: &FsStr,
2736        mode: FileMode,
2737        is_dead: bool,
2738    ) {
2739        if let Some(rare_data) = self.rare_data.get() {
2740            rare_data.watchers.notify(event_mask, cookie, name, mode, is_dead);
2741        }
2742    }
2743
2744    /// Calls through to the filesystem to enable fs-verity on this file.
2745    pub fn enable_fsverity<L>(
2746        &self,
2747        locked: &mut Locked<L>,
2748        current_task: &CurrentTask,
2749        descriptor: &fsverity_descriptor,
2750    ) -> Result<(), Errno>
2751    where
2752        L: LockEqualOrBefore<FileOpsCore>,
2753    {
2754        let locked = locked.cast_locked::<FileOpsCore>();
2755        self.ops().enable_fsverity(locked, self, current_task, descriptor)
2756    }
2757}
2758
2759impl std::fmt::Debug for FsNode {
2760    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2761        f.debug_struct("FsNode")
2762            .field("fs", &self.fs().name())
2763            .field("info", &*self.info())
2764            .field("ops_ty", &self.ops().type_name())
2765            .finish()
2766    }
2767}
2768
2769impl Releasable for FsNode {
2770    type Context<'a> = CurrentTaskAndLocked<'a>;
2771
2772    fn release<'a>(self, context: CurrentTaskAndLocked<'a>) {
2773        let (locked, current_task) = context;
2774        if let Some(fs) = self.fs.upgrade() {
2775            fs.remove_node(&self);
2776        }
2777        if let Err(err) = self.ops.forget(
2778            locked.cast_locked::<FileOpsCore>(),
2779            current_task,
2780            self.info.into_inner(),
2781        ) {
2782            log_error!("Error on FsNodeOps::forget: {err:?}");
2783        }
2784    }
2785}
2786
2787fn check_access(
2788    fs_node: &FsNode,
2789    current_task: &CurrentTask,
2790    permission_flags: security::PermissionFlags,
2791    node_uid: uid_t,
2792    node_gid: gid_t,
2793    mode: FileMode,
2794) -> Result<(), Errno> {
2795    // Determine which of the access bits apply to the `current_task`.
2796    let (fsuid, is_in_group) = {
2797        let current_creds = current_task.current_creds();
2798        (current_creds.fsuid, current_creds.is_in_group(node_gid))
2799    };
2800    let granted = if fsuid == node_uid {
2801        mode.user_access()
2802    } else if is_in_group {
2803        mode.group_access()
2804    } else {
2805        mode.other_access()
2806    };
2807
2808    let access = permission_flags.as_access();
2809    if granted.contains(access) {
2810        return Ok(());
2811    }
2812
2813    // Callers with CAP_DAC_READ_SEARCH override can read files & directories, and traverse
2814    // directories to which they lack permission.
2815    let mut requested = access & !granted;
2816
2817    // If this check was triggered by `access()`, or a variant, then check for a `dontaudit`
2818    // statement for the `audit_access` permission for this caller & file.
2819    let have_dont_audit = OnceBool::new();
2820    let has_capability = move |current_task, capability| {
2821        let dont_audit = have_dont_audit.get_or_init(|| {
2822            permission_flags.contains(PermissionFlags::ACCESS)
2823                && security::has_dontaudit_access(current_task, fs_node)
2824        });
2825        if dont_audit {
2826            security::is_task_capable_noaudit(current_task, capability)
2827        } else {
2828            security::check_task_capable(current_task, capability).is_ok()
2829        }
2830    };
2831
2832    // CAP_DAC_READ_SEARCH allows bypass of read checks, and directory traverse (eXecute) checks.
2833    let dac_read_search_access =
2834        if mode.is_dir() { Access::READ | Access::EXEC } else { Access::READ };
2835    if dac_read_search_access.intersects(requested)
2836        && has_capability(current_task, CAP_DAC_READ_SEARCH)
2837    {
2838        requested.remove(dac_read_search_access);
2839    }
2840    if requested.is_empty() {
2841        return Ok(());
2842    }
2843
2844    // CAP_DAC_OVERRIDE allows bypass of all checks (though see the comment for file-execute).
2845    let mut dac_override_access = Access::READ | Access::WRITE;
2846    dac_override_access |= if mode.is_dir() {
2847        Access::EXEC
2848    } else {
2849        // File execute access checks may not be bypassed unless at least one executable bit is set.
2850        (mode.user_access() | mode.group_access() | mode.other_access()) & Access::EXEC
2851    };
2852    if dac_override_access.intersects(requested) && has_capability(current_task, CAP_DAC_OVERRIDE) {
2853        requested.remove(dac_override_access);
2854    }
2855    if requested.is_empty() {
2856        return Ok(());
2857    }
2858
2859    return error!(EACCES);
2860}
2861
2862#[cfg(test)]
2863mod tests {
2864    use super::*;
2865    use crate::device::mem::mem_device_init;
2866    use crate::testing::*;
2867    use crate::vfs::buffers::VecOutputBuffer;
2868    use starnix_uapi::auth::Credentials;
2869    use starnix_uapi::file_mode::mode;
2870
2871    #[::fuchsia::test]
2872    async fn open_device_file() {
2873        spawn_kernel_and_run(async |locked, current_task| {
2874            mem_device_init(locked, &*current_task).expect("mem_device_init");
2875
2876            // Create a device file that points to the `zero` device (which is automatically
2877            // registered in the kernel).
2878            current_task
2879                .fs()
2880                .root()
2881                .create_node(
2882                    locked,
2883                    &current_task,
2884                    "zero".into(),
2885                    mode!(IFCHR, 0o666),
2886                    DeviceId::ZERO,
2887                )
2888                .expect("create_node");
2889
2890            const CONTENT_LEN: usize = 10;
2891            let mut buffer = VecOutputBuffer::new(CONTENT_LEN);
2892
2893            // Read from the zero device.
2894            let device_file = current_task
2895                .open_file(locked, "zero".into(), OpenFlags::RDONLY)
2896                .expect("open device file");
2897            device_file.read(locked, &current_task, &mut buffer).expect("read from zero");
2898
2899            // Assert the contents.
2900            assert_eq!(&[0; CONTENT_LEN], buffer.data());
2901        })
2902        .await;
2903    }
2904
2905    #[::fuchsia::test]
2906    async fn node_info_is_reflected_in_stat() {
2907        spawn_kernel_and_run(async |locked, current_task| {
2908            // Create a node.
2909            let node = &current_task
2910                .fs()
2911                .root()
2912                .create_node(locked, &current_task, "zero".into(), FileMode::IFCHR, DeviceId::ZERO)
2913                .expect("create_node")
2914                .entry
2915                .node;
2916            node.update_info(|info| {
2917                info.mode = FileMode::IFSOCK;
2918                info.size = 1;
2919                info.blocks = 2;
2920                info.blksize = 4;
2921                info.uid = 9;
2922                info.gid = 10;
2923                info.link_count = 11;
2924                info.time_status_change = UtcInstant::from_nanos(1);
2925                info.time_access = UtcInstant::from_nanos(2);
2926                info.time_modify = UtcInstant::from_nanos(3);
2927                info.rdev = DeviceId::new(13, 13);
2928            });
2929            let stat = node.stat(locked, &current_task).expect("stat");
2930
2931            assert_eq!(stat.st_mode, FileMode::IFSOCK.bits());
2932            assert_eq!(stat.st_size, 1);
2933            assert_eq!(stat.st_blksize, 4);
2934            assert_eq!(stat.st_blocks, 2);
2935            assert_eq!(stat.st_uid, 9);
2936            assert_eq!(stat.st_gid, 10);
2937            assert_eq!(stat.st_nlink, 11);
2938            assert_eq!(stat.st_ctime, 0);
2939            assert_eq!(stat.st_ctime_nsec, 1);
2940            assert_eq!(stat.st_atime, 0);
2941            assert_eq!(stat.st_atime_nsec, 2);
2942            assert_eq!(stat.st_mtime, 0);
2943            assert_eq!(stat.st_mtime_nsec, 3);
2944            assert_eq!(stat.st_rdev, DeviceId::new(13, 13).bits());
2945        })
2946        .await;
2947    }
2948
2949    #[::fuchsia::test]
2950    fn test_flock_operation() {
2951        assert!(FlockOperation::from_flags(0).is_err());
2952        assert!(FlockOperation::from_flags(u32::MAX).is_err());
2953
2954        let operation1 = FlockOperation::from_flags(LOCK_SH).expect("from_flags");
2955        assert!(!operation1.is_unlock());
2956        assert!(!operation1.is_lock_exclusive());
2957        assert!(operation1.is_blocking());
2958
2959        let operation2 = FlockOperation::from_flags(LOCK_EX | LOCK_NB).expect("from_flags");
2960        assert!(!operation2.is_unlock());
2961        assert!(operation2.is_lock_exclusive());
2962        assert!(!operation2.is_blocking());
2963
2964        let operation3 = FlockOperation::from_flags(LOCK_UN).expect("from_flags");
2965        assert!(operation3.is_unlock());
2966        assert!(!operation3.is_lock_exclusive());
2967        assert!(operation3.is_blocking());
2968    }
2969
2970    #[::fuchsia::test]
2971    async fn test_check_access() {
2972        spawn_kernel_and_run(async |locked, current_task| {
2973            let mut creds = Credentials::with_ids(1, 2);
2974            creds.groups = vec![3, 4];
2975            current_task.set_creds(creds);
2976
2977            // Create a node.
2978            let node = &current_task
2979                .fs()
2980                .root()
2981                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceId::NONE)
2982                .expect("create_node")
2983                .entry
2984                .node;
2985            let check_access = |locked: &mut Locked<Unlocked>,
2986                                uid: uid_t,
2987                                gid: gid_t,
2988                                perm: u32,
2989                                access: Access| {
2990                node.update_info(|info| {
2991                    info.mode = mode!(IFREG, perm);
2992                    info.uid = uid;
2993                    info.gid = gid;
2994                });
2995                node.check_access(
2996                    locked,
2997                    &current_task,
2998                    &MountInfo::detached(),
2999                    access,
3000                    CheckAccessReason::InternalPermissionChecks,
3001                    security::Auditable::Location(std::panic::Location::caller()),
3002                )
3003            };
3004
3005            assert_eq!(check_access(locked, 0, 0, 0o700, Access::EXEC), error!(EACCES));
3006            assert_eq!(check_access(locked, 0, 0, 0o700, Access::READ), error!(EACCES));
3007            assert_eq!(check_access(locked, 0, 0, 0o700, Access::WRITE), error!(EACCES));
3008
3009            assert_eq!(check_access(locked, 0, 0, 0o070, Access::EXEC), error!(EACCES));
3010            assert_eq!(check_access(locked, 0, 0, 0o070, Access::READ), error!(EACCES));
3011            assert_eq!(check_access(locked, 0, 0, 0o070, Access::WRITE), error!(EACCES));
3012
3013            assert_eq!(check_access(locked, 0, 0, 0o007, Access::EXEC), Ok(()));
3014            assert_eq!(check_access(locked, 0, 0, 0o007, Access::READ), Ok(()));
3015            assert_eq!(check_access(locked, 0, 0, 0o007, Access::WRITE), Ok(()));
3016
3017            assert_eq!(check_access(locked, 1, 0, 0o700, Access::EXEC), Ok(()));
3018            assert_eq!(check_access(locked, 1, 0, 0o700, Access::READ), Ok(()));
3019            assert_eq!(check_access(locked, 1, 0, 0o700, Access::WRITE), Ok(()));
3020
3021            assert_eq!(check_access(locked, 1, 0, 0o100, Access::EXEC), Ok(()));
3022            assert_eq!(check_access(locked, 1, 0, 0o100, Access::READ), error!(EACCES));
3023            assert_eq!(check_access(locked, 1, 0, 0o100, Access::WRITE), error!(EACCES));
3024
3025            assert_eq!(check_access(locked, 1, 0, 0o200, Access::EXEC), error!(EACCES));
3026            assert_eq!(check_access(locked, 1, 0, 0o200, Access::READ), error!(EACCES));
3027            assert_eq!(check_access(locked, 1, 0, 0o200, Access::WRITE), Ok(()));
3028
3029            assert_eq!(check_access(locked, 1, 0, 0o400, Access::EXEC), error!(EACCES));
3030            assert_eq!(check_access(locked, 1, 0, 0o400, Access::READ), Ok(()));
3031            assert_eq!(check_access(locked, 1, 0, 0o400, Access::WRITE), error!(EACCES));
3032
3033            assert_eq!(check_access(locked, 0, 2, 0o700, Access::EXEC), error!(EACCES));
3034            assert_eq!(check_access(locked, 0, 2, 0o700, Access::READ), error!(EACCES));
3035            assert_eq!(check_access(locked, 0, 2, 0o700, Access::WRITE), error!(EACCES));
3036
3037            assert_eq!(check_access(locked, 0, 2, 0o070, Access::EXEC), Ok(()));
3038            assert_eq!(check_access(locked, 0, 2, 0o070, Access::READ), Ok(()));
3039            assert_eq!(check_access(locked, 0, 2, 0o070, Access::WRITE), Ok(()));
3040
3041            assert_eq!(check_access(locked, 0, 3, 0o070, Access::EXEC), Ok(()));
3042            assert_eq!(check_access(locked, 0, 3, 0o070, Access::READ), Ok(()));
3043            assert_eq!(check_access(locked, 0, 3, 0o070, Access::WRITE), Ok(()));
3044        })
3045        .await;
3046    }
3047
3048    #[::fuchsia::test]
3049    async fn set_security_xattr_fails_without_security_module_or_root() {
3050        spawn_kernel_and_run(async |locked, current_task| {
3051            let mut creds = Credentials::with_ids(1, 2);
3052            creds.groups = vec![3, 4];
3053            current_task.set_creds(creds);
3054
3055            // Create a node.
3056            let node = &current_task
3057                .fs()
3058                .root()
3059                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceId::NONE)
3060                .expect("create_node")
3061                .entry
3062                .node;
3063
3064            // Give read-write-execute access.
3065            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3066
3067            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3068            // should fail.
3069            assert_eq!(
3070                node.set_xattr(
3071                    locked,
3072                    &current_task,
3073                    &MountInfo::detached(),
3074                    "security.name".into(),
3075                    "security_label".into(),
3076                    XattrOp::Create,
3077                ),
3078                error!(EPERM)
3079            );
3080        })
3081        .await;
3082    }
3083
3084    #[::fuchsia::test]
3085    async fn set_non_user_xattr_fails_without_security_module_or_root() {
3086        spawn_kernel_and_run(async |locked, current_task| {
3087            let mut creds = Credentials::with_ids(1, 2);
3088            creds.groups = vec![3, 4];
3089            current_task.set_creds(creds);
3090
3091            // Create a node.
3092            let node = &current_task
3093                .fs()
3094                .root()
3095                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceId::NONE)
3096                .expect("create_node")
3097                .entry
3098                .node;
3099
3100            // Give read-write-execute access.
3101            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3102
3103            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3104            // should fail.
3105            assert_eq!(
3106                node.set_xattr(
3107                    locked,
3108                    &current_task,
3109                    &MountInfo::detached(),
3110                    "trusted.name".into(),
3111                    "some data".into(),
3112                    XattrOp::Create,
3113                ),
3114                error!(EPERM)
3115            );
3116        })
3117        .await;
3118    }
3119
3120    #[::fuchsia::test]
3121    async fn get_security_xattr_succeeds_without_read_access() {
3122        spawn_kernel_and_run(async |locked, current_task| {
3123            let mut creds = Credentials::with_ids(1, 2);
3124            creds.groups = vec![3, 4];
3125            current_task.set_creds(creds);
3126
3127            // Create a node.
3128            let node = &current_task
3129                .fs()
3130                .root()
3131                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceId::NONE)
3132                .expect("create_node")
3133                .entry
3134                .node;
3135
3136            // Only give read access to the root and give root access to the current task.
3137            node.update_info(|info| info.mode = mode!(IFREG, 0o100));
3138            current_task.set_creds(Credentials::with_ids(0, 0));
3139
3140            // Setting the label should succeed even without write access to the file.
3141            assert_eq!(
3142                node.set_xattr(
3143                    locked,
3144                    &current_task,
3145                    &MountInfo::detached(),
3146                    "security.name".into(),
3147                    "security_label".into(),
3148                    XattrOp::Create,
3149                ),
3150                Ok(())
3151            );
3152
3153            // Remove root access from the current task.
3154            current_task.set_creds(Credentials::with_ids(1, 1));
3155
3156            // Getting the label should succeed even without read access to the file.
3157            assert_eq!(
3158                node.get_xattr(
3159                    locked,
3160                    &current_task,
3161                    &MountInfo::detached(),
3162                    "security.name".into(),
3163                    4096
3164                ),
3165                Ok(ValueOrSize::Value("security_label".into()))
3166            );
3167        })
3168        .await;
3169    }
3170}