Skip to main content

starnix_core/vfs/
fs_node.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::device::DeviceMode;
6use crate::mm::PAGE_SIZE;
7use crate::security::{self, Auditable, PermissionFlags};
8use crate::signals::{SignalInfo, send_standard_signal};
9use crate::task::{CurrentTask, CurrentTaskAndLocked, WaitQueue, Waiter, register_delayed_release};
10use crate::time::utc;
11use crate::vfs::fsverity::FsVerityState;
12use crate::vfs::pipe::{Pipe, PipeHandle};
13use crate::vfs::rw_queue::{RwQueue, RwQueueReadGuard};
14use crate::vfs::socket::SocketHandle;
15use crate::vfs::{
16    DefaultDirEntryOps, DirEntryOps, FileObject, FileObjectState, FileOps, FileSystem,
17    FileSystemHandle, FileWriteGuardState, FsStr, FsString, MAX_LFS_FILESIZE, MountInfo,
18    NamespaceNode, OPathOps, RecordLockCommand, RecordLockOwner, RecordLocks, WeakFileHandle,
19    checked_add_offset_and_length, inotify,
20};
21use bitflags::bitflags;
22use fuchsia_runtime::UtcInstant;
23use linux_uapi::{XATTR_SECURITY_PREFIX, XATTR_SYSTEM_PREFIX, XATTR_TRUSTED_PREFIX};
24use once_cell::race::OnceBool;
25use starnix_crypt::EncryptionKeyId;
26use starnix_lifecycle::{ObjectReleaser, ReleaserAction};
27use starnix_logging::{log_error, track_stub};
28use starnix_sync::{
29    BeforeFsNodeAppend, FileOpsCore, FsNodeAppend, LockBefore, LockEqualOrBefore, Locked, Mutex,
30    RwLock, RwLockReadGuard, Unlocked,
31};
32use starnix_types::ownership::{Releasable, ReleaseGuard};
33use starnix_types::time::{NANOS_PER_SECOND, timespec_from_time};
34use starnix_uapi::as_any::AsAny;
35use starnix_uapi::auth::{
36    CAP_CHOWN, CAP_DAC_OVERRIDE, CAP_DAC_READ_SEARCH, CAP_FOWNER, CAP_FSETID, CAP_MKNOD,
37    CAP_SYS_ADMIN, CAP_SYS_RESOURCE, FsCred, UserAndOrGroupId,
38};
39use starnix_uapi::device_type::DeviceType;
40use starnix_uapi::errors::{EACCES, ENOTSUP, EPERM, Errno};
41use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
42use starnix_uapi::inotify_mask::InotifyMask;
43use starnix_uapi::mount_flags::MountFlags;
44use starnix_uapi::open_flags::OpenFlags;
45use starnix_uapi::resource_limits::Resource;
46use starnix_uapi::seal_flags::SealFlags;
47use starnix_uapi::signals::SIGXFSZ;
48use starnix_uapi::{
49    FALLOC_FL_COLLAPSE_RANGE, FALLOC_FL_INSERT_RANGE, FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE,
50    FALLOC_FL_UNSHARE_RANGE, FALLOC_FL_ZERO_RANGE, LOCK_EX, LOCK_NB, LOCK_SH, LOCK_UN,
51    STATX__RESERVED, STATX_ATIME, STATX_ATTR_VERITY, STATX_BASIC_STATS, STATX_BLOCKS, STATX_CTIME,
52    STATX_GID, STATX_INO, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_UID, XATTR_USER_PREFIX,
53    errno, error, fsverity_descriptor, gid_t, ino_t, statx, statx_timestamp, timespec, uapi, uid_t,
54};
55use std::sync::atomic::Ordering;
56use std::sync::{Arc, OnceLock, Weak};
57use syncio::zxio_node_attr_has_t;
58
/// Whether an `FsNode` may be linked into a directory.
///
/// The default is [`FsNodeLinkBehavior::Allowed`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum FsNodeLinkBehavior {
    /// The node may be linked into a directory.
    #[default]
    Allowed,
    /// The node must not be linked into a directory.
    Disallowed,
}
70
/// Guard returned by [`AppendLockStrategy::lock`] and passed to `FsNodeOps::truncate` and
/// `FsNodeOps::allocate`.
pub enum AppendLockGuard<'a> {
    /// Owns a read guard on the node's append lock (built by `RealAppendLockStrategy`).
    Read(RwQueueReadGuard<'a, FsNodeAppend>),
    /// Borrows a guard that was obtained earlier instead of acquiring the lock again (built by
    /// `AlreadyLockedAppendLockStrategy`).
    AlreadyLocked(&'a AppendLockGuard<'a>),
}
75
/// Abstracts over how the append lock of an `FsNode` is obtained.
///
/// `L` is the lock level the caller holds when invoking [`AppendLockStrategy::lock`].
pub trait AppendLockStrategy<L> {
    /// Helper method for acquiring append lock in `truncate`/`allocate`. Acquires the lock when it's not already acquired.
    ///
    /// On success returns the guard together with a `Locked<FileOpsCore>` token for the
    /// operations that follow; on failure returns the `Errno` of the failed acquisition.
    fn lock<'a>(
        &'a self,
        locked: &'a mut Locked<L>,
        current_task: &CurrentTask,
        node: &'a FsNode,
    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FileOpsCore>), Errno>;
}
85
86struct RealAppendLockStrategy {}
87
88impl AppendLockStrategy<BeforeFsNodeAppend> for RealAppendLockStrategy {
89    fn lock<'a>(
90        &'a self,
91        locked: &'a mut Locked<BeforeFsNodeAppend>,
92        current_task: &CurrentTask,
93        node: &'a FsNode,
94    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FileOpsCore>), Errno> {
95        let (guard, new_locked) = node.ops().append_lock_read(locked, node, current_task)?;
96        Ok((AppendLockGuard::Read(guard), new_locked.cast_locked()))
97    }
98}
99
/// Strategy for callers that already hold an [`AppendLockGuard`]: instead of locking again it
/// hands out a guard that borrows the existing one.
pub struct AlreadyLockedAppendLockStrategy<'a> {
    // Keep the reference to the guard, which will be returned in subsequent attempts to acquire this lock.
    guard: &'a AppendLockGuard<'a>,
}

impl<'a> AlreadyLockedAppendLockStrategy<'a> {
    /// Wraps `guard`, the append-lock guard the caller already holds.
    pub fn new(guard: &'a AppendLockGuard<'a>) -> Self {
        Self { guard }
    }
}
110
111impl AppendLockStrategy<FileOpsCore> for AlreadyLockedAppendLockStrategy<'_> {
112    fn lock<'a>(
113        &'a self,
114        locked: &'a mut Locked<FileOpsCore>,
115        _current_task: &CurrentTask,
116        _node: &'a FsNode,
117    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FileOpsCore>), Errno> {
118        Ok((AppendLockGuard::AlreadyLocked(self.guard), locked.cast_locked::<FileOpsCore>()))
119    }
120}
121
/// A node of a file system: an inode number, the file-system-specific operations backing it,
/// and the locks and cached state attached to it.
pub struct FsNode {
    /// The inode number for this FsNode.
    pub ino: ino_t,

    /// The FsNodeOps for this FsNode.
    ///
    /// The FsNodeOps are implemented by the individual file systems to provide
    /// specific behaviors for this FsNode.
    ops: Box<dyn FsNodeOps>,

    /// The FileSystem that owns this FsNode's tree.
    ///
    /// Held as a `Weak` reference, so this field does not keep the file system alive.
    fs: Weak<FileSystem>,

    /// A RwLock to synchronize append operations for this node.
    ///
    /// FileObjects writing with O_APPEND should grab a write() lock on this
    /// field to ensure they operate sequentially. FileObjects writing without
    /// O_APPEND should grab read() lock so that they can operate in parallel.
    pub append_lock: RwQueue<FsNodeAppend>,

    /// Mutable information about this node.
    ///
    /// This data is used to populate the uapi::stat structure.
    info: RwLock<FsNodeInfo>,

    /// Data associated with an FsNode that is rarely needed.
    ///
    /// Stored behind a `OnceLock`, so it stays unset until something initializes it.
    rare_data: OnceLock<Box<FsNodeRareData>>,

    /// Tracks lock state for this file.
    pub write_guard_state: Mutex<FileWriteGuardState>,

    /// Cached FsVerity state associated with this node.
    pub fsverity: Mutex<FsVerityState>,

    /// The security state associated with this node. Must always be acquired last
    /// relative to other `FsNode` locks.
    pub security_state: security::FsNodeState,
}
160
/// State that only some nodes need (FIFOs, bound sockets, advisory and record locks,
/// `O_TMPFILE` link behavior, inotify watchers), kept out of `FsNode` itself.
#[derive(Default)]
struct FsNodeRareData {
    /// The pipe located at this node, if any.
    ///
    /// Used if, and only if, the node has a mode of FileMode::IFIFO.
    fifo: OnceLock<PipeHandle>,

    /// The UNIX domain socket bound to this node, if any.
    bound_socket: OnceLock<SocketHandle>,

    /// Advisory lock state for this node, as manipulated by `FileObject::flock`.
    ///
    /// No other lock on this object may be taken while this lock is held.
    flock_info: Mutex<FlockInfo>,

    /// Records locks associated with this node.
    record_locks: RecordLocks,

    /// Whether this node can be linked into a directory.
    ///
    /// Only set for nodes created with `O_TMPFILE`.
    link_behavior: OnceLock<FsNodeLinkBehavior>,

    /// Inotify watchers on this node. See inotify(7).
    watchers: inotify::InotifyWatchers,
}
187
188impl FsNodeRareData {
189    fn ensure_fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
190        self.fifo.get_or_init(|| {
191            let mut default_pipe_capacity = (*PAGE_SIZE * 16) as usize;
192            if !security::is_task_capable_noaudit(current_task, CAP_SYS_RESOURCE) {
193                let kernel = current_task.kernel();
194                let max_size = kernel.system_limits.pipe_max_size.load(Ordering::Relaxed);
195                default_pipe_capacity = std::cmp::min(default_pipe_capacity, max_size);
196            }
197            Pipe::new(default_pipe_capacity)
198        })
199    }
200}
201
/// Release action for `FsNode`. The enum has no variants, so it is never instantiated and
/// serves only as a type parameter of [`FsNodeReleaser`].
pub enum FsNodeReleaserAction {}
impl ReleaserAction<FsNode> for FsNodeReleaserAction {
    /// Hands the node to `register_delayed_release` instead of releasing it inline.
    fn release(fs_node: ReleaseGuard<FsNode>) {
        register_delayed_release(fs_node);
    }
}
/// An `FsNode` wrapped in an `ObjectReleaser` that uses [`FsNodeReleaserAction`].
pub type FsNodeReleaser = ObjectReleaser<FsNode, FsNodeReleaserAction>;
/// Strong, shared handle to an `FsNode`.
pub type FsNodeHandle = Arc<FsNodeReleaser>;
/// Weak counterpart of [`FsNodeHandle`].
pub type WeakFsNodeHandle = Weak<FsNodeReleaser>;
211
/// Mutable metadata of an `FsNode`.
///
/// `FsNode` keeps one of these behind its `info` lock and uses it to populate the uapi::stat
/// structure.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct FsNodeInfo {
    /// File type and mode bits of the node.
    pub mode: FileMode,
    /// Link count; `FsNodeInfo::new` starts directories at 2 and everything else at 1.
    pub link_count: usize,
    /// Owning user id.
    pub uid: uid_t,
    /// Owning group id.
    pub gid: gid_t,
    pub rdev: DeviceType,
    pub size: usize,
    /// Block size; `storage_size()` multiplies it by `blocks`.
    pub blksize: usize,
    /// Number of blocks; `storage_size()` multiplies it by `blksize`.
    pub blocks: usize,
    pub time_status_change: UtcInstant,
    pub time_access: UtcInstant,
    pub time_modify: UtcInstant,
    pub casefold: bool,

    // If this node is fscrypt encrypted, stores the id of the user wrapping key used to encrypt it.
    pub wrapping_key_id: Option<[u8; 16]>,

    // Used to indicate to filesystems that manage timestamps that an access has occurred and to
    // update the node's atime.
    // This only impacts accesses within Starnix. Most Fuchsia programs are not expected to maintain
    // access times. If the file handle is transferred out of Starnix, there may be inconsistencies.
    pub pending_time_access_update: bool,
}
236
237impl FsNodeInfo {
238    pub fn new(mode: FileMode, owner: FsCred) -> Self {
239        let now = utc::utc_now();
240        Self {
241            mode,
242            link_count: if mode.is_dir() { 2 } else { 1 },
243            uid: owner.uid,
244            gid: owner.gid,
245            blksize: DEFAULT_BYTES_PER_BLOCK,
246            time_status_change: now,
247            time_access: now,
248            time_modify: now,
249            ..Default::default()
250        }
251    }
252
253    pub fn storage_size(&self) -> usize {
254        self.blksize.saturating_mul(self.blocks)
255    }
256
257    pub fn chmod(&mut self, mode: FileMode) {
258        self.mode = (self.mode & !FileMode::PERMISSIONS) | (mode & FileMode::PERMISSIONS);
259    }
260
261    pub fn chown(&mut self, owner: Option<uid_t>, group: Option<gid_t>) {
262        if let Some(owner) = owner {
263            self.uid = owner;
264        }
265        if let Some(group) = group {
266            self.gid = group;
267        }
268        // Clear the setuid and setgid bits if the file is executable and a regular file.
269        if self.mode.is_reg() {
270            self.mode &= !FileMode::ISUID;
271            self.clear_sgid_bit();
272        }
273    }
274
275    fn clear_sgid_bit(&mut self) {
276        // If the group execute bit is not set, the setgid bit actually indicates mandatory
277        // locking and should not be cleared.
278        if self.mode.intersects(FileMode::IXGRP) {
279            self.mode &= !FileMode::ISGID;
280        }
281    }
282
283    fn clear_suid_and_sgid_bits(&mut self) {
284        self.mode &= !FileMode::ISUID;
285        self.clear_sgid_bit();
286    }
287
288    pub fn cred(&self) -> FsCred {
289        FsCred { uid: self.uid, gid: self.gid }
290    }
291
292    pub fn suid_and_sgid(
293        &self,
294        current_task: &CurrentTask,
295        fs_node: &FsNode,
296    ) -> Result<UserAndOrGroupId, Errno> {
297        let uid = self.mode.contains(FileMode::ISUID).then_some(self.uid);
298
299        // See <https://man7.org/linux/man-pages/man7/inode.7.html>:
300        //
301        //   For an executable file, the set-group-ID bit causes the
302        //   effective group ID of a process that executes the file to change
303        //   as described in execve(2).  For a file that does not have the
304        //   group execution bit (S_IXGRP) set, the set-group-ID bit indicates
305        //   mandatory file/record locking.
306        let gid = self.mode.contains(FileMode::ISGID | FileMode::IXGRP).then_some(self.gid);
307
308        let maybe_set_id = UserAndOrGroupId { uid, gid };
309        if maybe_set_id.is_some() {
310            // Check that uid and gid actually have execute access before
311            // returning them as the SUID or SGID.
312            check_access(
313                fs_node,
314                current_task,
315                security::PermissionFlags::EXEC,
316                self.uid,
317                self.gid,
318                self.mode,
319            )?;
320        }
321        Ok(maybe_set_id)
322    }
323}
324
/// Advisory lock state of a node, as manipulated by `FileObject::flock`.
#[derive(Default)]
struct FlockInfo {
    /// Whether the node is currently locked. The meaning of the different values are:
    /// - `None`: The node is not locked.
    /// - `Some(false)`: The node is locked non exclusively.
    /// - `Some(true)`: The node is locked exclusively.
    locked_exclusive: Option<bool>,
    /// The FileObjects that hold the lock, kept as weak handles.
    locking_handles: Vec<WeakFileHandle>,
    /// The queue used to notify tasks waiting on the lock.
    wait_queue: WaitQueue,
}
337
338impl FlockInfo {
339    /// Removes all file handle not holding `predicate` from the list of object holding the lock. If
340    /// this empties the list, unlocks the node and notifies all waiting processes.
341    pub fn retain<F>(&mut self, predicate: F)
342    where
343        F: Fn(&FileObject) -> bool,
344    {
345        if !self.locking_handles.is_empty() {
346            self.locking_handles
347                .retain(|w| if let Some(fh) = w.upgrade() { predicate(&fh) } else { false });
348            if self.locking_handles.is_empty() {
349                self.locked_exclusive = None;
350                self.wait_queue.notify_all();
351            }
352        }
353    }
354}
355
/// `st_blksize` is measured in units of 512 bytes.
///
/// This is the value `FsNodeInfo::new` stores in `blksize`.
// NOTE(review): stat(2) describes `st_blocks` (not `st_blksize`) as counted in 512-byte units;
// confirm which field the sentence above is meant to describe.
pub const DEFAULT_BYTES_PER_BLOCK: usize = 512;
358
359pub struct FlockOperation {
360    operation: u32,
361}
362
363impl FlockOperation {
364    pub fn from_flags(operation: u32) -> Result<Self, Errno> {
365        if operation & !(LOCK_SH | LOCK_EX | LOCK_UN | LOCK_NB) != 0 {
366            return error!(EINVAL);
367        }
368        if [LOCK_SH, LOCK_EX, LOCK_UN].iter().filter(|&&o| operation & o == o).count() != 1 {
369            return error!(EINVAL);
370        }
371        Ok(Self { operation })
372    }
373
374    pub fn is_unlock(&self) -> bool {
375        self.operation & LOCK_UN > 0
376    }
377
378    pub fn is_lock_exclusive(&self) -> bool {
379        self.operation & LOCK_EX > 0
380    }
381
382    pub fn is_blocking(&self) -> bool {
383        self.operation & LOCK_NB == 0
384    }
385}
386
387impl FileObject {
388    /// Advisory locking.
389    ///
390    /// See flock(2).
391    pub fn flock(
392        &self,
393        locked: &mut Locked<Unlocked>,
394        current_task: &CurrentTask,
395        operation: FlockOperation,
396    ) -> Result<(), Errno> {
397        if self.flags().contains(OpenFlags::PATH) {
398            return error!(EBADF);
399        }
400        loop {
401            let mut flock_info = self.name.entry.node.ensure_rare_data().flock_info.lock();
402            if operation.is_unlock() {
403                flock_info.retain(|fh| !std::ptr::eq(fh, self));
404                return Ok(());
405            }
406            // Operation is a locking operation.
407            // 1. File is not locked
408            if flock_info.locked_exclusive.is_none() {
409                flock_info.locked_exclusive = Some(operation.is_lock_exclusive());
410                flock_info.locking_handles.push(self.weak_handle.clone());
411                return Ok(());
412            }
413
414            let file_lock_is_exclusive = flock_info.locked_exclusive == Some(true);
415            let fd_has_lock = flock_info
416                .locking_handles
417                .iter()
418                .find_map(|w| {
419                    w.upgrade().and_then(|fh| {
420                        if std::ptr::eq(&fh as &FileObject, self) { Some(()) } else { None }
421                    })
422                })
423                .is_some();
424
425            // 2. File is locked, but fd already have a lock
426            if fd_has_lock {
427                if operation.is_lock_exclusive() == file_lock_is_exclusive {
428                    // Correct lock is already held, return.
429                    return Ok(());
430                } else {
431                    // Incorrect lock is held. Release the lock and loop back to try to reacquire
432                    // it. flock doesn't guarantee atomic lock type switching.
433                    flock_info.retain(|fh| !std::ptr::eq(fh, self));
434                    continue;
435                }
436            }
437
438            // 3. File is locked, and fd doesn't have a lock.
439            if !file_lock_is_exclusive && !operation.is_lock_exclusive() {
440                // The lock is not exclusive, let's grab it.
441                flock_info.locking_handles.push(self.weak_handle.clone());
442                return Ok(());
443            }
444
445            // 4. The operation cannot be done at this time.
446            if !operation.is_blocking() {
447                return error!(EAGAIN);
448            }
449
450            // Register a waiter to be notified when the lock is released. Release the lock on
451            // FlockInfo, and wait.
452            let waiter = Waiter::new();
453            flock_info.wait_queue.wait_async(&waiter);
454            std::mem::drop(flock_info);
455            waiter.wait(locked, current_task)?;
456        }
457    }
458}
459
// The inner mod is required because bitflags cannot pass the attribute through to the single
// variant, and attributes cannot be applied to macro invocations.
mod inner_flags {
    // Part of the code for the AT_STATX_SYNC_AS_STAT case that's produced by the macro triggers the
    // lint, but as a whole, the produced code is still correct.
    #![allow(clippy::bad_bit_mask)] // TODO(b/303500202) Remove once addressed in bitflags.
    use super::{bitflags, uapi};

    // Bit set over `u32`; each constant mirrors the `uapi` constant of the same name.
    // NOTE(review): `STATX_ATTR_VERITY` reads like a statx attribute bit rather than an `AT_*`
    // flag - confirm it is meant to live in this set.
    bitflags! {
        #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub struct StatxFlags: u32 {
            const AT_SYMLINK_NOFOLLOW = uapi::AT_SYMLINK_NOFOLLOW;
            const AT_EMPTY_PATH = uapi::AT_EMPTY_PATH;
            const AT_NO_AUTOMOUNT = uapi::AT_NO_AUTOMOUNT;
            const AT_STATX_SYNC_AS_STAT = uapi::AT_STATX_SYNC_AS_STAT;
            const AT_STATX_FORCE_SYNC = uapi::AT_STATX_FORCE_SYNC;
            const AT_STATX_DONT_SYNC = uapi::AT_STATX_DONT_SYNC;
            const STATX_ATTR_VERITY = uapi::STATX_ATTR_VERITY;
        }
    }
}

// Re-exported so that callers can name `StatxFlags` without going through `inner_flags`.
pub use inner_flags::StatxFlags;
483
/// The kind of child an unlink operation is meant to remove.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum UnlinkKind {
    /// Unlink a directory.
    Directory,

    /// Unlink a non-directory.
    NonDirectory,
}
492
/// What a symlink resolves to, as returned by `FsNodeOps::readlink`.
pub enum SymlinkTarget {
    /// The target is given as a path string.
    Path(FsString),
    /// The target is given directly as a namespace node.
    Node(NamespaceNode),
}
497
498#[derive(Clone, Copy, PartialEq, Eq)]
499pub enum XattrOp {
500    /// Set the value of the extended attribute regardless of whether it exists.
501    Set,
502    /// Create a new extended attribute. Fail if it already exists.
503    Create,
504    /// Replace the value of the extended attribute. Fail if it doesn't exist.
505    Replace,
506}
507
508impl XattrOp {
509    pub fn into_flags(self) -> u32 {
510        match self {
511            Self::Set => 0,
512            Self::Create => uapi::XATTR_CREATE,
513            Self::Replace => uapi::XATTR_REPLACE,
514        }
515    }
516}
517
/// Holds either a value or the size required to contain it.
#[derive(Clone, Debug, PartialEq)]
pub enum ValueOrSize<T> {
    /// The value itself.
    Value(T),
    /// Only the size needed to hold the value.
    Size(usize),
}

impl<T> ValueOrSize<T> {
    /// Applies `f` to a contained value; a `Size` is passed through unchanged.
    pub fn map<F, U>(self, f: F) -> ValueOrSize<U>
    where
        F: FnOnce(T) -> U,
    {
        match self {
            ValueOrSize::Value(value) => {
                let mapped = f(value);
                ValueOrSize::Value(mapped)
            }
            ValueOrSize::Size(size) => ValueOrSize::Size(size),
        }
    }

    /// Test helper that extracts the value and panics when only a size is present.
    #[cfg(test)]
    pub fn unwrap(self) -> T {
        if let Self::Value(value) = self {
            return value;
        }
        panic!("Unwrap ValueOrSize that is a Size")
    }
}

impl<T> From<T> for ValueOrSize<T> {
    /// Wraps `t` as [`ValueOrSize::Value`].
    fn from(t: T) -> Self {
        ValueOrSize::Value(t)
    }
}
550
551#[derive(Copy, Clone, Eq, PartialEq, Debug)]
552pub enum FallocMode {
553    Allocate { keep_size: bool },
554    PunchHole,
555    Collapse,
556    Zero { keep_size: bool },
557    InsertRange,
558    UnshareRange,
559}
560
561impl FallocMode {
562    pub fn from_bits(mode: u32) -> Option<Self> {
563        // `fallocate()` allows only the following values for `mode`.
564        if mode == 0 {
565            Some(Self::Allocate { keep_size: false })
566        } else if mode == FALLOC_FL_KEEP_SIZE {
567            Some(Self::Allocate { keep_size: true })
568        } else if mode == FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE {
569            Some(Self::PunchHole)
570        } else if mode == FALLOC_FL_COLLAPSE_RANGE {
571            Some(Self::Collapse)
572        } else if mode == FALLOC_FL_ZERO_RANGE {
573            Some(Self::Zero { keep_size: false })
574        } else if mode == FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE {
575            Some(Self::Zero { keep_size: true })
576        } else if mode == FALLOC_FL_INSERT_RANGE {
577            Some(Self::InsertRange)
578        } else if mode == FALLOC_FL_UNSHARE_RANGE {
579            Some(Self::UnshareRange)
580        } else {
581            None
582        }
583    }
584}
585
/// Why an access check is being performed; passed to `FsNodeOps::check_access` alongside the
/// requested permissions.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum CheckAccessReason {
    Access,
    Chdir,
    Chroot,
    Exec,
    // NOTE(review): `now` presumably distinguishes "set to the current time" from setting
    // explicit timestamps - confirm against the callers.
    ChangeTimestamps { now: bool },
    InternalPermissionChecks,
}
595
596pub trait FsNodeOps: Send + Sync + AsAny + 'static {
597    /// Delegate the access check to the node.
598    fn check_access(
599        &self,
600        _locked: &mut Locked<FileOpsCore>,
601        node: &FsNode,
602        current_task: &CurrentTask,
603        access: security::PermissionFlags,
604        info: &RwLock<FsNodeInfo>,
605        reason: CheckAccessReason,
606        audit_context: security::Auditable<'_>,
607    ) -> Result<(), Errno> {
608        node.default_check_access_impl(current_task, access, reason, info.read(), audit_context)
609    }
610
611    /// Build the [`DirEntryOps`] for a new [`DirEntry`] that will be associated
612    /// to this node.
613    fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
614        Box::new(DefaultDirEntryOps)
615    }
616
617    /// Build the `FileOps` for the file associated to this node.
618    ///
619    /// The returned FileOps will be used to create a FileObject, which might
620    /// be assigned an FdNumber.
621    fn create_file_ops(
622        &self,
623        locked: &mut Locked<FileOpsCore>,
624        node: &FsNode,
625        _current_task: &CurrentTask,
626        flags: OpenFlags,
627    ) -> Result<Box<dyn FileOps>, Errno>;
628
629    /// Find an existing child node and populate the child parameter. Return the node.
630    ///
631    /// The child parameter is an empty node. Operations other than initialize may panic before
632    /// initialize is called.
633    fn lookup(
634        &self,
635        _locked: &mut Locked<FileOpsCore>,
636        _node: &FsNode,
637        _current_task: &CurrentTask,
638        name: &FsStr,
639    ) -> Result<FsNodeHandle, Errno> {
640        // The default implementation here is suitable for filesystems that have permanent entries;
641        // entries that already exist will get found in the cache and shouldn't get this far.
642        error!(ENOENT, format!("looking for {name}"))
643    }
644
645    /// Create and return the given child node.
646    ///
647    /// The mode field of the FsNodeInfo indicates what kind of child to
648    /// create.
649    ///
650    /// This function is never called with FileMode::IFDIR. The mkdir function
651    /// is used to create directories instead.
652    fn mknod(
653        &self,
654        locked: &mut Locked<FileOpsCore>,
655        _node: &FsNode,
656        _current_task: &CurrentTask,
657        _name: &FsStr,
658        _mode: FileMode,
659        _dev: DeviceType,
660        _owner: FsCred,
661    ) -> Result<FsNodeHandle, Errno>;
662
663    /// Create and return the given child node as a subdirectory.
664    fn mkdir(
665        &self,
666        locked: &mut Locked<FileOpsCore>,
667        _node: &FsNode,
668        _current_task: &CurrentTask,
669        _name: &FsStr,
670        _mode: FileMode,
671        _owner: FsCred,
672    ) -> Result<FsNodeHandle, Errno>;
673
674    /// Creates a symlink with the given `target` path.
675    fn create_symlink(
676        &self,
677        locked: &mut Locked<FileOpsCore>,
678        _node: &FsNode,
679        _current_task: &CurrentTask,
680        _name: &FsStr,
681        _target: &FsStr,
682        _owner: FsCred,
683    ) -> Result<FsNodeHandle, Errno>;
684
685    /// Creates an anonymous file.
686    ///
687    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
688    ///
689    /// Used by O_TMPFILE.
690    fn create_tmpfile(
691        &self,
692        _node: &FsNode,
693        _current_task: &CurrentTask,
694        _mode: FileMode,
695        _owner: FsCred,
696    ) -> Result<FsNodeHandle, Errno> {
697        error!(EOPNOTSUPP)
698    }
699
700    /// Reads the symlink from this node.
701    fn readlink(
702        &self,
703        _locked: &mut Locked<FileOpsCore>,
704        _node: &FsNode,
705        _current_task: &CurrentTask,
706    ) -> Result<SymlinkTarget, Errno> {
707        error!(EINVAL)
708    }
709
710    /// Create a hard link with the given name to the given child.
711    fn link(
712        &self,
713        _locked: &mut Locked<FileOpsCore>,
714        _node: &FsNode,
715        _current_task: &CurrentTask,
716        _name: &FsStr,
717        _child: &FsNodeHandle,
718    ) -> Result<(), Errno> {
719        error!(EPERM)
720    }
721
722    /// Remove the child with the given name, if the child exists.
723    ///
724    /// The UnlinkKind parameter indicates whether the caller intends to unlink
725    /// a directory or a non-directory child.
726    fn unlink(
727        &self,
728        locked: &mut Locked<FileOpsCore>,
729        _node: &FsNode,
730        _current_task: &CurrentTask,
731        _name: &FsStr,
732        _child: &FsNodeHandle,
733    ) -> Result<(), Errno>;
734
735    /// Acquire the necessary append lock for the operations that depend on them.
736    /// Should be done before calling `allocate` or `truncate` to avoid lock ordering issues.
737    fn append_lock_read<'a>(
738        &'a self,
739        locked: &'a mut Locked<BeforeFsNodeAppend>,
740        node: &'a FsNode,
741        current_task: &CurrentTask,
742    ) -> Result<(RwQueueReadGuard<'a, FsNodeAppend>, &'a mut Locked<FsNodeAppend>), Errno> {
743        return node.append_lock.read_and(locked, current_task);
744    }
745
746    /// Change the length of the file.
747    fn truncate(
748        &self,
749        _locked: &mut Locked<FileOpsCore>,
750        _guard: &AppendLockGuard<'_>,
751        _node: &FsNode,
752        _current_task: &CurrentTask,
753        _length: u64,
754    ) -> Result<(), Errno> {
755        error!(EINVAL)
756    }
757
758    /// Manipulate allocated disk space for the file.
759    fn allocate(
760        &self,
761        _locked: &mut Locked<FileOpsCore>,
762        _guard: &AppendLockGuard<'_>,
763        _node: &FsNode,
764        _current_task: &CurrentTask,
765        _mode: FallocMode,
766        _offset: u64,
767        _length: u64,
768    ) -> Result<(), Errno> {
769        error!(EINVAL)
770    }
771
772    /// Update the supplied info with initial state (e.g. size) for the node.
773    ///
774    /// FsNode calls this method when created, to allow the FsNodeOps to
775    /// set appropriate initial values in the FsNodeInfo.
776    fn initial_info(&self, _info: &mut FsNodeInfo) {}
777
778    /// Update node.info as needed.
779    ///
780    /// FsNode calls this method before converting the FsNodeInfo struct into
781    /// the uapi::stat struct to give the file system a chance to update this data
782    /// before it is used by clients.
783    ///
784    /// File systems that keep the FsNodeInfo up-to-date do not need to
785    /// override this function.
786    ///
787    /// Return a read guard for the updated information.
788    fn fetch_and_refresh_info<'a>(
789        &self,
790        _locked: &mut Locked<FileOpsCore>,
791        _node: &FsNode,
792        _current_task: &CurrentTask,
793        info: &'a RwLock<FsNodeInfo>,
794    ) -> Result<RwLockReadGuard<'a, FsNodeInfo>, Errno> {
795        Ok(info.read())
796    }
797
798    /// Syncs cached data to persistent storage.
799    fn sync(&self, _node: &FsNode, _current_task: &CurrentTask) -> Result<(), Errno> {
800        Ok(())
801    }
802
803    /// Update node attributes persistently.
804    fn update_attributes(
805        &self,
806        _locked: &mut Locked<FileOpsCore>,
807        _node: &FsNode,
808        _current_task: &CurrentTask,
809        _info: &FsNodeInfo,
810        _has: zxio_node_attr_has_t,
811    ) -> Result<(), Errno> {
812        Ok(())
813    }
814
815    /// Get an extended attribute on the node.
816    ///
817    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
818    /// instead return the size of the attribute, and can return an ERANGE error if max_size is not
819    /// 0, and lesser than the required size.
820    fn get_xattr(
821        &self,
822        _locked: &mut Locked<FileOpsCore>,
823        _node: &FsNode,
824        _current_task: &CurrentTask,
825        _name: &FsStr,
826        _max_size: usize,
827    ) -> Result<ValueOrSize<FsString>, Errno> {
828        error!(ENOTSUP)
829    }
830
831    /// Set an extended attribute on the node.
832    fn set_xattr(
833        &self,
834        _locked: &mut Locked<FileOpsCore>,
835        _node: &FsNode,
836        _current_task: &CurrentTask,
837        _name: &FsStr,
838        _value: &FsStr,
839        _op: XattrOp,
840    ) -> Result<(), Errno> {
841        error!(ENOTSUP)
842    }
843
844    fn remove_xattr(
845        &self,
846        _locked: &mut Locked<FileOpsCore>,
847        _node: &FsNode,
848        _current_task: &CurrentTask,
849        _name: &FsStr,
850    ) -> Result<(), Errno> {
851        error!(ENOTSUP)
852    }
853
854    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
855    /// instead return the size of the 0 separated string needed to represent the value, and can
856    /// return an ERANGE error if max_size is not 0, and lesser than the required size.
857    fn list_xattrs(
858        &self,
859        _locked: &mut Locked<FileOpsCore>,
860        _node: &FsNode,
861        _current_task: &CurrentTask,
862        _max_size: usize,
863    ) -> Result<ValueOrSize<Vec<FsString>>, Errno> {
864        error!(ENOTSUP)
865    }
866
867    /// Called when the FsNode is freed by the Kernel.
868    fn forget(
869        self: Box<Self>,
870        _locked: &mut Locked<FileOpsCore>,
871        _current_task: &CurrentTask,
872        _info: FsNodeInfo,
873    ) -> Result<(), Errno> {
874        Ok(())
875    }
876
877    ////////////////////
878    // FS-Verity operations
879
    /// Marks that FS-Verity is being built. Writes the fsverity descriptor and the merkle tree,
    /// the latter computed by the filesystem.
    ///
    /// Implementations should ensure there are no writable file handles. Returns `EEXIST` if the
    /// file was already fsverity-enabled. Returns `EBUSY` if this ioctl was already running on
    /// this file.
    ///
    /// The default implementation fails with `ENOTSUP`.
    fn enable_fsverity(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _node: &FsNode,
        _current_task: &CurrentTask,
        _descriptor: &fsverity_descriptor,
    ) -> Result<(), Errno> {
        error!(ENOTSUP)
    }
893
    /// Reads the fsverity descriptor if the node is fsverity-enabled; otherwise returns
    /// `ENODATA`.
    ///
    /// The default implementation fails with `ENOTSUP`.
    fn get_fsverity_descriptor(&self, _log_blocksize: u8) -> Result<fsverity_descriptor, Errno> {
        error!(ENOTSUP)
    }
898
    /// Returns a descriptive name for this node, suitable to report to userspace in situations
    /// where the node's path is unavailable (e.g. because it is anonymous, and has no path).
    /// If no name is returned then a default name of the form `<class>:[<node_id>]` will be used.
    ///
    /// The default implementation returns `None`.
    fn internal_name(&self, _node: &FsNode) -> Option<FsString> {
        None
    }
905
    /// The key used to identify this node in the file system's node cache.
    ///
    /// For many file systems, this will be the same as the inode number. However, some file
    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
    ///
    /// The default implementation returns the node's inode number, `node.ino`.
    fn node_key(&self, node: &FsNode) -> ino_t {
        node.ino
    }
913}
914
/// Boxes any concrete [`FsNodeOps`] implementation as a trait object, so that APIs can accept
/// `impl Into<Box<dyn FsNodeOps>>` and callers can pass the ops value directly.
impl<T> From<T> for Box<dyn FsNodeOps>
where
    T: FsNodeOps,
{
    fn from(ops: T) -> Box<dyn FsNodeOps> {
        Box::new(ops)
    }
}
923
/// Implements [`FsNodeOps`] methods in a way that makes sense for symlinks.
/// You must implement [`FsNodeOps::readlink`].
///
/// The expansion contains the directory-only stubs from [`fs_node_impl_not_dir`] and a
/// `create_file_ops` that asserts the node is a symlink and then panics.
///
/// Every type in the expansion is spelled with a full path so that the macro can be invoked
/// without importing anything at the call site.
#[macro_export]
macro_rules! fs_node_impl_symlink {
    () => {
        $crate::vfs::fs_node_impl_not_dir!();

        fn create_file_ops(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _flags: starnix_uapi::open_flags::OpenFlags,
        ) -> Result<Box<dyn $crate::vfs::FileOps>, starnix_uapi::errors::Errno> {
            assert!(node.is_lnk());
            unreachable!("Symlink nodes cannot be opened.");
        }
    };
}
943
/// Implements the mutating directory methods of [`FsNodeOps`] for a read-only directory.
///
/// The expansion provides:
///   - `check_access`, which fails with `EROFS` when write access is requested and otherwise
///     defers to `FsNode::default_check_access_impl`;
///   - `mkdir`, `mknod`, `create_symlink`, `link` and `unlink`, which all fail with `EROFS`.
///
/// Every type in the expansion is spelled with a full path so that the macro can be invoked
/// without importing anything at the call site.
#[macro_export]
macro_rules! fs_node_impl_dir_readonly {
    () => {
        fn check_access(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            current_task: &$crate::task::CurrentTask,
            permission_flags: $crate::security::PermissionFlags,
            info: &starnix_sync::RwLock<$crate::vfs::FsNodeInfo>,
            reason: $crate::vfs::CheckAccessReason,
            audit_context: $crate::security::Auditable<'_>,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            // Reject writes before running the regular permission checks.
            let access = permission_flags.as_access();
            if access.contains(starnix_uapi::file_mode::Access::WRITE) {
                return starnix_uapi::error!(
                    EROFS,
                    format!("check_access failed: read-only directory")
                );
            }
            node.default_check_access_impl(
                current_task,
                permission_flags,
                reason,
                info.read(),
                audit_context,
            )
        }

        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mkdir failed: {:?}", name))
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_type::DeviceType,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mknod failed: {:?}", name))
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("symlink failed: {:?}", name))
        }

        fn link(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("link failed: {:?}", name))
        }

        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("unlink failed: {:?}", name))
        }
    };
}
1033
/// Trait that objects can implement if they need to handle extended attribute storage. Allows
/// delegating extended attribute operations in [`FsNodeOps`] to another object.
///
/// Unlike the corresponding [`FsNodeOps`] methods, these take no node, task or size-limit
/// arguments, and return plain values rather than `ValueOrSize`.
///
/// See [`fs_node_impl_xattr_delegate`] for usage details.
pub trait XattrStorage {
    /// Delegate for [`FsNodeOps::get_xattr`]. Returns the value stored under `name`.
    fn get_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<FsString, Errno>;

    /// Delegate for [`FsNodeOps::set_xattr`]. Stores `value` under `name` according to `op`.
    fn set_xattr(
        &self,
        locked: &mut Locked<FileOpsCore>,
        name: &FsStr,
        value: &FsStr,
        op: XattrOp,
    ) -> Result<(), Errno>;

    /// Delegate for [`FsNodeOps::remove_xattr`]. Removes the attribute called `name`.
    fn remove_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<(), Errno>;

    /// Delegate for [`FsNodeOps::list_xattrs`]. Returns the names of the stored attributes.
    fn list_xattrs(&self, locked: &mut Locked<FileOpsCore>) -> Result<Vec<FsString>, Errno>;
}
1057
/// Implements extended attribute ops for [`FsNodeOps`] by delegating to another object which
/// implements the [`XattrStorage`] trait or a similar interface. For example:
///
/// ```
/// struct Xattrs {}
///
/// impl XattrStorage for Xattrs {
///     // implement XattrStorage
/// }
///
/// struct Node {
///     xattrs: Xattrs
/// }
///
/// impl FsNodeOps for Node {
///     // Delegate extended attribute ops in FsNodeOps to self.xattrs
///     fs_node_impl_xattr_delegate!(self, self.xattrs);
///
///     // add other FsNodeOps impls here
/// }
/// ```
///
/// The generated `get_xattr` and `list_xattrs` ignore their size argument and convert the
/// delegate's result with `.into()`.
///
/// Every type in the expansion is spelled with a full path so that the macro can be invoked
/// without importing anything at the call site.
#[macro_export]
macro_rules! fs_node_impl_xattr_delegate {
    ($self:ident, $delegate:expr) => {
        fn get_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _size: usize,
        ) -> Result<$crate::vfs::ValueOrSize<$crate::vfs::FsString>, starnix_uapi::errors::Errno> {
            Ok($delegate.get_xattr(locked, name)?.into())
        }

        fn set_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            value: &$crate::vfs::FsStr,
            op: $crate::vfs::XattrOp,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.set_xattr(locked, name, value, op)
        }

        fn remove_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.remove_xattr(locked, name)
        }

        fn list_xattrs(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _size: usize,
        ) -> Result<$crate::vfs::ValueOrSize<Vec<$crate::vfs::FsString>>, starnix_uapi::errors::Errno> {
            Ok($delegate.list_xattrs(locked)?.into())
        }
    };
}
1126
/// Stubs out [`FsNodeOps`] methods that only apply to directories.
///
/// The expansion defines `lookup`, `mknod`, `mkdir`, `create_symlink` and `unlink`, each of
/// which ignores its arguments and fails with `ENOTDIR`.
#[macro_export]
macro_rules! fs_node_impl_not_dir {
    () => {
        fn lookup(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_type::DeviceType,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }
    };
}
1190
/// Describes how a single timestamp should be changed by a time-update request.
// NOTE(review): the variants look like they correspond to `UTIME_NOW`, `UTIME_OMIT` and an
// explicit timestamp as used by utimensat(2) — confirm against the callers.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TimeUpdateType {
    Now,
    Omit,
    /// Carries the explicit UTC instant to use.
    Time(UtcInstant),
}
1197
1198// Public re-export of macros allows them to be used like regular rust items.
1199pub use {
1200    fs_node_impl_dir_readonly, fs_node_impl_not_dir, fs_node_impl_symlink,
1201    fs_node_impl_xattr_delegate,
1202};
1203
/// [`FsNodeOps`] for nodes that are never opened through `create_file_ops`.
///
/// All directory operations are stubbed out with `ENOTDIR`, and `create_file_ops` panics if it
/// is ever reached.
pub struct SpecialNode;

impl FsNodeOps for SpecialNode {
    fs_node_impl_not_dir!();

    /// Always panics: reaching this method for a `SpecialNode` is treated as a bug.
    fn create_file_ops(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _node: &FsNode,
        _current_task: &CurrentTask,
        _flags: OpenFlags,
    ) -> Result<Box<dyn FileOps>, Errno> {
        unreachable!("Special nodes cannot be opened.");
    }
}
1219
1220impl FsNode {
1221    /// Create a node without inserting it into the FileSystem node cache.
1222    ///
1223    /// This is usually not what you want!
1224    /// Only use if you're also using get_or_create_node, like ext4.
1225    pub fn new_uncached(
1226        ino: ino_t,
1227        ops: impl Into<Box<dyn FsNodeOps>>,
1228        fs: &FileSystemHandle,
1229        info: FsNodeInfo,
1230    ) -> FsNodeHandle {
1231        let ops = ops.into();
1232        FsNodeHandle::new(Self::new_internal(ino, ops, Arc::downgrade(fs), info).into())
1233    }
1234
    /// Builds the `FsNode` value itself, without wrapping it in a handle.
    ///
    /// `ops.initial_info` gets a chance to adjust `info` before it is stored in the node. The
    /// remaining fields start in their default state, with `fsverity` set to
    /// `FsVerityState::None`.
    fn new_internal(
        ino: ino_t,
        ops: Box<dyn FsNodeOps>,
        fs: Weak<FileSystem>,
        info: FsNodeInfo,
    ) -> Self {
        // Allow the FsNodeOps to populate initial info.
        let info = {
            let mut info = info;
            ops.initial_info(&mut info);
            info
        };

        // The linter will fail in non test mode as it will not see the lock check.
        #[allow(clippy::let_and_return)]
        {
            let result = Self {
                ino,
                ops,
                fs,
                info: RwLock::new(info),
                append_lock: Default::default(),
                rare_data: Default::default(),
                write_guard_state: Default::default(),
                fsverity: Mutex::new(FsVerityState::None),
                security_state: Default::default(),
            };
            // In test and debug builds only, acquire each of the node's locks once, in a fixed
            // order, and hold them all until the end of this block.
            // NOTE(review): presumably this exercises the expected lock ordering so that a
            // lock-order checker can validate it — confirm.
            #[cfg(any(test, debug_assertions))]
            {
                #[allow(
                    clippy::undocumented_unsafe_blocks,
                    reason = "Force documented unsafe blocks in Starnix"
                )]
                let locked = unsafe { Unlocked::new() };
                let _l1 = result.append_lock.read_for_lock_ordering(locked);
                let _l2 = result.info.read();
                let _l3 = result.write_guard_state.lock();
                let _l4 = result.fsverity.lock();
                // TODO(https://fxbug.dev/367585803): Add lock levels to SELinux implementation.
                let _l5 = result.security_state.lock();
            }
            result
        }
    }
1279
1280    pub fn fs(&self) -> FileSystemHandle {
1281        self.fs.upgrade().expect("FileSystem did not live long enough")
1282    }
1283
1284    pub fn ops(&self) -> &dyn FsNodeOps {
1285        self.ops.as_ref()
1286    }
1287
1288    /// Returns an error if this node is encrypted and locked. Does not require
1289    /// fetch_and_refresh_info because FS_IOC_SET_ENCRYPTION_POLICY updates info and once a node is
1290    /// encrypted, it remains encrypted forever.
1291    pub fn fail_if_locked(&self, _current_task: &CurrentTask) -> Result<(), Errno> {
1292        let node_info = self.info();
1293        if let Some(wrapping_key_id) = node_info.wrapping_key_id {
1294            let crypt_service = self.fs().crypt_service().ok_or_else(|| errno!(ENOKEY))?;
1295            if !crypt_service.contains_key(EncryptionKeyId::from(wrapping_key_id)) {
1296                return error!(ENOKEY);
1297            }
1298        }
1299        Ok(())
1300    }
1301
1302    /// Returns the `FsNode`'s `FsNodeOps` as a `&T`, or `None` if the downcast fails.
1303    pub fn downcast_ops<T>(&self) -> Option<&T>
1304    where
1305        T: 'static,
1306    {
1307        self.ops().as_any().downcast_ref::<T>()
1308    }
1309
    /// Cleans up the locks associated with `file` when it is closed.
    ///
    /// If the node has rare data, its flock state is pruned; in all cases the record locks
    /// owned by the file object (identified by `file.id`) are released.
    pub fn on_file_closed(&self, file: &FileObjectState) {
        if let Some(rare_data) = self.rare_data.get() {
            let mut flock_info = rare_data.flock_info.lock();
            // This function will drop the flock from `file` because the `WeakFileHandle` for
            // `file` will no longer upgrade to an `FileHandle`.
            flock_info.retain(|_| true);
        }
        self.record_lock_release(RecordLockOwner::FileObject(file.id));
    }
1319
    /// Applies the record-lock command `cmd` with the lock description `flock` on behalf of
    /// `file`.
    ///
    /// The node's rare data is created on demand, and the request is forwarded to its
    /// `record_locks`, whose result is returned unchanged.
    pub fn record_lock(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        file: &FileObject,
        cmd: RecordLockCommand,
        flock: uapi::flock,
    ) -> Result<Option<uapi::flock>, Errno> {
        self.ensure_rare_data().record_locks.lock(locked, current_task, file, cmd, flock)
    }
1330
1331    /// Release all record locks acquired by the given owner.
1332    pub fn record_lock_release(&self, owner: RecordLockOwner) {
1333        if let Some(rare_data) = self.rare_data.get() {
1334            rare_data.record_locks.release_locks(owner);
1335        }
1336    }
1337
    /// Asks the node's `FsNodeOps` to create the `DirEntryOps` for this node.
    pub fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
        self.ops().create_dir_entry_ops()
    }
1341
1342    pub fn create_file_ops<L>(
1343        &self,
1344        locked: &mut Locked<L>,
1345        current_task: &CurrentTask,
1346        flags: OpenFlags,
1347    ) -> Result<Box<dyn FileOps>, Errno>
1348    where
1349        L: LockEqualOrBefore<FileOpsCore>,
1350    {
1351        let locked = locked.cast_locked::<FileOpsCore>();
1352        self.ops().create_file_ops(locked, self, current_task, flags)
1353    }
1354
1355    pub fn open(
1356        &self,
1357        locked: &mut Locked<Unlocked>,
1358        current_task: &CurrentTask,
1359        namespace_node: &NamespaceNode,
1360        flags: OpenFlags,
1361        access_check: AccessCheck,
1362    ) -> Result<Box<dyn FileOps>, Errno> {
1363        // If O_PATH is set, there is no need to create a real FileOps because
1364        // most file operations are disabled.
1365        if flags.contains(OpenFlags::PATH) {
1366            return Ok(Box::new(OPathOps::new()));
1367        }
1368
1369        let access = access_check.resolve(flags);
1370        if access.is_nontrivial() {
1371            if flags.contains(OpenFlags::NOATIME) {
1372                self.check_o_noatime_allowed(current_task)?;
1373            }
1374
1375            // `flags` doesn't contain any information about the EXEC permission. Instead the syscalls
1376            // used to execute a file (`sys_execve` and `sys_execveat`) call `open()` with the EXEC
1377            // permission request in `access`.
1378            let mut permission_flags = PermissionFlags::from(access);
1379
1380            // The `APPEND` flag exists only in `flags`, to modify the behaviour of
1381            // `PermissionFlags::WRITE`
1382            if flags.contains(OpenFlags::APPEND) {
1383                permission_flags |= security::PermissionFlags::APPEND;
1384            }
1385
1386            // TODO: https://fxbug.dev/455782510 - Remove this once non-open() checks are fully
1387            // enforced.
1388            permission_flags |= security::PermissionFlags::FOR_OPEN;
1389
1390            self.check_access(
1391                locked,
1392                current_task,
1393                &namespace_node.mount,
1394                permission_flags,
1395                CheckAccessReason::InternalPermissionChecks,
1396                namespace_node,
1397            )?;
1398        }
1399
1400        let (mode, rdev) = {
1401            // Don't hold the info lock while calling into open_device or self.ops().
1402            // TODO: The mode and rdev are immutable and shouldn't require a lock to read.
1403            let info = self.info();
1404            (info.mode, info.rdev)
1405        };
1406
1407        match mode & FileMode::IFMT {
1408            FileMode::IFCHR => {
1409                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1410                    return error!(EACCES);
1411                }
1412                current_task.kernel().open_device(
1413                    locked,
1414                    current_task,
1415                    namespace_node,
1416                    flags,
1417                    rdev,
1418                    DeviceMode::Char,
1419                )
1420            }
1421            FileMode::IFBLK => {
1422                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1423                    return error!(EACCES);
1424                }
1425                current_task.kernel().open_device(
1426                    locked,
1427                    current_task,
1428                    namespace_node,
1429                    flags,
1430                    rdev,
1431                    DeviceMode::Block,
1432                )
1433            }
1434            FileMode::IFIFO => Pipe::open(locked, current_task, self.fifo(current_task), flags),
1435            // UNIX domain sockets can't be opened.
1436            FileMode::IFSOCK => error!(ENXIO),
1437            _ => self.create_file_ops(locked, current_task, flags),
1438        }
1439    }
1440
    /// Looks up the child called `name` in this node.
    ///
    /// The caller must hold `Access::EXEC` on this node; the lookup itself is then delegated to
    /// the node's `FsNodeOps`.
    // NOTE(review): no `#[track_caller]` attribute is visible on this function, so
    // `Location::caller()` below reports a location inside this function rather than the
    // caller's — confirm that this is intended.
    pub fn lookup<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        mount: &MountInfo,
        name: &FsStr,
    ) -> Result<FsNodeHandle, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.check_access(
            locked,
            current_task,
            mount,
            Access::EXEC,
            CheckAccessReason::InternalPermissionChecks,
            &[Auditable::Name(name), std::panic::Location::caller().into()],
        )?;
        let locked = locked.cast_locked::<FileOpsCore>();
        self.ops().lookup(locked, self, current_task, name)
    }
1462
    /// Creates a child called `name` in this directory, with the node type given by `mode`.
    ///
    /// Directories are created through `FsNodeOps::mkdir` and every other type through
    /// `FsNodeOps::mknod`. Before the node is created the caller must pass the write-access
    /// check on this directory and the security hook matching the node type. `mode` and `owner`
    /// are adjusted by `update_metadata_for_child`. After creation the new node's security
    /// state is initialised.
    ///
    /// # Panics
    ///
    /// Panics if `mode` contains no file-type bits.
    pub fn create_node<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        mount: &MountInfo,
        name: &FsStr,
        mut mode: FileMode,
        dev: DeviceType,
        mut owner: FsCred,
    ) -> Result<FsNodeHandle, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        assert!(mode & FileMode::IFMT != FileMode::EMPTY, "mknod called without node type.");
        self.check_access(
            locked,
            current_task,
            mount,
            Access::WRITE,
            CheckAccessReason::InternalPermissionChecks,
            security::Auditable::Name(name),
        )?;
        if mode.is_reg() {
            security::check_fs_node_create_access(current_task, self, mode, name)?;
        } else if mode.is_dir() {
            // Even though the man page for mknod(2) says that mknod "cannot be used to create
            // directories" in starnix the mkdir syscall (`sys_mkdirat`) ends up calling
            // create_node.
            security::check_fs_node_mkdir_access(current_task, self, mode, name)?;
        // NOTE(review): regular files and directories are handled above, so the condition below
        // is only true for formats other than IFCHR/IFBLK/IFIFO/IFSOCK. As written, the mknod
        // security hook is skipped for device, FIFO and socket nodes — confirm that the
        // negation is intended.
        } else if !matches!(
            mode.fmt(),
            FileMode::IFCHR | FileMode::IFBLK | FileMode::IFIFO | FileMode::IFSOCK
        ) {
            security::check_fs_node_mknod_access(current_task, self, mode, name, dev)?;
        }

        self.update_metadata_for_child(current_task, &mut mode, &mut owner);

        let new_node = if mode.is_dir() {
            let locked = locked.cast_locked::<FileOpsCore>();
            self.ops().mkdir(locked, self, current_task, name, mode, owner)?
        } else {
            // https://man7.org/linux/man-pages/man2/mknod.2.html says on error EPERM:
            //
            //   mode requested creation of something other than a regular
            //   file, FIFO (named pipe), or UNIX domain socket, and the
            //   caller is not privileged (Linux: does not have the
            //   CAP_MKNOD capability); also returned if the filesystem
            //   containing pathname does not support the type of node
            //   requested.
            if !matches!(mode.fmt(), FileMode::IFREG | FileMode::IFIFO | FileMode::IFSOCK) {
                security::check_task_capable(current_task, CAP_MKNOD)?;
            }
            let locked = locked.cast_locked::<FileOpsCore>();
            self.ops().mknod(locked, self, current_task, name, mode, dev, owner)?
        };

        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;

        Ok(new_node)
    }
1524
    /// Creates a symbolic link called `name` in this directory that points at `target`.
    ///
    /// The caller must pass the write-access check on this directory and the symlink security
    /// hook. The node is created by `FsNodeOps::create_symlink`, after which its security state
    /// is initialised.
    pub fn create_symlink<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        mount: &MountInfo,
        name: &FsStr,
        target: &FsStr,
        owner: FsCred,
    ) -> Result<FsNodeHandle, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.check_access(
            locked,
            current_task,
            mount,
            Access::WRITE,
            CheckAccessReason::InternalPermissionChecks,
            security::Auditable::Name(name),
        )?;
        security::check_fs_node_symlink_access(current_task, self, name, target)?;

        let locked = locked.cast_locked::<FileOpsCore>();
        let new_node =
            self.ops().create_symlink(locked, self, current_task, name, target, owner)?;

        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;

        Ok(new_node)
    }
1555
1556    /// Requests that the LSM initialise a security label for the `new_node`, and optionally provide
1557    /// an extended attribute to write to the file to persist it.  If no LSM is enabled, no extended
1558    /// attribute returned, or if the filesystem does not support extended attributes, then the call
1559    /// returns success. All other failure modes return an `Errno` that should be early-returned.
1560    fn init_new_node_security_on_create<L>(
1561        &self,
1562        locked: &mut Locked<L>,
1563        current_task: &CurrentTask,
1564        new_node: &FsNode,
1565        name: &FsStr,
1566    ) -> Result<(), Errno>
1567    where
1568        L: LockEqualOrBefore<FileOpsCore>,
1569    {
1570        let locked = locked.cast_locked::<FileOpsCore>();
1571        security::fs_node_init_on_create(current_task, &new_node, self, name)?
1572            .map(|xattr| {
1573                match new_node.ops().set_xattr(
1574                    locked,
1575                    &new_node,
1576                    current_task,
1577                    xattr.name,
1578                    xattr.value.as_slice().into(),
1579                    XattrOp::Create,
1580                ) {
1581                    Err(e) => {
1582                        if e.code == ENOTSUP {
1583                            // This should only occur if a task has an "fscreate" context set, and
1584                            // creates a new file in a filesystem that does not support xattrs.
1585                            Ok(())
1586                        } else {
1587                            Err(e)
1588                        }
1589                    }
1590                    result => result,
1591                }
1592            })
1593            .unwrap_or_else(|| Ok(()))
1594    }
1595
    /// Creates an unnamed temporary file in this directory.
    ///
    /// The caller must pass the write-access check on this directory. `mode` and `owner` are
    /// adjusted by `update_metadata_for_child` before the node is created through
    /// `FsNodeOps::create_tmpfile`. The node's security state is then initialised with an empty
    /// name.
    ///
    /// When `link_behavior` is `FsNodeLinkBehavior::Disallowed`, that is recorded in the node's
    /// rare data; `link` refuses such nodes with `ENOENT`.
    pub fn create_tmpfile<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        mount: &MountInfo,
        mut mode: FileMode,
        mut owner: FsCred,
        link_behavior: FsNodeLinkBehavior,
    ) -> Result<FsNodeHandle, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.check_access(
            locked,
            current_task,
            mount,
            Access::WRITE,
            CheckAccessReason::InternalPermissionChecks,
            security::Auditable::Location(std::panic::Location::caller()),
        )?;
        self.update_metadata_for_child(current_task, &mut mode, &mut owner);
        let node = self.ops().create_tmpfile(self, current_task, mode, owner)?;
        self.init_new_node_security_on_create(locked, current_task, &node, "".into())?;
        if link_behavior == FsNodeLinkBehavior::Disallowed {
            // The `unwrap` panics if `set` fails on the freshly created node.
            node.ensure_rare_data().link_behavior.set(link_behavior).unwrap();
        }
        Ok(node)
    }
1624
1625    // This method does not attempt to update the atime of the node.
1626    // Use `NamespaceNode::readlink` which checks the mount flags and updates the atime accordingly.
1627    pub fn readlink<L>(
1628        &self,
1629        locked: &mut Locked<L>,
1630        current_task: &CurrentTask,
1631    ) -> Result<SymlinkTarget, Errno>
1632    where
1633        L: LockEqualOrBefore<FileOpsCore>,
1634    {
1635        // TODO: 378864856 - Is there a permission check here other than security checks?
1636        security::check_fs_node_read_link_access(current_task, self)?;
1637        self.ops().readlink(locked.cast_locked::<FileOpsCore>(), self, current_task)
1638    }
1639
1640    pub fn link<L>(
1641        &self,
1642        locked: &mut Locked<L>,
1643        current_task: &CurrentTask,
1644        mount: &MountInfo,
1645        name: &FsStr,
1646        child: &FsNodeHandle,
1647    ) -> Result<FsNodeHandle, Errno>
1648    where
1649        L: LockEqualOrBefore<FileOpsCore>,
1650    {
1651        self.check_access(
1652            locked,
1653            current_task,
1654            mount,
1655            Access::WRITE,
1656            CheckAccessReason::InternalPermissionChecks,
1657            security::Auditable::Location(std::panic::Location::caller()),
1658        )?;
1659
1660        if child.is_dir() {
1661            return error!(EPERM);
1662        }
1663
1664        if let Some(child_rare_data) = child.rare_data.get() {
1665            if matches!(child_rare_data.link_behavior.get(), Some(FsNodeLinkBehavior::Disallowed)) {
1666                return error!(ENOENT);
1667            }
1668        }
1669
1670        // Check that `current_task` has permission to create the hard link.
1671        //
1672        // See description of /proc/sys/fs/protected_hardlinks in
1673        // https://man7.org/linux/man-pages/man5/proc.5.html for details of the security
1674        // vulnerabilities.
1675        //
1676        let (child_uid, mode) = {
1677            let info = child.info();
1678            (info.uid, info.mode)
1679        };
1680        // Check that the the filesystem UID of the calling process (`current_task`) is the same as
1681        // the UID of the existing file. The check can be bypassed if the calling process has
1682        // `CAP_FOWNER` capability.
1683        if child_uid != current_task.current_creds().fsuid
1684            && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
1685        {
1686            // If current_task is not the user of the existing file, it needs to have read and write
1687            // access to the existing file.
1688            child
1689                .check_access(
1690                    locked,
1691                    current_task,
1692                    mount,
1693                    Access::READ | Access::WRITE,
1694                    CheckAccessReason::InternalPermissionChecks,
1695                    security::Auditable::Name(name),
1696                )
1697                .map_err(|e| {
1698                    // `check_access(..)` returns EACCES when the access rights doesn't match - change
1699                    // it to EPERM to match Linux standards.
1700                    if e == EACCES { errno!(EPERM) } else { e }
1701                })?;
1702            // There are also security issues that may arise when users link to setuid, setgid, or
1703            // special files.
1704            if mode.contains(FileMode::ISGID | FileMode::IXGRP) {
1705                return error!(EPERM);
1706            };
1707            if mode.contains(FileMode::ISUID) {
1708                return error!(EPERM);
1709            };
1710            if !mode.contains(FileMode::IFREG) {
1711                return error!(EPERM);
1712            };
1713        }
1714
1715        security::check_fs_node_link_access(current_task, self, child)?;
1716
1717        let locked = locked.cast_locked::<FileOpsCore>();
1718        self.ops().link(locked, self, current_task, name, child)?;
1719        Ok(child.clone())
1720    }
1721
1722    pub fn unlink<L>(
1723        &self,
1724        locked: &mut Locked<L>,
1725        current_task: &CurrentTask,
1726        mount: &MountInfo,
1727        name: &FsStr,
1728        child: &FsNodeHandle,
1729    ) -> Result<(), Errno>
1730    where
1731        L: LockEqualOrBefore<FileOpsCore>,
1732    {
1733        // The user must be able to search and write to the directory.
1734        self.check_access(
1735            locked,
1736            current_task,
1737            mount,
1738            Access::EXEC | Access::WRITE,
1739            CheckAccessReason::InternalPermissionChecks,
1740            security::Auditable::Name(name),
1741        )?;
1742        self.check_sticky_bit(current_task, child)?;
1743        if child.is_dir() {
1744            security::check_fs_node_rmdir_access(current_task, self, child, name)?;
1745        } else {
1746            security::check_fs_node_unlink_access(current_task, self, child, name)?;
1747        }
1748        let locked = locked.cast_locked::<FileOpsCore>();
1749        self.ops().unlink(locked, self, current_task, name, child)?;
1750        self.update_ctime_mtime();
1751        Ok(())
1752    }
1753
1754    pub fn truncate<L>(
1755        &self,
1756        locked: &mut Locked<L>,
1757        current_task: &CurrentTask,
1758        mount: &MountInfo,
1759        length: u64,
1760    ) -> Result<(), Errno>
1761    where
1762        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1763    {
1764        self.truncate_with_strategy(locked, RealAppendLockStrategy {}, current_task, mount, length)
1765    }
1766
1767    pub fn truncate_with_strategy<L, M>(
1768        &self,
1769        locked: &mut Locked<L>,
1770        strategy: impl AppendLockStrategy<M>,
1771        current_task: &CurrentTask,
1772        mount: &MountInfo,
1773        length: u64,
1774    ) -> Result<(), Errno>
1775    where
1776        M: LockEqualOrBefore<FileOpsCore>,
1777        L: LockEqualOrBefore<M>,
1778    {
1779        if self.is_dir() {
1780            return error!(EISDIR);
1781        }
1782
1783        {
1784            let locked = locked.cast_locked::<M>();
1785            self.check_access(
1786                locked,
1787                current_task,
1788                mount,
1789                Access::WRITE,
1790                CheckAccessReason::InternalPermissionChecks,
1791                security::Auditable::Location(std::panic::Location::caller()),
1792            )?;
1793        }
1794
1795        self.truncate_common(locked, strategy, current_task, length)
1796    }
1797
1798    /// Avoid calling this method directly. You probably want to call `FileObject::ftruncate()`
1799    /// which will also perform all file-descriptor based verifications.
1800    pub fn ftruncate<L>(
1801        &self,
1802        locked: &mut Locked<L>,
1803        current_task: &CurrentTask,
1804        length: u64,
1805    ) -> Result<(), Errno>
1806    where
1807        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1808    {
1809        if self.is_dir() {
1810            // When truncating a file descriptor, if the descriptor references a directory,
1811            // return EINVAL. This is different from the truncate() syscall which returns EISDIR.
1812            //
1813            // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#ERRORS
1814            return error!(EINVAL);
1815        }
1816
1817        // For ftruncate, we do not need to check that the file node is writable.
1818        //
1819        // The file object that calls this method must verify that the file was opened
1820        // with write permissions.
1821        //
1822        // This matters because a file could be opened with O_CREAT + O_RDWR + 0444 mode.
1823        // The file descriptor returned from such an operation can be truncated, even
1824        // though the file was created with a read-only mode.
1825        //
1826        // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#DESCRIPTION
1827        // which says:
1828        //
1829        // "With ftruncate(), the file must be open for writing; with truncate(),
1830        // the file must be writable."
1831
1832        self.truncate_common(locked, RealAppendLockStrategy {}, current_task, length)
1833    }
1834
1835    // Called by `truncate` and `ftruncate` above.
1836    fn truncate_common<L, M>(
1837        &self,
1838        locked: &mut Locked<L>,
1839        strategy: impl AppendLockStrategy<M>,
1840        current_task: &CurrentTask,
1841        length: u64,
1842    ) -> Result<(), Errno>
1843    where
1844        M: LockEqualOrBefore<FileOpsCore>,
1845        L: LockEqualOrBefore<M>,
1846    {
1847        if length > MAX_LFS_FILESIZE as u64 {
1848            return error!(EINVAL);
1849        }
1850        {
1851            let locked = locked.cast_locked::<M>().cast_locked::<FileOpsCore>();
1852            if length > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1853                send_standard_signal(locked, current_task, SignalInfo::kernel(SIGXFSZ));
1854                return error!(EFBIG);
1855            }
1856        }
1857        let locked = locked.cast_locked::<M>();
1858        self.clear_suid_and_sgid_bits(locked, current_task)?;
1859        // We have to take the append lock since otherwise it would be possible to truncate and for
1860        // an append to continue using the old size.
1861        let (guard, locked) = strategy.lock(locked, current_task, self)?;
1862        self.ops().truncate(locked, &guard, self, current_task, length)?;
1863        self.update_ctime_mtime();
1864        Ok(())
1865    }
1866
1867    /// Avoid calling this method directly. You probably want to call `FileObject::fallocate()`
1868    /// which will also perform additional verifications.
1869    pub fn fallocate<L>(
1870        &self,
1871        locked: &mut Locked<L>,
1872        current_task: &CurrentTask,
1873        mode: FallocMode,
1874        offset: u64,
1875        length: u64,
1876    ) -> Result<(), Errno>
1877    where
1878        L: LockBefore<BeforeFsNodeAppend>,
1879    {
1880        self.fallocate_with_strategy(
1881            locked,
1882            RealAppendLockStrategy {},
1883            current_task,
1884            mode,
1885            offset,
1886            length,
1887        )
1888    }
1889
1890    pub fn fallocate_with_strategy<L, M>(
1891        &self,
1892        locked: &mut Locked<L>,
1893        strategy: impl AppendLockStrategy<M>,
1894        current_task: &CurrentTask,
1895        mode: FallocMode,
1896        offset: u64,
1897        length: u64,
1898    ) -> Result<(), Errno>
1899    where
1900        M: LockEqualOrBefore<FileOpsCore>,
1901        L: LockEqualOrBefore<M>,
1902    {
1903        let allocate_size = checked_add_offset_and_length(offset as usize, length as usize)
1904            .map_err(|_| errno!(EFBIG))? as u64;
1905        {
1906            let locked = locked.cast_locked::<M>().cast_locked::<FileOpsCore>();
1907            if allocate_size > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1908                send_standard_signal(locked, current_task, SignalInfo::kernel(SIGXFSZ));
1909                return error!(EFBIG);
1910            }
1911        }
1912
1913        let locked = locked.cast_locked::<M>();
1914        self.clear_suid_and_sgid_bits(locked, current_task)?;
1915        let (guard, locked) = strategy.lock(locked, current_task, self)?;
1916        self.ops().allocate(locked, &guard, self, current_task, mode, offset, length)?;
1917        self.update_ctime_mtime();
1918        Ok(())
1919    }
1920
1921    fn update_metadata_for_child(
1922        &self,
1923        current_task: &CurrentTask,
1924        mode: &mut FileMode,
1925        owner: &mut FsCred,
1926    ) {
1927        // The setgid bit on a directory causes the gid to be inherited by new children and the
1928        // setgid bit to be inherited by new child directories. See SetgidDirTest in gvisor.
1929        {
1930            let self_info = self.info();
1931            if self_info.mode.contains(FileMode::ISGID) {
1932                owner.gid = self_info.gid;
1933                if mode.is_dir() {
1934                    *mode |= FileMode::ISGID;
1935                }
1936            }
1937        }
1938
1939        if !mode.is_dir() {
1940            // https://man7.org/linux/man-pages/man7/inode.7.html says:
1941            //
1942            //   For an executable file, the set-group-ID bit causes the
1943            //   effective group ID of a process that executes the file to change
1944            //   as described in execve(2).
1945            //
1946            // We need to check whether the current task has permission to create such a file.
1947            // See a similar check in `FsNode::chmod`.
1948            let current_creds = current_task.current_creds();
1949            if owner.gid != current_creds.fsgid
1950                && !current_creds.is_in_group(owner.gid)
1951                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
1952            {
1953                *mode &= !FileMode::ISGID;
1954            }
1955        }
1956    }
1957
1958    /// Checks if O_NOATIME is allowed,
1959    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
1960        // Per open(2),
1961        //
1962        //   O_NOATIME (since Linux 2.6.8)
1963        //      ...
1964        //
1965        //      This flag can be employed only if one of the following
1966        //      conditions is true:
1967        //
1968        //      *  The effective UID of the process matches the owner UID
1969        //         of the file.
1970        //
1971        //      *  The calling process has the CAP_FOWNER capability in
1972        //         its user namespace and the owner UID of the file has a
1973        //         mapping in the namespace.
1974        if current_task.current_creds().fsuid != self.info().uid {
1975            security::check_task_capable(current_task, CAP_FOWNER)?;
1976        }
1977        Ok(())
1978    }
1979
1980    pub fn default_check_access_impl(
1981        &self,
1982        current_task: &CurrentTask,
1983        permission_flags: security::PermissionFlags,
1984        reason: CheckAccessReason,
1985        info: RwLockReadGuard<'_, FsNodeInfo>,
1986        audit_context: Auditable<'_>,
1987    ) -> Result<(), Errno> {
1988        let (node_uid, node_gid, mode) = (info.uid, info.gid, info.mode);
1989        std::mem::drop(info);
1990        if let CheckAccessReason::ChangeTimestamps { now } = reason {
1991            // To set the timestamps to the current time the caller must either have write access to
1992            // the file, be the file owner, or hold the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
1993            // To set the timestamps to other values the caller must either be the file owner or hold
1994            // the CAP_FOWNER capability.
1995            if current_task.current_creds().fsuid == node_uid {
1996                return Ok(());
1997            }
1998            if now {
1999                if security::is_task_capable_noaudit(current_task, CAP_FOWNER) {
2000                    return Ok(());
2001                }
2002            } else {
2003                security::check_task_capable(current_task, CAP_FOWNER)?;
2004                return Ok(());
2005            }
2006        }
2007        check_access(self, current_task, permission_flags, node_uid, node_gid, mode)?;
2008        security::fs_node_permission(current_task, self, permission_flags, audit_context)
2009    }
2010
2011    /// Check whether the node can be accessed in the current context with the specified access
2012    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
2013    /// owner or is in the file's group.
2014    pub fn check_access<'a, L>(
2015        &self,
2016        locked: &mut Locked<L>,
2017        current_task: &CurrentTask,
2018        mount: &MountInfo,
2019        access: impl Into<security::PermissionFlags>,
2020        reason: CheckAccessReason,
2021        audit_context: impl Into<security::Auditable<'a>>,
2022    ) -> Result<(), Errno>
2023    where
2024        L: LockEqualOrBefore<FileOpsCore>,
2025    {
2026        let mut permission_flags = access.into();
2027        if permission_flags.contains(security::PermissionFlags::WRITE) {
2028            mount.check_readonly_filesystem()?;
2029        }
2030        if permission_flags.contains(security::PermissionFlags::EXEC) && !self.is_dir() {
2031            mount.check_noexec_filesystem()?;
2032        }
2033        if reason == CheckAccessReason::Access {
2034            permission_flags |= PermissionFlags::ACCESS;
2035        }
2036        self.ops().check_access(
2037            locked.cast_locked::<FileOpsCore>(),
2038            self,
2039            current_task,
2040            permission_flags,
2041            &self.info,
2042            reason,
2043            audit_context.into(),
2044        )
2045    }
2046
2047    /// Check whether the stick bit, `S_ISVTX`, forbids the `current_task` from removing the given
2048    /// `child`. If this node has `S_ISVTX`, then either the child must be owned by the `fsuid` of
2049    /// `current_task` or `current_task` must have `CAP_FOWNER`.
2050    pub fn check_sticky_bit(
2051        &self,
2052        current_task: &CurrentTask,
2053        child: &FsNodeHandle,
2054    ) -> Result<(), Errno> {
2055        if self.info().mode.contains(FileMode::ISVTX)
2056            && child.info().uid != current_task.current_creds().fsuid
2057        {
2058            security::check_task_capable(current_task, CAP_FOWNER)?;
2059        }
2060        Ok(())
2061    }
2062
2063    pub fn fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
2064        assert!(self.is_fifo());
2065        self.ensure_rare_data().ensure_fifo(current_task)
2066    }
2067
2068    /// Returns the UNIX domain socket bound to this node, if any.
2069    pub fn bound_socket(&self) -> Option<&SocketHandle> {
2070        if let Some(rare_data) = self.rare_data.get() { rare_data.bound_socket.get() } else { None }
2071    }
2072
2073    /// Register the provided socket as the UNIX domain socket bound to this node.
2074    ///
2075    /// It is a fatal error to call this method again if it has already been called on this node.
2076    pub fn set_bound_socket(&self, socket: SocketHandle) {
2077        assert!(self.ensure_rare_data().bound_socket.set(socket).is_ok());
2078    }
2079
2080    pub fn update_attributes<L, F>(
2081        &self,
2082        locked: &mut Locked<L>,
2083        current_task: &CurrentTask,
2084        mutator: F,
2085    ) -> Result<(), Errno>
2086    where
2087        L: LockEqualOrBefore<FileOpsCore>,
2088        F: FnOnce(&mut FsNodeInfo) -> Result<(), Errno>,
2089    {
2090        let mut info = self.info.write();
2091        let mut new_info = info.clone();
2092        mutator(&mut new_info)?;
2093
2094        let new_access = new_info.mode.user_access()
2095            | new_info.mode.group_access()
2096            | new_info.mode.other_access();
2097
2098        if new_access.intersects(Access::EXEC) {
2099            let write_guard_state = self.write_guard_state.lock();
2100            if let Ok(seals) = write_guard_state.get_seals() {
2101                if seals.contains(SealFlags::NO_EXEC) {
2102                    return error!(EPERM);
2103                }
2104            }
2105        }
2106
2107        // `mutator`s should not update the attribute change time, which is managed by this API.
2108        assert_eq!(info.time_status_change, new_info.time_status_change);
2109        if *info == new_info {
2110            return Ok(());
2111        }
2112        new_info.time_status_change = utc::utc_now();
2113
2114        let mut has = zxio_node_attr_has_t { ..Default::default() };
2115        has.modification_time = info.time_modify != new_info.time_modify;
2116        has.access_time = info.time_access != new_info.time_access;
2117        has.mode = info.mode != new_info.mode;
2118        has.uid = info.uid != new_info.uid;
2119        has.gid = info.gid != new_info.gid;
2120        has.rdev = info.rdev != new_info.rdev;
2121        has.casefold = info.casefold != new_info.casefold;
2122        has.wrapping_key_id = info.wrapping_key_id != new_info.wrapping_key_id;
2123
2124        security::check_fs_node_setattr_access(current_task, &self, &has)?;
2125
2126        // Call `update_attributes(..)` to persist the changes for the following fields.
2127        if has.modification_time
2128            || has.access_time
2129            || has.mode
2130            || has.uid
2131            || has.gid
2132            || has.rdev
2133            || has.casefold
2134            || has.wrapping_key_id
2135        {
2136            let locked = locked.cast_locked::<FileOpsCore>();
2137            self.ops().update_attributes(locked, self, current_task, &new_info, has)?;
2138        }
2139
2140        *info = new_info;
2141        Ok(())
2142    }
2143
2144    /// Set the permissions on this FsNode to the given values.
2145    ///
2146    /// Does not change the IFMT of the node.
2147    pub fn chmod<L>(
2148        &self,
2149        locked: &mut Locked<L>,
2150        current_task: &CurrentTask,
2151        mount: &MountInfo,
2152        mut mode: FileMode,
2153    ) -> Result<(), Errno>
2154    where
2155        L: LockEqualOrBefore<FileOpsCore>,
2156    {
2157        mount.check_readonly_filesystem()?;
2158        self.update_attributes(locked, current_task, |info| {
2159            let current_creds = current_task.current_creds();
2160            if info.uid != current_creds.euid {
2161                security::check_task_capable(current_task, CAP_FOWNER)?;
2162            } else if info.gid != current_creds.egid
2163                && !current_creds.is_in_group(info.gid)
2164                && mode.intersects(FileMode::ISGID)
2165                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
2166            {
2167                mode &= !FileMode::ISGID;
2168            }
2169            info.chmod(mode);
2170            Ok(())
2171        })
2172    }
2173
2174    /// Sets the owner and/or group on this FsNode.
2175    pub fn chown<L>(
2176        &self,
2177        locked: &mut Locked<L>,
2178        current_task: &CurrentTask,
2179        mount: &MountInfo,
2180        owner: Option<uid_t>,
2181        group: Option<gid_t>,
2182    ) -> Result<(), Errno>
2183    where
2184        L: LockEqualOrBefore<FileOpsCore>,
2185    {
2186        mount.check_readonly_filesystem()?;
2187        self.update_attributes(locked, current_task, |info| {
2188            if security::is_task_capable_noaudit(current_task, CAP_CHOWN) {
2189                info.chown(owner, group);
2190                return Ok(());
2191            }
2192
2193            // Nobody can change the owner.
2194            if let Some(uid) = owner {
2195                if info.uid != uid {
2196                    return error!(EPERM);
2197                }
2198            }
2199
2200            let (euid, is_in_group) = {
2201                let current_creds = current_task.current_creds();
2202                (current_creds.euid, group.map(|gid| current_creds.is_in_group(gid)))
2203            };
2204
2205            // The owner can change the group.
2206            if info.uid == euid {
2207                // To a group that it belongs.
2208                if let Some(is_in_group) = is_in_group {
2209                    if !is_in_group {
2210                        return error!(EPERM);
2211                    }
2212                }
2213                info.chown(None, group);
2214                return Ok(());
2215            }
2216
2217            // Any other user can call chown(file, -1, -1)
2218            if owner.is_some() || group.is_some() {
2219                return error!(EPERM);
2220            }
2221
2222            // But not on set-user-ID or set-group-ID files.
2223            // If we were to chown them, they would drop the set-ID bit.
2224            if info.mode.is_reg()
2225                && (info.mode.contains(FileMode::ISUID)
2226                    || info.mode.contains(FileMode::ISGID | FileMode::IXGRP))
2227            {
2228                return error!(EPERM);
2229            }
2230
2231            info.chown(None, None);
2232            Ok(())
2233        })
2234    }
2235
2236    /// Forcefully change the owner and group of this node.
2237    ///
2238    /// # Safety
2239    ///
2240    /// This function skips all the security checks and just updates the owner and group. Also, does
2241    /// not check if the filesystem is read-only and does not update the attribute change time.
2242    ///
2243    /// This function is used to set the owner and group of /proc/pid to the credentials of the
2244    /// current task. Please consider carefully whether you want to use this function for another
2245    /// purpose.
2246    pub unsafe fn force_chown(&self, creds: FsCred) {
2247        self.update_info(|info| {
2248            info.chown(Some(creds.uid), Some(creds.gid));
2249        });
2250    }
2251
2252    /// Whether this node is a regular file.
2253    pub fn is_reg(&self) -> bool {
2254        self.info().mode.is_reg()
2255    }
2256
2257    /// Whether this node is a directory.
2258    pub fn is_dir(&self) -> bool {
2259        self.info().mode.is_dir()
2260    }
2261
2262    /// Whether this node is a socket.
2263    pub fn is_sock(&self) -> bool {
2264        self.info().mode.is_sock()
2265    }
2266
2267    /// Whether this node is a FIFO.
2268    pub fn is_fifo(&self) -> bool {
2269        self.info().mode.is_fifo()
2270    }
2271
2272    /// Whether this node is a symbolic link.
2273    pub fn is_lnk(&self) -> bool {
2274        self.info().mode.is_lnk()
2275    }
2276
2277    pub fn dev(&self) -> DeviceType {
2278        self.fs().dev_id
2279    }
2280
2281    pub fn stat<L>(
2282        &self,
2283        locked: &mut Locked<L>,
2284        current_task: &CurrentTask,
2285    ) -> Result<uapi::stat, Errno>
2286    where
2287        L: LockEqualOrBefore<FileOpsCore>,
2288    {
2289        security::check_fs_node_getattr_access(current_task, self)?;
2290
2291        let info = self.fetch_and_refresh_info(locked, current_task)?;
2292
2293        let time_to_kernel_timespec_pair = |t| {
2294            let timespec { tv_sec, tv_nsec } = timespec_from_time(t);
2295            let time = tv_sec.try_into().map_err(|_| errno!(EINVAL))?;
2296            let time_nsec = tv_nsec.try_into().map_err(|_| errno!(EINVAL))?;
2297            Ok((time, time_nsec))
2298        };
2299
2300        let (st_atime, st_atime_nsec) = time_to_kernel_timespec_pair(info.time_access)?;
2301        let (st_mtime, st_mtime_nsec) = time_to_kernel_timespec_pair(info.time_modify)?;
2302        let (st_ctime, st_ctime_nsec) = time_to_kernel_timespec_pair(info.time_status_change)?;
2303
2304        Ok(uapi::stat {
2305            st_dev: self.dev().bits(),
2306            st_ino: self.ino,
2307            st_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2308            st_mode: info.mode.bits(),
2309            st_uid: info.uid,
2310            st_gid: info.gid,
2311            st_rdev: info.rdev.bits(),
2312            st_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2313            st_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2314            st_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2315            st_atime,
2316            st_atime_nsec,
2317            st_mtime,
2318            st_mtime_nsec,
2319            st_ctime,
2320            st_ctime_nsec,
2321            ..Default::default()
2322        })
2323    }
2324
2325    // TODO(https://fxbug.dev/454730248): This is probably the wrong way to implement O_APPEND.
2326    pub fn get_size<L>(
2327        &self,
2328        locked: &mut Locked<L>,
2329        current_task: &CurrentTask,
2330    ) -> Result<usize, Errno>
2331    where
2332        L: LockEqualOrBefore<FileOpsCore>,
2333    {
2334        let info = self.fetch_and_refresh_info(locked, current_task)?;
2335        Ok(info.size.try_into().map_err(|_| errno!(EINVAL))?)
2336    }
2337
2338    fn statx_timestamp_from_time(time: UtcInstant) -> statx_timestamp {
2339        let nanos = time.into_nanos();
2340        statx_timestamp {
2341            tv_sec: nanos / NANOS_PER_SECOND,
2342            tv_nsec: (nanos % NANOS_PER_SECOND) as u32,
2343            ..Default::default()
2344        }
2345    }
2346
2347    pub fn statx<L>(
2348        &self,
2349        locked: &mut Locked<L>,
2350        current_task: &CurrentTask,
2351        flags: StatxFlags,
2352        mask: u32,
2353    ) -> Result<statx, Errno>
2354    where
2355        L: LockEqualOrBefore<FileOpsCore>,
2356    {
2357        security::check_fs_node_getattr_access(current_task, self)?;
2358
2359        // Ignore mask for now and fill in all of the fields.
2360        let info = if flags.contains(StatxFlags::AT_STATX_DONT_SYNC) {
2361            self.info()
2362        } else {
2363            self.fetch_and_refresh_info(locked, current_task)?
2364        };
2365        if mask & STATX__RESERVED == STATX__RESERVED {
2366            return error!(EINVAL);
2367        }
2368
2369        track_stub!(TODO("https://fxbug.dev/302594110"), "statx attributes");
2370        let stx_mnt_id = 0;
2371        let mut stx_attributes = 0;
2372        let stx_attributes_mask = STATX_ATTR_VERITY as u64;
2373
2374        if matches!(*self.fsverity.lock(), FsVerityState::FsVerity) {
2375            stx_attributes |= STATX_ATTR_VERITY as u64;
2376        }
2377
2378        Ok(statx {
2379            stx_mask: STATX_NLINK
2380                | STATX_UID
2381                | STATX_GID
2382                | STATX_ATIME
2383                | STATX_MTIME
2384                | STATX_CTIME
2385                | STATX_INO
2386                | STATX_SIZE
2387                | STATX_BLOCKS
2388                | STATX_BASIC_STATS,
2389            stx_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2390            stx_attributes,
2391            stx_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2392            stx_uid: info.uid,
2393            stx_gid: info.gid,
2394            stx_mode: info.mode.bits().try_into().map_err(|_| errno!(EINVAL))?,
2395            stx_ino: self.ino,
2396            stx_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2397            stx_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2398            stx_attributes_mask,
2399            stx_ctime: Self::statx_timestamp_from_time(info.time_status_change),
2400            stx_mtime: Self::statx_timestamp_from_time(info.time_modify),
2401            stx_atime: Self::statx_timestamp_from_time(info.time_access),
2402
2403            stx_rdev_major: info.rdev.major(),
2404            stx_rdev_minor: info.rdev.minor(),
2405
2406            stx_dev_major: self.fs().dev_id.major(),
2407            stx_dev_minor: self.fs().dev_id.minor(),
2408            stx_mnt_id,
2409            ..Default::default()
2410        })
2411    }
2412
2413    /// Checks whether `current_task` has capabilities required for the specified `access` to the
2414    /// extended attribute `name`.
2415    fn check_xattr_access<L>(
2416        &self,
2417        locked: &mut Locked<L>,
2418        current_task: &CurrentTask,
2419        mount: &MountInfo,
2420        name: &FsStr,
2421        access: Access,
2422    ) -> Result<(), Errno>
2423    where
2424        L: LockEqualOrBefore<FileOpsCore>,
2425    {
2426        assert!(access == Access::READ || access == Access::WRITE);
2427
2428        let enodata_if_read =
2429            |e: Errno| if access == Access::READ && e.code == EPERM { errno!(ENODATA) } else { e };
2430
2431        // man xattr(7) describes the different access checks applied to each extended attribute
2432        // namespace.
2433        if name.starts_with(XATTR_USER_PREFIX.to_bytes()) {
2434            {
2435                let info = self.info();
2436                if !info.mode.is_reg() && !info.mode.is_dir() {
2437                    return Err(enodata_if_read(errno!(EPERM)));
2438                }
2439            }
2440
2441            // TODO: https://fxbug.dev/460734830 - Perform capability check(s) if file has sticky
2442            // bit set.
2443
2444            self.check_access(
2445                locked,
2446                current_task,
2447                mount,
2448                access,
2449                CheckAccessReason::InternalPermissionChecks,
2450                security::Auditable::Name(name),
2451            )?;
2452        } else if name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()) {
2453            // Trusted extended attributes require `CAP_SYS_ADMIN` to read or write.
2454            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2455        } else if name.starts_with(XATTR_SYSTEM_PREFIX.to_bytes()) {
2456            // System extended attributes have attribute-specific access policy.
2457            // TODO: https://fxbug.dev/460734830 -  Revise how system extended attributes are
2458            // access-controlled.
2459            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2460        } else if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2461            if access == Access::WRITE {
2462                // Writes require `CAP_SYS_ADMIN`, unless the LSM owning `name` specifies to skip.
2463                if !security::fs_node_xattr_skipcap(current_task, name) {
2464                    security::check_task_capable(current_task, CAP_SYS_ADMIN)
2465                        .map_err(enodata_if_read)?;
2466                }
2467            }
2468        } else {
2469            panic!("Unknown extended attribute prefix: {}", name);
2470        }
2471        Ok(())
2472    }
2473
2474    pub fn get_xattr<L>(
2475        &self,
2476        locked: &mut Locked<L>,
2477        current_task: &CurrentTask,
2478        mount: &MountInfo,
2479        name: &FsStr,
2480        max_size: usize,
2481    ) -> Result<ValueOrSize<FsString>, Errno>
2482    where
2483        L: LockEqualOrBefore<FileOpsCore>,
2484    {
2485        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2486        self.check_xattr_access(locked, current_task, mount, name, Access::READ)?;
2487
2488        // LSM access checks must be performed after discretionary checks.
2489        security::check_fs_node_getxattr_access(current_task, self, name)?;
2490
2491        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2492            // If the attribute is in the security.* domain then allow the LSM to handle the
2493            // request, or to delegate to `FsNodeOps::get_xattr()`.
2494            security::fs_node_getsecurity(locked, current_task, self, name, max_size)
2495        } else {
2496            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2497            self.ops().get_xattr(
2498                locked.cast_locked::<FileOpsCore>(),
2499                self,
2500                current_task,
2501                name,
2502                max_size,
2503            )
2504        }
2505    }
2506
2507    pub fn set_xattr<L>(
2508        &self,
2509        locked: &mut Locked<L>,
2510        current_task: &CurrentTask,
2511        mount: &MountInfo,
2512        name: &FsStr,
2513        value: &FsStr,
2514        op: XattrOp,
2515    ) -> Result<(), Errno>
2516    where
2517        L: LockEqualOrBefore<FileOpsCore>,
2518    {
2519        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2520        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2521
2522        // LSM access checks must be performed after discretionary checks.
2523        security::check_fs_node_setxattr_access(current_task, self, name, value, op)?;
2524
2525        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2526            // If the attribute is in the security.* domain then allow the LSM to handle the
2527            // request, or to delegate to `FsNodeOps::set_xattr()`.
2528            security::fs_node_setsecurity(locked, current_task, self, name, value, op)
2529        } else {
2530            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2531            self.ops().set_xattr(
2532                locked.cast_locked::<FileOpsCore>(),
2533                self,
2534                current_task,
2535                name,
2536                value,
2537                op,
2538            )
2539        }
2540    }
2541
2542    pub fn remove_xattr<L>(
2543        &self,
2544        locked: &mut Locked<L>,
2545        current_task: &CurrentTask,
2546        mount: &MountInfo,
2547        name: &FsStr,
2548    ) -> Result<(), Errno>
2549    where
2550        L: LockEqualOrBefore<FileOpsCore>,
2551    {
2552        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2553        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2554
2555        // LSM access checks must be performed after discretionary checks.
2556        security::check_fs_node_removexattr_access(current_task, self, name)?;
2557        self.ops().remove_xattr(locked.cast_locked::<FileOpsCore>(), self, current_task, name)
2558    }
2559
2560    pub fn list_xattrs<L>(
2561        &self,
2562        locked: &mut Locked<L>,
2563        current_task: &CurrentTask,
2564        max_size: usize,
2565    ) -> Result<ValueOrSize<Vec<FsString>>, Errno>
2566    where
2567        L: LockEqualOrBefore<FileOpsCore>,
2568    {
2569        security::check_fs_node_listxattr_access(current_task, self)?;
2570        Ok(self
2571            .ops()
2572            .list_xattrs(locked.cast_locked::<FileOpsCore>(), self, current_task, max_size)?
2573            .map(|mut v| {
2574                // Extended attributes may be listed even if the caller would not be able to read
2575                // (or modify) the attribute's value.
2576                // trusted.* attributes are only accessible with CAP_SYS_ADMIN and are omitted by
2577                // `listxattr()` unless the caller holds CAP_SYS_ADMIN.
2578                if !security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN) {
2579                    v.retain(|name| !name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()));
2580                }
2581                v
2582            }))
2583    }
2584
2585    /// Returns current `FsNodeInfo`.
2586    pub fn info(&self) -> RwLockReadGuard<'_, FsNodeInfo> {
2587        self.info.read()
2588    }
2589
2590    /// Refreshes the `FsNodeInfo` if necessary and returns a read guard.
2591    pub fn fetch_and_refresh_info<L>(
2592        &self,
2593        locked: &mut Locked<L>,
2594        current_task: &CurrentTask,
2595    ) -> Result<RwLockReadGuard<'_, FsNodeInfo>, Errno>
2596    where
2597        L: LockEqualOrBefore<FileOpsCore>,
2598    {
2599        self.ops().fetch_and_refresh_info(
2600            locked.cast_locked::<FileOpsCore>(),
2601            self,
2602            current_task,
2603            &self.info,
2604        )
2605    }
2606
2607    pub fn update_info<F, T>(&self, mutator: F) -> T
2608    where
2609        F: FnOnce(&mut FsNodeInfo) -> T,
2610    {
2611        let mut info = self.info.write();
2612        mutator(&mut info)
2613    }
2614
2615    /// Clear the SUID and SGID bits unless the `current_task` has `CAP_FSETID`
2616    pub fn clear_suid_and_sgid_bits<L>(
2617        &self,
2618        locked: &mut Locked<L>,
2619        current_task: &CurrentTask,
2620    ) -> Result<(), Errno>
2621    where
2622        L: LockEqualOrBefore<FileOpsCore>,
2623    {
2624        if !security::is_task_capable_noaudit(current_task, CAP_FSETID) {
2625            self.update_attributes(locked, current_task, |info| {
2626                info.clear_suid_and_sgid_bits();
2627                Ok(())
2628            })?;
2629        }
2630        Ok(())
2631    }
2632
2633    /// Update the ctime and mtime of a file to now.
2634    pub fn update_ctime_mtime(&self) {
2635        if self.fs().manages_timestamps() {
2636            return;
2637        }
2638        self.update_info(|info| {
2639            let now = utc::utc_now();
2640            info.time_status_change = now;
2641            info.time_modify = now;
2642        });
2643    }
2644
2645    /// Update the ctime of a file to now.
2646    pub fn update_ctime(&self) {
2647        if self.fs().manages_timestamps() {
2648            return;
2649        }
2650        self.update_info(|info| {
2651            let now = utc::utc_now();
2652            info.time_status_change = now;
2653        });
2654    }
2655
2656    /// Update the atime and mtime if the `current_task` has write access, is the file owner, or
2657    /// holds either the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
2658    pub fn update_atime_mtime<L>(
2659        &self,
2660        locked: &mut Locked<L>,
2661        current_task: &CurrentTask,
2662        mount: &MountInfo,
2663        atime: TimeUpdateType,
2664        mtime: TimeUpdateType,
2665    ) -> Result<(), Errno>
2666    where
2667        L: LockEqualOrBefore<FileOpsCore>,
2668    {
2669        // If the filesystem is read-only, this always fail.
2670        mount.check_readonly_filesystem()?;
2671
2672        let now = matches!((atime, mtime), (TimeUpdateType::Now, TimeUpdateType::Now));
2673        self.check_access(
2674            locked,
2675            current_task,
2676            mount,
2677            Access::WRITE,
2678            CheckAccessReason::ChangeTimestamps { now },
2679            security::Auditable::Location(std::panic::Location::caller()),
2680        )?;
2681
2682        if !matches!((atime, mtime), (TimeUpdateType::Omit, TimeUpdateType::Omit)) {
2683            // This function is called by `utimes(..)` which will update the access and
2684            // modification time. We need to call `update_attributes()` to update the mtime of
2685            // filesystems that manages file timestamps.
2686            self.update_attributes(locked, current_task, |info| {
2687                let now = utc::utc_now();
2688                let get_time = |time: TimeUpdateType| match time {
2689                    TimeUpdateType::Now => Some(now),
2690                    TimeUpdateType::Time(t) => Some(t),
2691                    TimeUpdateType::Omit => None,
2692                };
2693                if let Some(time) = get_time(atime) {
2694                    info.time_access = time;
2695                }
2696                if let Some(time) = get_time(mtime) {
2697                    info.time_modify = time;
2698                }
2699                Ok(())
2700            })?;
2701        }
2702        Ok(())
2703    }
2704
2705    /// Returns a string describing this `FsNode` in the format used by "/proc/../fd" for anonymous
2706    /// file descriptors. By default this is in the form:
2707    ///   <class>:[<node_id>]
2708    /// though `FsNodeOps` may customize this as required.
2709    pub fn internal_name(&self) -> FsString {
2710        if let Some(name) = self.ops().internal_name(self) {
2711            return name;
2712        };
2713        let class = if self.is_sock() {
2714            "socket"
2715        } else if self.is_fifo() {
2716            "pipe"
2717        } else {
2718            "file"
2719        };
2720        format!("{}:[{}]", class, self.ino).into()
2721    }
2722
2723    /// The key used to identify this node in the file system's node cache.
2724    ///
2725    /// For many file systems, this will be the same as the inode number. However, some file
2726    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
2727    pub fn node_key(&self) -> ino_t {
2728        self.ops().node_key(self)
2729    }
2730
2731    fn ensure_rare_data(&self) -> &FsNodeRareData {
2732        self.rare_data.get_or_init(|| Box::new(FsNodeRareData::default()))
2733    }
2734
2735    /// Returns the set of watchers for this node.
2736    ///
2737    /// Only call this function if you require this node to actually store a list of watchers. If
2738    /// you just wish to notify any watchers that might exist, please use `notify` instead.
2739    pub fn ensure_watchers(&self) -> &inotify::InotifyWatchers {
2740        &self.ensure_rare_data().watchers
2741    }
2742
2743    /// Notify the watchers of the given event.
2744    pub fn notify(
2745        &self,
2746        event_mask: InotifyMask,
2747        cookie: u32,
2748        name: &FsStr,
2749        mode: FileMode,
2750        is_dead: bool,
2751    ) {
2752        if let Some(rare_data) = self.rare_data.get() {
2753            rare_data.watchers.notify(event_mask, cookie, name, mode, is_dead);
2754        }
2755    }
2756
2757    /// Calls through to the filesystem to enable fs-verity on this file.
2758    pub fn enable_fsverity<L>(
2759        &self,
2760        locked: &mut Locked<L>,
2761        current_task: &CurrentTask,
2762        descriptor: &fsverity_descriptor,
2763    ) -> Result<(), Errno>
2764    where
2765        L: LockEqualOrBefore<FileOpsCore>,
2766    {
2767        let locked = locked.cast_locked::<FileOpsCore>();
2768        self.ops().enable_fsverity(locked, self, current_task, descriptor)
2769    }
2770}
2771
2772impl std::fmt::Debug for FsNode {
2773    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2774        f.debug_struct("FsNode")
2775            .field("fs", &self.fs().name())
2776            .field("info", &*self.info())
2777            .field("ops_ty", &self.ops().type_name())
2778            .finish()
2779    }
2780}
2781
2782impl Releasable for FsNode {
2783    type Context<'a> = CurrentTaskAndLocked<'a>;
2784
2785    fn release<'a>(self, context: CurrentTaskAndLocked<'a>) {
2786        let (locked, current_task) = context;
2787        if let Some(fs) = self.fs.upgrade() {
2788            fs.remove_node(&self);
2789        }
2790        if let Err(err) = self.ops.forget(
2791            locked.cast_locked::<FileOpsCore>(),
2792            current_task,
2793            self.info.into_inner(),
2794        ) {
2795            log_error!("Error on FsNodeOps::forget: {err:?}");
2796        }
2797    }
2798}
2799
2800fn check_access(
2801    fs_node: &FsNode,
2802    current_task: &CurrentTask,
2803    permission_flags: security::PermissionFlags,
2804    node_uid: uid_t,
2805    node_gid: gid_t,
2806    mode: FileMode,
2807) -> Result<(), Errno> {
2808    // Determine which of the access bits apply to the `current_task`.
2809    let (fsuid, is_in_group) = {
2810        let current_creds = current_task.current_creds();
2811        (current_creds.fsuid, current_creds.is_in_group(node_gid))
2812    };
2813    let granted = if fsuid == node_uid {
2814        mode.user_access()
2815    } else if is_in_group {
2816        mode.group_access()
2817    } else {
2818        mode.other_access()
2819    };
2820
2821    let access = permission_flags.as_access();
2822    if granted.contains(access) {
2823        return Ok(());
2824    }
2825
2826    // Callers with CAP_DAC_READ_SEARCH override can read files & directories, and traverse
2827    // directories to which they lack permission.
2828    let mut requested = access & !granted;
2829
2830    // If this check was triggered by `access()`, or a variant, then check for a `dontaudit`
2831    // statement for the `audit_access` permission for this caller & file.
2832    let have_dont_audit = OnceBool::new();
2833    let has_capability = move |current_task, capability| {
2834        let dont_audit = have_dont_audit.get_or_init(|| {
2835            permission_flags.contains(PermissionFlags::ACCESS)
2836                && security::has_dontaudit_access(current_task, fs_node)
2837        });
2838        if dont_audit {
2839            security::is_task_capable_noaudit(current_task, capability)
2840        } else {
2841            security::check_task_capable(current_task, capability).is_ok()
2842        }
2843    };
2844
2845    // CAP_DAC_READ_SEARCH allows bypass of read checks, and directory traverse (eXecute) checks.
2846    let dac_read_search_access =
2847        if mode.is_dir() { Access::READ | Access::EXEC } else { Access::READ };
2848    if dac_read_search_access.intersects(requested)
2849        && has_capability(current_task, CAP_DAC_READ_SEARCH)
2850    {
2851        requested.remove(dac_read_search_access);
2852    }
2853    if requested.is_empty() {
2854        return Ok(());
2855    }
2856
2857    // CAP_DAC_OVERRIDE allows bypass of all checks (though see the comment for file-execute).
2858    let mut dac_override_access = Access::READ | Access::WRITE;
2859    dac_override_access |= if mode.is_dir() {
2860        Access::EXEC
2861    } else {
2862        // File execute access checks may not be bypassed unless at least one executable bit is set.
2863        (mode.user_access() | mode.group_access() | mode.other_access()) & Access::EXEC
2864    };
2865    if dac_override_access.intersects(requested) && has_capability(current_task, CAP_DAC_OVERRIDE) {
2866        requested.remove(dac_override_access);
2867    }
2868    if requested.is_empty() {
2869        return Ok(());
2870    }
2871
2872    return error!(EACCES);
2873}
2874
2875#[cfg(test)]
2876mod tests {
2877    use super::*;
2878    use crate::device::mem::mem_device_init;
2879    use crate::testing::*;
2880    use crate::vfs::buffers::VecOutputBuffer;
2881    use starnix_uapi::auth::Credentials;
2882    use starnix_uapi::file_mode::mode;
2883
2884    #[::fuchsia::test]
2885    async fn open_device_file() {
2886        spawn_kernel_and_run(async |locked, current_task| {
2887            mem_device_init(locked, &*current_task).expect("mem_device_init");
2888
2889            // Create a device file that points to the `zero` device (which is automatically
2890            // registered in the kernel).
2891            current_task
2892                .fs()
2893                .root()
2894                .create_node(
2895                    locked,
2896                    &current_task,
2897                    "zero".into(),
2898                    mode!(IFCHR, 0o666),
2899                    DeviceType::ZERO,
2900                )
2901                .expect("create_node");
2902
2903            const CONTENT_LEN: usize = 10;
2904            let mut buffer = VecOutputBuffer::new(CONTENT_LEN);
2905
2906            // Read from the zero device.
2907            let device_file = current_task
2908                .open_file(locked, "zero".into(), OpenFlags::RDONLY)
2909                .expect("open device file");
2910            device_file.read(locked, &current_task, &mut buffer).expect("read from zero");
2911
2912            // Assert the contents.
2913            assert_eq!(&[0; CONTENT_LEN], buffer.data());
2914        })
2915        .await;
2916    }
2917
2918    #[::fuchsia::test]
2919    async fn node_info_is_reflected_in_stat() {
2920        spawn_kernel_and_run(async |locked, current_task| {
2921            // Create a node.
2922            let node = &current_task
2923                .fs()
2924                .root()
2925                .create_node(
2926                    locked,
2927                    &current_task,
2928                    "zero".into(),
2929                    FileMode::IFCHR,
2930                    DeviceType::ZERO,
2931                )
2932                .expect("create_node")
2933                .entry
2934                .node;
2935            node.update_info(|info| {
2936                info.mode = FileMode::IFSOCK;
2937                info.size = 1;
2938                info.blocks = 2;
2939                info.blksize = 4;
2940                info.uid = 9;
2941                info.gid = 10;
2942                info.link_count = 11;
2943                info.time_status_change = UtcInstant::from_nanos(1);
2944                info.time_access = UtcInstant::from_nanos(2);
2945                info.time_modify = UtcInstant::from_nanos(3);
2946                info.rdev = DeviceType::new(13, 13);
2947            });
2948            let stat = node.stat(locked, &current_task).expect("stat");
2949
2950            assert_eq!(stat.st_mode, FileMode::IFSOCK.bits());
2951            assert_eq!(stat.st_size, 1);
2952            assert_eq!(stat.st_blksize, 4);
2953            assert_eq!(stat.st_blocks, 2);
2954            assert_eq!(stat.st_uid, 9);
2955            assert_eq!(stat.st_gid, 10);
2956            assert_eq!(stat.st_nlink, 11);
2957            assert_eq!(stat.st_ctime, 0);
2958            assert_eq!(stat.st_ctime_nsec, 1);
2959            assert_eq!(stat.st_atime, 0);
2960            assert_eq!(stat.st_atime_nsec, 2);
2961            assert_eq!(stat.st_mtime, 0);
2962            assert_eq!(stat.st_mtime_nsec, 3);
2963            assert_eq!(stat.st_rdev, DeviceType::new(13, 13).bits());
2964        })
2965        .await;
2966    }
2967
2968    #[::fuchsia::test]
2969    fn test_flock_operation() {
2970        assert!(FlockOperation::from_flags(0).is_err());
2971        assert!(FlockOperation::from_flags(u32::MAX).is_err());
2972
2973        let operation1 = FlockOperation::from_flags(LOCK_SH).expect("from_flags");
2974        assert!(!operation1.is_unlock());
2975        assert!(!operation1.is_lock_exclusive());
2976        assert!(operation1.is_blocking());
2977
2978        let operation2 = FlockOperation::from_flags(LOCK_EX | LOCK_NB).expect("from_flags");
2979        assert!(!operation2.is_unlock());
2980        assert!(operation2.is_lock_exclusive());
2981        assert!(!operation2.is_blocking());
2982
2983        let operation3 = FlockOperation::from_flags(LOCK_UN).expect("from_flags");
2984        assert!(operation3.is_unlock());
2985        assert!(!operation3.is_lock_exclusive());
2986        assert!(operation3.is_blocking());
2987    }
2988
2989    #[::fuchsia::test]
2990    async fn test_check_access() {
2991        spawn_kernel_and_run(async |locked, current_task| {
2992            let mut creds = Credentials::with_ids(1, 2);
2993            creds.groups = vec![3, 4];
2994            current_task.set_creds(creds);
2995
2996            // Create a node.
2997            let node = &current_task
2998                .fs()
2999                .root()
3000                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
3001                .expect("create_node")
3002                .entry
3003                .node;
3004            let check_access = |locked: &mut Locked<Unlocked>,
3005                                uid: uid_t,
3006                                gid: gid_t,
3007                                perm: u32,
3008                                access: Access| {
3009                node.update_info(|info| {
3010                    info.mode = mode!(IFREG, perm);
3011                    info.uid = uid;
3012                    info.gid = gid;
3013                });
3014                node.check_access(
3015                    locked,
3016                    &current_task,
3017                    &MountInfo::detached(),
3018                    access,
3019                    CheckAccessReason::InternalPermissionChecks,
3020                    security::Auditable::Location(std::panic::Location::caller()),
3021                )
3022            };
3023
3024            assert_eq!(check_access(locked, 0, 0, 0o700, Access::EXEC), error!(EACCES));
3025            assert_eq!(check_access(locked, 0, 0, 0o700, Access::READ), error!(EACCES));
3026            assert_eq!(check_access(locked, 0, 0, 0o700, Access::WRITE), error!(EACCES));
3027
3028            assert_eq!(check_access(locked, 0, 0, 0o070, Access::EXEC), error!(EACCES));
3029            assert_eq!(check_access(locked, 0, 0, 0o070, Access::READ), error!(EACCES));
3030            assert_eq!(check_access(locked, 0, 0, 0o070, Access::WRITE), error!(EACCES));
3031
3032            assert_eq!(check_access(locked, 0, 0, 0o007, Access::EXEC), Ok(()));
3033            assert_eq!(check_access(locked, 0, 0, 0o007, Access::READ), Ok(()));
3034            assert_eq!(check_access(locked, 0, 0, 0o007, Access::WRITE), Ok(()));
3035
3036            assert_eq!(check_access(locked, 1, 0, 0o700, Access::EXEC), Ok(()));
3037            assert_eq!(check_access(locked, 1, 0, 0o700, Access::READ), Ok(()));
3038            assert_eq!(check_access(locked, 1, 0, 0o700, Access::WRITE), Ok(()));
3039
3040            assert_eq!(check_access(locked, 1, 0, 0o100, Access::EXEC), Ok(()));
3041            assert_eq!(check_access(locked, 1, 0, 0o100, Access::READ), error!(EACCES));
3042            assert_eq!(check_access(locked, 1, 0, 0o100, Access::WRITE), error!(EACCES));
3043
3044            assert_eq!(check_access(locked, 1, 0, 0o200, Access::EXEC), error!(EACCES));
3045            assert_eq!(check_access(locked, 1, 0, 0o200, Access::READ), error!(EACCES));
3046            assert_eq!(check_access(locked, 1, 0, 0o200, Access::WRITE), Ok(()));
3047
3048            assert_eq!(check_access(locked, 1, 0, 0o400, Access::EXEC), error!(EACCES));
3049            assert_eq!(check_access(locked, 1, 0, 0o400, Access::READ), Ok(()));
3050            assert_eq!(check_access(locked, 1, 0, 0o400, Access::WRITE), error!(EACCES));
3051
3052            assert_eq!(check_access(locked, 0, 2, 0o700, Access::EXEC), error!(EACCES));
3053            assert_eq!(check_access(locked, 0, 2, 0o700, Access::READ), error!(EACCES));
3054            assert_eq!(check_access(locked, 0, 2, 0o700, Access::WRITE), error!(EACCES));
3055
3056            assert_eq!(check_access(locked, 0, 2, 0o070, Access::EXEC), Ok(()));
3057            assert_eq!(check_access(locked, 0, 2, 0o070, Access::READ), Ok(()));
3058            assert_eq!(check_access(locked, 0, 2, 0o070, Access::WRITE), Ok(()));
3059
3060            assert_eq!(check_access(locked, 0, 3, 0o070, Access::EXEC), Ok(()));
3061            assert_eq!(check_access(locked, 0, 3, 0o070, Access::READ), Ok(()));
3062            assert_eq!(check_access(locked, 0, 3, 0o070, Access::WRITE), Ok(()));
3063        })
3064        .await;
3065    }
3066
3067    #[::fuchsia::test]
3068    async fn set_security_xattr_fails_without_security_module_or_root() {
3069        spawn_kernel_and_run(async |locked, current_task| {
3070            let mut creds = Credentials::with_ids(1, 2);
3071            creds.groups = vec![3, 4];
3072            current_task.set_creds(creds);
3073
3074            // Create a node.
3075            let node = &current_task
3076                .fs()
3077                .root()
3078                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
3079                .expect("create_node")
3080                .entry
3081                .node;
3082
3083            // Give read-write-execute access.
3084            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3085
3086            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3087            // should fail.
3088            assert_eq!(
3089                node.set_xattr(
3090                    locked,
3091                    &current_task,
3092                    &MountInfo::detached(),
3093                    "security.name".into(),
3094                    "security_label".into(),
3095                    XattrOp::Create,
3096                ),
3097                error!(EPERM)
3098            );
3099        })
3100        .await;
3101    }
3102
3103    #[::fuchsia::test]
3104    async fn set_non_user_xattr_fails_without_security_module_or_root() {
3105        spawn_kernel_and_run(async |locked, current_task| {
3106            let mut creds = Credentials::with_ids(1, 2);
3107            creds.groups = vec![3, 4];
3108            current_task.set_creds(creds);
3109
3110            // Create a node.
3111            let node = &current_task
3112                .fs()
3113                .root()
3114                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
3115                .expect("create_node")
3116                .entry
3117                .node;
3118
3119            // Give read-write-execute access.
3120            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3121
3122            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3123            // should fail.
3124            assert_eq!(
3125                node.set_xattr(
3126                    locked,
3127                    &current_task,
3128                    &MountInfo::detached(),
3129                    "trusted.name".into(),
3130                    "some data".into(),
3131                    XattrOp::Create,
3132                ),
3133                error!(EPERM)
3134            );
3135        })
3136        .await;
3137    }
3138
3139    #[::fuchsia::test]
3140    async fn get_security_xattr_succeeds_without_read_access() {
3141        spawn_kernel_and_run(async |locked, current_task| {
3142            let mut creds = Credentials::with_ids(1, 2);
3143            creds.groups = vec![3, 4];
3144            current_task.set_creds(creds);
3145
3146            // Create a node.
3147            let node = &current_task
3148                .fs()
3149                .root()
3150                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
3151                .expect("create_node")
3152                .entry
3153                .node;
3154
3155            // Only give read access to the root and give root access to the current task.
3156            node.update_info(|info| info.mode = mode!(IFREG, 0o100));
3157            current_task.set_creds(Credentials::with_ids(0, 0));
3158
3159            // Setting the label should succeed even without write access to the file.
3160            assert_eq!(
3161                node.set_xattr(
3162                    locked,
3163                    &current_task,
3164                    &MountInfo::detached(),
3165                    "security.name".into(),
3166                    "security_label".into(),
3167                    XattrOp::Create,
3168                ),
3169                Ok(())
3170            );
3171
3172            // Remove root access from the current task.
3173            current_task.set_creds(Credentials::with_ids(1, 1));
3174
3175            // Getting the label should succeed even without read access to the file.
3176            assert_eq!(
3177                node.get_xattr(
3178                    locked,
3179                    &current_task,
3180                    &MountInfo::detached(),
3181                    "security.name".into(),
3182                    4096
3183                ),
3184                Ok(ValueOrSize::Value("security_label".into()))
3185            );
3186        })
3187        .await;
3188    }
3189}