Skip to main content

starnix_core/vfs/
fs_node.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::device::DeviceMode;
6use crate::mm::PAGE_SIZE;
7use crate::security::{self, Auditable, PermissionFlags};
8use crate::signals::{SignalInfo, send_standard_signal};
9use crate::task::{CurrentTask, CurrentTaskAndLocked, WaitQueue, Waiter, register_delayed_release};
10use crate::time::utc;
11use crate::vfs::fsverity::FsVerityState;
12use crate::vfs::pipe::{Pipe, PipeHandle};
13use crate::vfs::rw_queue::{RwQueue, RwQueueReadGuard, RwQueueWriteGuard};
14use crate::vfs::socket::SocketHandle;
15use crate::vfs::{
16    DefaultDirEntryOps, DirEntryOps, FileObject, FileObjectState, FileOps, FileSystem,
17    FileSystemHandle, FileWriteGuardState, FsStr, FsString, MAX_LFS_FILESIZE, MountInfo,
18    NamespaceNode, OPathOps, RecordLockCommand, RecordLockOwner, RecordLocks, WeakFileHandle,
19    checked_add_offset_and_length, inotify,
20};
21use bitflags::bitflags;
22use fuchsia_runtime::UtcInstant;
23use linux_uapi::{XATTR_SECURITY_PREFIX, XATTR_SYSTEM_PREFIX, XATTR_TRUSTED_PREFIX};
24use once_cell::race::OnceBool;
25use smallvec::SmallVec;
26use starnix_crypt::EncryptionKeyId;
27use starnix_lifecycle::{ObjectReleaser, ReleaserAction};
28use starnix_logging::{log_error, track_stub};
29use starnix_sync::{
30    BeforeFsNodeAppend, FileOpsCore, FsNodeAppend, LockEqualOrBefore, Locked, Mutex, RwLock,
31    RwLockReadGuard, Unlocked,
32};
33use starnix_types::ownership::{Releasable, ReleaseGuard};
34use starnix_types::time::{NANOS_PER_SECOND, timespec_from_time};
35use starnix_uapi::as_any::AsAny;
36use starnix_uapi::auth::{
37    CAP_CHOWN, CAP_DAC_OVERRIDE, CAP_DAC_READ_SEARCH, CAP_FOWNER, CAP_FSETID, CAP_MKNOD,
38    CAP_SYS_ADMIN, CAP_SYS_RESOURCE, FsCred, UserAndOrGroupId,
39};
40use starnix_uapi::device_id::DeviceId;
41use starnix_uapi::errors::{EACCES, ENOTSUP, EPERM, Errno};
42use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
43use starnix_uapi::inotify_mask::InotifyMask;
44use starnix_uapi::mount_flags::MountFlags;
45use starnix_uapi::open_flags::OpenFlags;
46use starnix_uapi::resource_limits::Resource;
47use starnix_uapi::seal_flags::SealFlags;
48use starnix_uapi::signals::SIGXFSZ;
49use starnix_uapi::{
50    FALLOC_FL_COLLAPSE_RANGE, FALLOC_FL_INSERT_RANGE, FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE,
51    FALLOC_FL_UNSHARE_RANGE, FALLOC_FL_ZERO_RANGE, LOCK_EX, LOCK_NB, LOCK_SH, LOCK_UN,
52    STATX__RESERVED, STATX_ATIME, STATX_ATTR_VERITY, STATX_BASIC_STATS, STATX_BLOCKS, STATX_CTIME,
53    STATX_GID, STATX_INO, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_UID, XATTR_USER_PREFIX,
54    errno, error, fsverity_descriptor, gid_t, ino_t, statx, statx_timestamp, timespec, uapi, uid_t,
55};
56use std::sync::atomic::Ordering;
57use std::sync::{Arc, OnceLock, Weak};
58use syncio::zxio_node_attr_has_t;
59
/// Whether a node may be linked into a directory.
///
/// The default is [`FsNodeLinkBehavior::Allowed`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum FsNodeLinkBehavior {
    /// The node may be linked into a directory.
    #[default]
    Allowed,
    /// The node may not be linked into a directory.
    Disallowed,
}
71
/// Read guard on an [`FsNode`]'s `append_lock`, as returned by
/// [`FsNodeOps::append_lock_read`].
pub type AppendLockGuard<'a> = RwQueueReadGuard<'a, FsNodeAppend>;
/// Write guard on an [`FsNode`]'s `append_lock`, as returned by
/// [`FsNodeOps::append_lock_write`].
pub type AppendLockWriteGuard<'a> = RwQueueWriteGuard<'a, FsNodeAppend>;
74
bitflags! {
    /// Flag bits stored in the `flags` field of an [`FsNode`].
    pub struct FsNodeFlags: u8 {
        // NOTE(review): what "private" means for a node is not visible in this part of the
        // file; document it here once confirmed.
        const IS_PRIVATE = 1 << 0;
    }
}
80
/// A node of a file system.
///
/// File-system-specific behavior is delegated to `ops`, while the mutable metadata used to
/// answer stat-like queries lives in `info`.
pub struct FsNode {
    /// The inode number for this FsNode.
    pub ino: ino_t,

    /// Flags for this node.
    pub flags: FsNodeFlags,

    /// The FsNodeOps for this FsNode.
    ///
    /// The FsNodeOps are implemented by the individual file systems to provide
    /// specific behaviors for this FsNode.
    ops: Box<dyn FsNodeOps>,

    /// The FileSystem that owns this FsNode's tree.
    ///
    /// Stored as a `Weak` reference.
    fs: Weak<FileSystem>,

    /// A reader/writer queue (`RwQueue`) to synchronize append operations for this node.
    ///
    /// FileObjects writing with O_APPEND should grab a write() lock on this
    /// field to ensure they operate sequentially. FileObjects writing without
    /// O_APPEND should grab read() lock so that they can operate in parallel.
    pub append_lock: RwQueue<FsNodeAppend>,

    /// Mutable information about this node.
    ///
    /// This data is used to populate the uapi::stat structure.
    info: RwLock<FsNodeInfo>,

    /// Data associated with an FsNode that is rarely needed.
    ///
    /// Held in a `OnceLock`, so it can be initialized at most once.
    rare_data: OnceLock<Box<FsNodeRareData>>,

    /// Tracks lock state for this file.
    pub write_guard_state: Mutex<FileWriteGuardState>,

    /// Cached FsVerity state associated with this node.
    pub fsverity: Mutex<FsVerityState>,

    /// The security state associated with this node. Must always be acquired last
    /// relative to other `FsNode` locks.
    pub security_state: security::FsNodeState,
}
122
/// Per-node state that is rarely needed; stored boxed behind the `rare_data` field of
/// [`FsNode`].
#[derive(Default)]
struct FsNodeRareData {
    /// The pipe located at this node, if any.
    ///
    /// Used if, and only if, the node has a mode of FileMode::IFIFO.
    fifo: OnceLock<PipeHandle>,

    /// The UNIX domain socket bound to this node, if any.
    bound_socket: OnceLock<SocketHandle>,

    /// State of the advisory lock (see flock(2)) on this node.
    ///
    /// No other lock on this object may be taken while this lock is held.
    flock_info: Mutex<FlockInfo>,

    /// Records locks associated with this node.
    record_locks: RecordLocks,

    /// Whether this node can be linked into a directory.
    ///
    /// Only set for nodes created with `O_TMPFILE`.
    link_behavior: OnceLock<FsNodeLinkBehavior>,

    /// Inotify watchers on this node. See inotify(7).
    watchers: inotify::InotifyWatchers,
}
149
150impl FsNodeRareData {
151    fn ensure_fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
152        self.fifo.get_or_init(|| {
153            let mut default_pipe_capacity = (*PAGE_SIZE * 16) as usize;
154            if !security::is_task_capable_noaudit(current_task, CAP_SYS_RESOURCE) {
155                let kernel = current_task.kernel();
156                let max_size = kernel.system_limits.pipe_max_size.load(Ordering::Relaxed);
157                default_pipe_capacity = std::cmp::min(default_pipe_capacity, max_size);
158            }
159            Pipe::new(default_pipe_capacity)
160        })
161    }
162}
163
/// Release action for [`FsNode`].
///
/// This enum has no variants; it only carries the [`ReleaserAction`] implementation below.
pub enum FsNodeReleaserAction {}
impl ReleaserAction<FsNode> for FsNodeReleaserAction {
    /// Hands the node over to `register_delayed_release`.
    fn release(fs_node: ReleaseGuard<FsNode>) {
        register_delayed_release(fs_node);
    }
}
/// [`ObjectReleaser`] for an [`FsNode`] that uses [`FsNodeReleaserAction`].
pub type FsNodeReleaser = ObjectReleaser<FsNode, FsNodeReleaserAction>;
/// Strong (`Arc`) handle to an [`FsNode`].
pub type FsNodeHandle = Arc<FsNodeReleaser>;
/// Weak counterpart of [`FsNodeHandle`].
pub type WeakFsNodeHandle = Weak<FsNodeReleaser>;
173
/// Mutable metadata of an [`FsNode`].
#[derive(Debug, Default, Clone, PartialEq)]
pub struct FsNodeInfo {
    /// File type and permission bits.
    pub mode: FileMode,
    /// Link count; `FsNodeInfo::new` starts directories at 2 and everything else at 1.
    pub link_count: usize,
    /// Owning user id.
    pub uid: uid_t,
    /// Owning group id.
    pub gid: gid_t,
    // NOTE(review): presumably only meaningful for device nodes; confirm.
    pub rdev: DeviceId,
    /// Size of the node's content.
    pub size: usize,
    /// Block size; multiplied by `blocks` in `storage_size`.
    pub blksize: usize,
    /// Number of blocks; multiplied by `blksize` in `storage_size`.
    pub blocks: usize,
    /// Time of the last status change.
    pub time_status_change: UtcInstant,
    /// Time of the last access.
    pub time_access: UtcInstant,
    /// Time of the last modification.
    pub time_modify: UtcInstant,
    // NOTE(review): presumably marks case-insensitive name matching; confirm.
    pub casefold: bool,

    // If this node is fscrypt encrypted, stores the id of the user wrapping key used to encrypt it.
    pub wrapping_key_id: Option<[u8; 16]>,

    // Used to indicate to filesystems that manage timestamps that an access has occurred and to
    // update the node's atime.
    // This only impacts accesses within Starnix. Most Fuchsia programs are not expected to maintain
    // access times. If the file handle is transferred out of Starnix, there may be inconsistencies.
    pub pending_time_access_update: bool,
}
198
199impl FsNodeInfo {
200    pub fn new(mode: FileMode, owner: FsCred) -> Self {
201        let now = utc::utc_now();
202        Self {
203            mode,
204            link_count: if mode.is_dir() { 2 } else { 1 },
205            uid: owner.uid,
206            gid: owner.gid,
207            blksize: DEFAULT_BYTES_PER_BLOCK,
208            time_status_change: now,
209            time_access: now,
210            time_modify: now,
211            ..Default::default()
212        }
213    }
214
215    pub fn storage_size(&self) -> usize {
216        self.blksize.saturating_mul(self.blocks)
217    }
218
219    pub fn chmod(&mut self, mode: FileMode) {
220        self.mode = (self.mode & !FileMode::PERMISSIONS) | (mode & FileMode::PERMISSIONS);
221    }
222
223    pub fn chown(&mut self, owner: Option<uid_t>, group: Option<gid_t>) {
224        if let Some(owner) = owner {
225            self.uid = owner;
226        }
227        if let Some(group) = group {
228            self.gid = group;
229        }
230        // Clear the setuid and setgid bits if the file is executable and a regular file.
231        if self.mode.is_reg() {
232            self.mode &= !FileMode::ISUID;
233            self.clear_sgid_bit();
234        }
235    }
236
237    fn clear_sgid_bit(&mut self) {
238        // If the group execute bit is not set, the setgid bit actually indicates mandatory
239        // locking and should not be cleared.
240        if self.mode.intersects(FileMode::IXGRP) {
241            self.mode &= !FileMode::ISGID;
242        }
243    }
244
245    fn clear_suid_and_sgid_bits(&mut self) {
246        self.mode &= !FileMode::ISUID;
247        self.clear_sgid_bit();
248    }
249
250    pub fn cred(&self) -> FsCred {
251        FsCred { uid: self.uid, gid: self.gid }
252    }
253
254    pub fn suid_and_sgid(
255        &self,
256        current_task: &CurrentTask,
257        fs_node: &FsNode,
258    ) -> Result<UserAndOrGroupId, Errno> {
259        let uid = self.mode.contains(FileMode::ISUID).then_some(self.uid);
260
261        // See <https://man7.org/linux/man-pages/man7/inode.7.html>:
262        //
263        //   For an executable file, the set-group-ID bit causes the
264        //   effective group ID of a process that executes the file to change
265        //   as described in execve(2).  For a file that does not have the
266        //   group execution bit (S_IXGRP) set, the set-group-ID bit indicates
267        //   mandatory file/record locking.
268        let gid = self.mode.contains(FileMode::ISGID | FileMode::IXGRP).then_some(self.gid);
269
270        let maybe_set_id = UserAndOrGroupId { uid, gid };
271        if maybe_set_id.is_some() {
272            // Check that uid and gid actually have execute access before
273            // returning them as the SUID or SGID.
274            check_access(
275                fs_node,
276                current_task,
277                security::PermissionFlags::EXEC,
278                self.uid,
279                self.gid,
280                self.mode,
281            )?;
282        }
283        Ok(maybe_set_id)
284    }
285}
286
/// State of the advisory lock (see flock(2)) held on a node.
#[derive(Default)]
struct FlockInfo {
    /// Whether the node is currently locked. The meanings of the different values are:
    /// - `None`: The node is not locked.
    /// - `Some(false)`: The node is locked non-exclusively.
    /// - `Some(true)`: The node is locked exclusively.
    locked_exclusive: Option<bool>,
    /// Weak handles to the `FileObject`s that hold the lock.
    locking_handles: Vec<WeakFileHandle>,
    /// The queue used to notify the tasks waiting on the lock.
    wait_queue: WaitQueue,
}
299
300impl FlockInfo {
301    /// Removes all file handle not holding `predicate` from the list of object holding the lock. If
302    /// this empties the list, unlocks the node and notifies all waiting processes.
303    pub fn retain<F>(&mut self, predicate: F)
304    where
305        F: Fn(&FileObject) -> bool,
306    {
307        if !self.locking_handles.is_empty() {
308            self.locking_handles
309                .retain(|w| if let Some(fh) = w.upgrade() { predicate(&fh) } else { false });
310            if self.locking_handles.is_empty() {
311                self.locked_exclusive = None;
312                self.wait_queue.notify_all();
313            }
314        }
315    }
316}
317
/// `st_blksize` is measured in units of 512 bytes.
///
/// Used by `FsNodeInfo::new` as the initial `blksize`.
// NOTE(review): stat(2) describes `st_blocks`, rather than `st_blksize`, as being counted in
// 512-byte units; confirm which field the first sentence is meant to describe.
pub const DEFAULT_BYTES_PER_BLOCK: usize = 512;
320
321pub struct FlockOperation {
322    operation: u32,
323}
324
325impl FlockOperation {
326    pub fn from_flags(operation: u32) -> Result<Self, Errno> {
327        if operation & !(LOCK_SH | LOCK_EX | LOCK_UN | LOCK_NB) != 0 {
328            return error!(EINVAL);
329        }
330        if [LOCK_SH, LOCK_EX, LOCK_UN].iter().filter(|&&o| operation & o == o).count() != 1 {
331            return error!(EINVAL);
332        }
333        Ok(Self { operation })
334    }
335
336    pub fn is_unlock(&self) -> bool {
337        self.operation & LOCK_UN > 0
338    }
339
340    pub fn is_lock_exclusive(&self) -> bool {
341        self.operation & LOCK_EX > 0
342    }
343
344    pub fn is_blocking(&self) -> bool {
345        self.operation & LOCK_NB == 0
346    }
347}
348
349impl FileObject {
350    /// Advisory locking.
351    ///
352    /// See flock(2).
353    pub fn flock(
354        &self,
355        locked: &mut Locked<Unlocked>,
356        current_task: &CurrentTask,
357        operation: FlockOperation,
358    ) -> Result<(), Errno> {
359        if self.flags().contains(OpenFlags::PATH) {
360            return error!(EBADF);
361        }
362        loop {
363            let mut flock_info = self.name.entry.node.ensure_rare_data().flock_info.lock();
364            if operation.is_unlock() {
365                flock_info.retain(|fh| !std::ptr::eq(fh, self));
366                return Ok(());
367            }
368            // Operation is a locking operation.
369            // 1. File is not locked
370            if flock_info.locked_exclusive.is_none() {
371                flock_info.locked_exclusive = Some(operation.is_lock_exclusive());
372                flock_info.locking_handles.push(self.weak_handle.clone());
373                return Ok(());
374            }
375
376            let file_lock_is_exclusive = flock_info.locked_exclusive == Some(true);
377            let fd_has_lock = flock_info
378                .locking_handles
379                .iter()
380                .find_map(|w| {
381                    w.upgrade().and_then(|fh| {
382                        if std::ptr::eq(&fh as &FileObject, self) { Some(()) } else { None }
383                    })
384                })
385                .is_some();
386
387            // 2. File is locked, but fd already have a lock
388            if fd_has_lock {
389                if operation.is_lock_exclusive() == file_lock_is_exclusive {
390                    // Correct lock is already held, return.
391                    return Ok(());
392                } else {
393                    // Incorrect lock is held. Release the lock and loop back to try to reacquire
394                    // it. flock doesn't guarantee atomic lock type switching.
395                    flock_info.retain(|fh| !std::ptr::eq(fh, self));
396                    continue;
397                }
398            }
399
400            // 3. File is locked, and fd doesn't have a lock.
401            if !file_lock_is_exclusive && !operation.is_lock_exclusive() {
402                // The lock is not exclusive, let's grab it.
403                flock_info.locking_handles.push(self.weak_handle.clone());
404                return Ok(());
405            }
406
407            // 4. The operation cannot be done at this time.
408            if !operation.is_blocking() {
409                return error!(EAGAIN);
410            }
411
412            // Register a waiter to be notified when the lock is released. Release the lock on
413            // FlockInfo, and wait.
414            let waiter = Waiter::new();
415            flock_info.wait_queue.wait_async(&waiter);
416            std::mem::drop(flock_info);
417            waiter.wait(locked, current_task)?;
418        }
419    }
420}
421
// The inner mod is required because bitflags cannot pass the attribute through to the single
// variant, and attributes cannot be applied to macro invocations.
mod inner_flags {
    // Part of the code for the AT_STATX_SYNC_AS_STAT case that's produced by the macro triggers the
    // lint, but as a whole, the produced code is still correct.
    #![allow(clippy::bad_bit_mask)] // TODO(b/303500202) Remove once addressed in bitflags.
    use super::{bitflags, uapi};

    bitflags! {
        /// Typed wrapper around the `uapi` flag constants listed below.
        #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub struct StatxFlags: u32 {
            const AT_SYMLINK_NOFOLLOW = uapi::AT_SYMLINK_NOFOLLOW;
            const AT_EMPTY_PATH = uapi::AT_EMPTY_PATH;
            const AT_NO_AUTOMOUNT = uapi::AT_NO_AUTOMOUNT;
            const AT_STATX_SYNC_AS_STAT = uapi::AT_STATX_SYNC_AS_STAT;
            const AT_STATX_FORCE_SYNC = uapi::AT_STATX_FORCE_SYNC;
            const AT_STATX_DONT_SYNC = uapi::AT_STATX_DONT_SYNC;
            // NOTE(review): unlike its siblings this is named like a statx attribute bit
            // rather than an `AT_*` flag; confirm that it belongs in this set.
            const STATX_ATTR_VERITY = uapi::STATX_ATTR_VERITY;
        }
    }
}

// Re-exported so that users do not have to name the helper module.
pub use inner_flags::StatxFlags;
445
/// The kind of child an unlink operation is meant to remove.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum UnlinkKind {
    /// Unlink a directory.
    Directory,

    /// Unlink a non-directory.
    NonDirectory,
}
454
/// The target of a symlink, as returned by [`FsNodeOps::readlink`].
pub enum SymlinkTarget {
    /// The target expressed as a path string.
    Path(FsString),
    /// The target expressed directly as a `NamespaceNode`.
    Node(NamespaceNode),
}
459
460#[derive(Clone, Copy, PartialEq, Eq)]
461pub enum XattrOp {
462    /// Set the value of the extended attribute regardless of whether it exists.
463    Set,
464    /// Create a new extended attribute. Fail if it already exists.
465    Create,
466    /// Replace the value of the extended attribute. Fail if it doesn't exist.
467    Replace,
468}
469
470impl XattrOp {
471    pub fn into_flags(self) -> u32 {
472        match self {
473            Self::Set => 0,
474            Self::Create => uapi::XATTR_CREATE,
475            Self::Replace => uapi::XATTR_REPLACE,
476        }
477    }
478}
479
/// Either a value, or the size required to contain it.
#[derive(Clone, Debug, PartialEq)]
pub enum ValueOrSize<T> {
    /// The value itself.
    Value(T),
    /// The size needed to hold the value.
    Size(usize),
}

impl<T> ValueOrSize<T> {
    /// Applies `f` to a contained value; a `Size` is passed through unchanged and `f` is not
    /// called.
    pub fn map<F, U>(self, f: F) -> ValueOrSize<U>
    where
        F: FnOnce(T) -> U,
    {
        let value = match self {
            Self::Value(value) => value,
            Self::Size(size) => return ValueOrSize::Size(size),
        };
        ValueOrSize::Value(f(value))
    }

    /// Returns the contained value.
    ///
    /// # Panics
    ///
    /// Panics if `self` is a `Size`.
    #[cfg(test)]
    pub fn unwrap(self) -> T {
        let Self::Value(value) = self else {
            panic!("Unwrap ValueOrSize that is a Size");
        };
        value
    }
}

impl<T> From<T> for ValueOrSize<T> {
    /// Wraps `t` as a `Value`.
    fn from(t: T) -> Self {
        ValueOrSize::Value(t)
    }
}
512
513#[derive(Copy, Clone, Eq, PartialEq, Debug)]
514pub enum FallocMode {
515    Allocate { keep_size: bool },
516    PunchHole,
517    Collapse,
518    Zero { keep_size: bool },
519    InsertRange,
520    UnshareRange,
521}
522
523impl FallocMode {
524    pub fn from_bits(mode: u32) -> Option<Self> {
525        // `fallocate()` allows only the following values for `mode`.
526        if mode == 0 {
527            Some(Self::Allocate { keep_size: false })
528        } else if mode == FALLOC_FL_KEEP_SIZE {
529            Some(Self::Allocate { keep_size: true })
530        } else if mode == FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE {
531            Some(Self::PunchHole)
532        } else if mode == FALLOC_FL_COLLAPSE_RANGE {
533            Some(Self::Collapse)
534        } else if mode == FALLOC_FL_ZERO_RANGE {
535            Some(Self::Zero { keep_size: false })
536        } else if mode == FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE {
537            Some(Self::Zero { keep_size: true })
538        } else if mode == FALLOC_FL_INSERT_RANGE {
539            Some(Self::InsertRange)
540        } else if mode == FALLOC_FL_UNSHARE_RANGE {
541            Some(Self::UnshareRange)
542        } else {
543            None
544        }
545    }
546}
547
/// The `reason` argument of [`FsNodeOps::check_access`], describing why the access check is
/// being made.
// NOTE(review): the variants look like they name the triggering operation (access, chdir,
// chroot, exec, timestamp changes) — confirm against the call sites before documenting them
// individually.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum CheckAccessReason {
    Access,
    Chdir,
    Chroot,
    Exec,
    ChangeTimestamps { now: bool },
    InternalPermissionChecks,
}
557
/// Vector type returned by [`FsNodeOps::lookup_pipelined`]; stores up to 8 elements inline.
pub type LookupVec<T> = SmallVec<[T; 8]>;
559
560pub trait FsNodeOps: Send + Sync + AsAny + 'static {
561    /// Delegate the access check to the node.
562    fn check_access(
563        &self,
564        _locked: &mut Locked<FileOpsCore>,
565        node: &FsNode,
566        current_task: &CurrentTask,
567        access: security::PermissionFlags,
568        info: &RwLock<FsNodeInfo>,
569        reason: CheckAccessReason,
570        audit_context: security::Auditable<'_>,
571    ) -> Result<(), Errno> {
572        node.default_check_access_impl(current_task, access, reason, info.read(), audit_context)
573    }
574
575    /// Build the [`DirEntryOps`] for a new [`DirEntry`] that will be associated
576    /// to this node.
577    fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
578        Box::new(DefaultDirEntryOps)
579    }
580
581    /// Build the `FileOps` for the file associated to this node.
582    ///
583    /// The returned FileOps will be used to create a FileObject, which might
584    /// be assigned an FdNumber.
585    fn create_file_ops(
586        &self,
587        locked: &mut Locked<FileOpsCore>,
588        node: &FsNode,
589        _current_task: &CurrentTask,
590        flags: OpenFlags,
591    ) -> Result<Box<dyn FileOps>, Errno>;
592
593    /// Find an existing child node and populate the child parameter. Return the node.
594    ///
595    /// The child parameter is an empty node. Operations other than initialize may panic before
596    /// initialize is called.
597    fn lookup(
598        &self,
599        _locked: &mut Locked<FileOpsCore>,
600        _node: &FsNode,
601        _current_task: &CurrentTask,
602        name: &FsStr,
603    ) -> Result<FsNodeHandle, Errno> {
604        // The default implementation here is suitable for filesystems that have permanent entries;
605        // entries that already exist will get found in the cache and shouldn't get this far.
606        error!(ENOENT, format!("looking for {name}"))
607    }
608
609    /// Returns whether this node supports pipelined lookups.
610    fn has_lookup_pipelined(&self) -> bool {
611        false
612    }
613
614    /// Find multiple children nodes in sequence.
615    ///
616    /// This can be used to pipeline lookups in filesystems that support it.
617    fn lookup_pipelined(
618        &self,
619        _locked: &mut Locked<FileOpsCore>,
620        _node: &FsNode,
621        _current_task: &CurrentTask,
622        _names: &[&FsStr],
623    ) -> LookupVec<Result<FsNodeHandle, Errno>> {
624        panic!("has_lookup_pipelined should be false");
625    }
626
627    /// Create and return the given child node.
628    ///
629    /// The mode field of the FsNodeInfo indicates what kind of child to
630    /// create.
631    ///
632    /// This function is never called with FileMode::IFDIR. The mkdir function
633    /// is used to create directories instead.
634    fn mknod(
635        &self,
636        locked: &mut Locked<FileOpsCore>,
637        _node: &FsNode,
638        _current_task: &CurrentTask,
639        _name: &FsStr,
640        _mode: FileMode,
641        _dev: DeviceId,
642        _owner: FsCred,
643    ) -> Result<FsNodeHandle, Errno>;
644
645    /// Create and return the given child node as a subdirectory.
646    fn mkdir(
647        &self,
648        locked: &mut Locked<FileOpsCore>,
649        _node: &FsNode,
650        _current_task: &CurrentTask,
651        _name: &FsStr,
652        _mode: FileMode,
653        _owner: FsCred,
654    ) -> Result<FsNodeHandle, Errno>;
655
656    /// Creates a symlink with the given `target` path.
657    fn create_symlink(
658        &self,
659        locked: &mut Locked<FileOpsCore>,
660        _node: &FsNode,
661        _current_task: &CurrentTask,
662        _name: &FsStr,
663        _target: &FsStr,
664        _owner: FsCred,
665    ) -> Result<FsNodeHandle, Errno>;
666
667    /// Creates an anonymous file.
668    ///
669    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
670    ///
671    /// Used by O_TMPFILE.
672    fn create_tmpfile(
673        &self,
674        _node: &FsNode,
675        _current_task: &CurrentTask,
676        _mode: FileMode,
677        _owner: FsCred,
678    ) -> Result<FsNodeHandle, Errno> {
679        error!(EOPNOTSUPP)
680    }
681
682    /// Reads the symlink from this node.
683    fn readlink(
684        &self,
685        _locked: &mut Locked<FileOpsCore>,
686        _node: &FsNode,
687        _current_task: &CurrentTask,
688    ) -> Result<SymlinkTarget, Errno> {
689        error!(EINVAL)
690    }
691
692    /// Create a hard link with the given name to the given child.
693    fn link(
694        &self,
695        _locked: &mut Locked<FileOpsCore>,
696        _node: &FsNode,
697        _current_task: &CurrentTask,
698        _name: &FsStr,
699        _child: &FsNodeHandle,
700    ) -> Result<(), Errno> {
701        error!(EPERM)
702    }
703
704    /// Remove the child with the given name, if the child exists.
705    ///
706    /// The UnlinkKind parameter indicates whether the caller intends to unlink
707    /// a directory or a non-directory child.
708    fn unlink(
709        &self,
710        locked: &mut Locked<FileOpsCore>,
711        _node: &FsNode,
712        _current_task: &CurrentTask,
713        _name: &FsStr,
714        _child: &FsNodeHandle,
715    ) -> Result<(), Errno>;
716
717    /// Acquire the necessary append lock for the operations that depend on them.
718    /// Should be done before calling `allocate` or `truncate` to avoid lock ordering issues.
719    fn append_lock_read<'a>(
720        &'a self,
721        locked: &'a mut Locked<BeforeFsNodeAppend>,
722        node: &'a FsNode,
723        current_task: &CurrentTask,
724    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FsNodeAppend>), Errno> {
725        return node.append_lock.read_and(locked, current_task);
726    }
727
728    /// Acquire the necessary append lock for operations that need exclusive access (e.g., write append).
729    fn append_lock_write<'a>(
730        &'a self,
731        locked: &'a mut Locked<BeforeFsNodeAppend>,
732        node: &'a FsNode,
733        current_task: &CurrentTask,
734    ) -> Result<(AppendLockWriteGuard<'a>, &'a mut Locked<FsNodeAppend>), Errno> {
735        return node.append_lock.write_and(locked, current_task);
736    }
737
738    /// Change the length of the file.
739    fn truncate(
740        &self,
741        _locked: &mut Locked<FileOpsCore>,
742        _guard: &AppendLockWriteGuard<'_>,
743        _node: &FsNode,
744        _current_task: &CurrentTask,
745        _length: u64,
746    ) -> Result<(), Errno> {
747        error!(EINVAL)
748    }
749
750    /// Manipulate allocated disk space for the file.
751    fn allocate(
752        &self,
753        _locked: &mut Locked<FileOpsCore>,
754        _guard: &AppendLockWriteGuard<'_>,
755        _node: &FsNode,
756        _current_task: &CurrentTask,
757        _mode: FallocMode,
758        _offset: u64,
759        _length: u64,
760    ) -> Result<(), Errno> {
761        error!(EINVAL)
762    }
763
764    /// Update the supplied info with initial state (e.g. size) for the node.
765    ///
766    /// FsNode calls this method when created, to allow the FsNodeOps to
767    /// set appropriate initial values in the FsNodeInfo.
768    fn initial_info(&self, _info: &mut FsNodeInfo) {}
769
770    /// Update node.info as needed.
771    ///
772    /// FsNode calls this method before converting the FsNodeInfo struct into
773    /// the uapi::stat struct to give the file system a chance to update this data
774    /// before it is used by clients.
775    ///
776    /// File systems that keep the FsNodeInfo up-to-date do not need to
777    /// override this function.
778    ///
779    /// Return a read guard for the updated information.
780    fn fetch_and_refresh_info<'a>(
781        &self,
782        _locked: &mut Locked<FileOpsCore>,
783        _node: &FsNode,
784        _current_task: &CurrentTask,
785        info: &'a RwLock<FsNodeInfo>,
786    ) -> Result<RwLockReadGuard<'a, FsNodeInfo>, Errno> {
787        Ok(info.read())
788    }
789
790    /// Syncs cached data to persistent storage.
791    fn sync(&self, _node: &FsNode, _current_task: &CurrentTask) -> Result<(), Errno> {
792        Ok(())
793    }
794
795    /// Update node attributes persistently.
796    fn update_attributes(
797        &self,
798        _locked: &mut Locked<FileOpsCore>,
799        _node: &FsNode,
800        _current_task: &CurrentTask,
801        _info: &FsNodeInfo,
802        _has: zxio_node_attr_has_t,
803    ) -> Result<(), Errno> {
804        Ok(())
805    }
806
807    /// Get an extended attribute on the node.
808    ///
809    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
810    /// instead return the size of the attribute, and can return an ERANGE error if max_size is not
811    /// 0, and lesser than the required size.
812    fn get_xattr(
813        &self,
814        _locked: &mut Locked<FileOpsCore>,
815        _node: &FsNode,
816        _current_task: &CurrentTask,
817        _name: &FsStr,
818        _max_size: usize,
819    ) -> Result<ValueOrSize<FsString>, Errno> {
820        error!(ENOTSUP)
821    }
822
823    /// Set an extended attribute on the node.
824    fn set_xattr(
825        &self,
826        _locked: &mut Locked<FileOpsCore>,
827        _node: &FsNode,
828        _current_task: &CurrentTask,
829        _name: &FsStr,
830        _value: &FsStr,
831        _op: XattrOp,
832    ) -> Result<(), Errno> {
833        error!(ENOTSUP)
834    }
835
836    fn remove_xattr(
837        &self,
838        _locked: &mut Locked<FileOpsCore>,
839        _node: &FsNode,
840        _current_task: &CurrentTask,
841        _name: &FsStr,
842    ) -> Result<(), Errno> {
843        error!(ENOTSUP)
844    }
845
846    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
847    /// instead return the size of the 0 separated string needed to represent the value, and can
848    /// return an ERANGE error if max_size is not 0, and lesser than the required size.
849    fn list_xattrs(
850        &self,
851        _locked: &mut Locked<FileOpsCore>,
852        _node: &FsNode,
853        _current_task: &CurrentTask,
854        _max_size: usize,
855    ) -> Result<ValueOrSize<Vec<FsString>>, Errno> {
856        error!(ENOTSUP)
857    }
858
859    /// Called when the FsNode is freed by the Kernel.
860    fn forget(
861        self: Box<Self>,
862        _locked: &mut Locked<FileOpsCore>,
863        _current_task: &CurrentTask,
864        _info: FsNodeInfo,
865    ) -> Result<(), Errno> {
866        Ok(())
867    }
868
869    ////////////////////
870    // FS-Verity operations
871
872    /// Marks that FS-Verity is being built. Writes fsverity descriptor and merkle tree, the latter
873    /// computed by the filesystem.
874    /// This should ensure there are no writable file handles. Returns EEXIST if the file was
875    /// already fsverity-enabled. Returns EBUSY if this ioctl was already running on this file.
876    fn enable_fsverity(
877        &self,
878        _locked: &mut Locked<FileOpsCore>,
879        _node: &FsNode,
880        _current_task: &CurrentTask,
881        _descriptor: &fsverity_descriptor,
882    ) -> Result<(), Errno> {
883        error!(ENOTSUP)
884    }
885
886    /// Read fsverity descriptor, if the node is fsverity-enabled. Else returns ENODATA.
887    fn get_fsverity_descriptor(&self, _log_blocksize: u8) -> Result<fsverity_descriptor, Errno> {
888        error!(ENOTSUP)
889    }
890
891    /// Returns a descriptive name for this node, suitable to report to userspace in situations
892    /// where the node's path is unavailable (e.g. because it is anonymous, and has no path).
893    /// If no name is returned then a default name of the form "<class:[<node_id>]" will be used.
894    fn internal_name(&self, _node: &FsNode) -> Option<FsString> {
895        None
896    }
897
898    /// The key used to identify this node in the file system's node cache.
899    ///
900    /// For many file systems, this will be the same as the inode number. However, some file
901    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
902    fn node_key(&self, node: &FsNode) -> ino_t {
903        node.ino
904    }
905
906    /// Returns the size of the file.
907    fn get_size(
908        &self,
909        locked: &mut Locked<FileOpsCore>,
910        node: &FsNode,
911        current_task: &CurrentTask,
912    ) -> Result<usize, Errno> {
913        let info = node.fetch_and_refresh_info(locked, current_task)?;
914        Ok(info.size.try_into().map_err(|_| errno!(EINVAL))?)
915    }
916}
917
918impl<T> From<T> for Box<dyn FsNodeOps>
919where
920    T: FsNodeOps,
921{
922    fn from(ops: T) -> Box<dyn FsNodeOps> {
923        Box::new(ops)
924    }
925}
926
/// Implements [`FsNodeOps`] methods in a way that makes sense for symlinks.
/// You must implement [`FsNodeOps::readlink`].
///
/// Expands to the directory-only stubs from [`fs_node_impl_not_dir`] plus a `create_file_ops`
/// that panics, because symlink nodes cannot be opened.
///
/// All types are named through fully qualified paths so the macro expands correctly regardless
/// of what the invoking module has imported.
#[macro_export]
macro_rules! fs_node_impl_symlink {
    () => {
        $crate::vfs::fs_node_impl_not_dir!();

        fn create_file_ops(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _flags: starnix_uapi::open_flags::OpenFlags,
        ) -> Result<Box<dyn $crate::vfs::FileOps>, starnix_uapi::errors::Errno> {
            assert!(node.is_lnk());
            unreachable!("Symlink nodes cannot be opened.");
        }
    };
}
946
/// Implements the mutating directory methods of [`FsNodeOps`] for a read-only directory.
///
/// `check_access` rejects any request that includes write access with `EROFS` and otherwise
/// defers to `FsNode::default_check_access_impl`. `mkdir`, `mknod`, `create_symlink`, `link` and
/// `unlink` always fail with `EROFS`.
///
/// All types are named through fully qualified paths so the macro expands correctly regardless
/// of what the invoking module has imported.
#[macro_export]
macro_rules! fs_node_impl_dir_readonly {
    () => {
        fn check_access(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            current_task: &$crate::task::CurrentTask,
            permission_flags: $crate::security::PermissionFlags,
            info: &starnix_sync::RwLock<$crate::vfs::FsNodeInfo>,
            reason: $crate::vfs::CheckAccessReason,
            audit_context: $crate::security::Auditable<'_>,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            // Writes are refused up front; everything else goes through the regular checks.
            let access = permission_flags.as_access();
            if access.contains(starnix_uapi::file_mode::Access::WRITE) {
                return starnix_uapi::error!(
                    EROFS,
                    format!("check_access failed: read-only directory")
                );
            }
            node.default_check_access_impl(
                current_task,
                permission_flags,
                reason,
                info.read(),
                audit_context,
            )
        }

        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mkdir failed: {:?}", name))
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_id::DeviceId,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mknod failed: {:?}", name))
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("symlink failed: {:?}", name))
        }

        fn link(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("link failed: {:?}", name))
        }

        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("unlink failed: {:?}", name))
        }
    };
}
1036
1037/// Trait that objects can implement if they need to handle extended attribute storage. Allows
1038/// delegating extended attribute operations in [`FsNodeOps`] to another object.
1039///
1040/// See [`fs_node_impl_xattr_delegate`] for usage details.
1041pub trait XattrStorage {
1042    /// Delegate for [`FsNodeOps::get_xattr`].
1043    fn get_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<FsString, Errno>;
1044
1045    /// Delegate for [`FsNodeOps::set_xattr`].
1046    fn set_xattr(
1047        &self,
1048        locked: &mut Locked<FileOpsCore>,
1049        name: &FsStr,
1050        value: &FsStr,
1051        op: XattrOp,
1052    ) -> Result<(), Errno>;
1053
1054    /// Delegate for [`FsNodeOps::remove_xattr`].
1055    fn remove_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<(), Errno>;
1056
1057    /// Delegate for [`FsNodeOps::list_xattrs`].
1058    fn list_xattrs(&self, locked: &mut Locked<FileOpsCore>) -> Result<Vec<FsString>, Errno>;
1059}
1060
/// Implements extended attribute ops for [`FsNodeOps`] by delegating to another object which
/// implements the [`XattrStorage`] trait or a similar interface. For example:
///
/// ```
/// struct Xattrs {}
///
/// impl XattrStorage for Xattrs {
///     // implement XattrStorage
/// }
///
/// struct Node {
///     xattrs: Xattrs
/// }
///
/// impl FsNodeOps for Node {
///     // Delegate extended attribute ops in FsNodeOps to self.xattrs
///     fs_node_impl_xattr_delegate!(self, self.xattrs);
///
///     // add other FsNodeOps impls here
/// }
/// ```
///
/// The generated `get_xattr` and `list_xattrs` ignore the size hint and always return the full
/// value produced by the delegate.
///
/// All types are named through fully qualified paths so the macro expands correctly regardless
/// of what the invoking module has imported.
#[macro_export]
macro_rules! fs_node_impl_xattr_delegate {
    ($self:ident, $delegate:expr) => {
        fn get_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _size: usize,
        ) -> Result<$crate::vfs::ValueOrSize<$crate::vfs::FsString>, starnix_uapi::errors::Errno> {
            Ok($delegate.get_xattr(locked, name)?.into())
        }

        fn set_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            value: &$crate::vfs::FsStr,
            op: $crate::vfs::XattrOp,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.set_xattr(locked, name, value, op)
        }

        fn remove_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.remove_xattr(locked, name)
        }

        fn list_xattrs(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _size: usize,
        ) -> Result<$crate::vfs::ValueOrSize<Vec<$crate::vfs::FsString>>, starnix_uapi::errors::Errno> {
            Ok($delegate.list_xattrs(locked)?.into())
        }
    };
}
1129
/// Provides `ENOTDIR` stubs for the [`FsNodeOps`] methods that are only meaningful on
/// directories: `lookup`, `mkdir`, `mknod`, `create_symlink` and `unlink`.
#[macro_export]
macro_rules! fs_node_impl_not_dir {
    () => {
        // Name resolution below a non-directory is impossible.
        fn lookup(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        // Children of any kind cannot be created below a non-directory.
        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_id::DeviceId,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        // There are no directory entries to remove.
        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }
    };
}
1193
1194#[derive(Copy, Clone, Debug, PartialEq, Eq)]
1195pub enum TimeUpdateType {
1196    Now,
1197    Omit,
1198    Time(UtcInstant),
1199}
1200
1201// Public re-export of macros allows them to be used like regular rust items.
1202pub use fs_node_impl_dir_readonly;
1203pub use fs_node_impl_not_dir;
1204pub use fs_node_impl_symlink;
1205pub use fs_node_impl_xattr_delegate;
1206
1207pub struct SpecialNode;
1208
1209impl FsNodeOps for SpecialNode {
1210    fs_node_impl_not_dir!();
1211
1212    fn create_file_ops(
1213        &self,
1214        _locked: &mut Locked<FileOpsCore>,
1215        _node: &FsNode,
1216        _current_task: &CurrentTask,
1217        _flags: OpenFlags,
1218    ) -> Result<Box<dyn FileOps>, Errno> {
1219        unreachable!("Special nodes cannot be opened.");
1220    }
1221}
1222
1223impl FsNode {
1224    /// Returns true if the `fs_node` is private to the `Kernel`/`FileSystem`, in which
1225    /// case both MAC and DAC checks should be skipped.
1226    pub fn is_private(&self) -> bool {
1227        self.flags.contains(FsNodeFlags::IS_PRIVATE)
1228    }
1229
1230    /// Create a node without inserting it into the FileSystem node cache.
1231    ///
1232    /// This is usually not what you want!
1233    /// Only use if you're also using get_or_create_node, like ext4.
1234    pub fn new_uncached(
1235        ino: ino_t,
1236        ops: impl Into<Box<dyn FsNodeOps>>,
1237        fs: &FileSystemHandle,
1238        info: FsNodeInfo,
1239        flags: FsNodeFlags,
1240    ) -> FsNodeHandle {
1241        let ops = ops.into();
1242        FsNodeHandle::new(Self::new_internal(ino, ops, Arc::downgrade(fs), info, flags).into())
1243    }
1244
1245    fn new_internal(
1246        ino: ino_t,
1247        ops: Box<dyn FsNodeOps>,
1248        fs: Weak<FileSystem>,
1249        info: FsNodeInfo,
1250        flags: FsNodeFlags,
1251    ) -> Self {
1252        // Allow the FsNodeOps to populate initial info.
1253        let info = {
1254            let mut info = info;
1255            ops.initial_info(&mut info);
1256            info
1257        };
1258
1259        // The linter will fail in non test mode as it will not see the lock check.
1260        #[allow(clippy::let_and_return)]
1261        {
1262            let result = Self {
1263                ino,
1264                flags,
1265                ops,
1266                fs,
1267                info: RwLock::new(info),
1268                append_lock: Default::default(),
1269                rare_data: Default::default(),
1270                write_guard_state: Default::default(),
1271                fsverity: Mutex::new(FsVerityState::None),
1272                security_state: Default::default(),
1273            };
1274            #[cfg(any(test, debug_assertions))]
1275            {
1276                #[allow(
1277                    clippy::undocumented_unsafe_blocks,
1278                    reason = "Force documented unsafe blocks in Starnix"
1279                )]
1280                let locked = unsafe { Unlocked::new() };
1281                let _l1 = result.append_lock.read_for_lock_ordering(locked);
1282                let _l2 = result.info.read();
1283                let _l3 = result.write_guard_state.lock();
1284                let _l4 = result.fsverity.lock();
1285            }
1286            result
1287        }
1288    }
1289
1290    pub fn fs(&self) -> FileSystemHandle {
1291        self.fs.upgrade().expect("FileSystem did not live long enough")
1292    }
1293
1294    pub fn ops(&self) -> &dyn FsNodeOps {
1295        self.ops.as_ref()
1296    }
1297
1298    /// Returns an error if this node is encrypted and locked. Does not require
1299    /// fetch_and_refresh_info because FS_IOC_SET_ENCRYPTION_POLICY updates info and once a node is
1300    /// encrypted, it remains encrypted forever.
1301    pub fn fail_if_locked(&self, _current_task: &CurrentTask) -> Result<(), Errno> {
1302        let node_info = self.info();
1303        if let Some(wrapping_key_id) = node_info.wrapping_key_id {
1304            let crypt_service = self.fs().crypt_service().ok_or_else(|| errno!(ENOKEY))?;
1305            if !crypt_service.contains_key(EncryptionKeyId::from(wrapping_key_id)) {
1306                return error!(ENOKEY);
1307            }
1308        }
1309        Ok(())
1310    }
1311
1312    /// Returns the `FsNode`'s `FsNodeOps` as a `&T`, or `None` if the downcast fails.
1313    pub fn downcast_ops<T>(&self) -> Option<&T>
1314    where
1315        T: 'static,
1316    {
1317        self.ops().as_any().downcast_ref::<T>()
1318    }
1319
1320    pub fn on_file_closed(&self, file: &FileObjectState) {
1321        if let Some(rare_data) = self.rare_data.get() {
1322            let mut flock_info = rare_data.flock_info.lock();
1323            // This function will drop the flock from `file` because the `WeakFileHandle` for
1324            // `file` will no longer upgrade to an `FileHandle`.
1325            flock_info.retain(|_| true);
1326        }
1327        self.record_lock_release(RecordLockOwner::FileObject(file.id));
1328    }
1329
1330    pub fn record_lock(
1331        &self,
1332        locked: &mut Locked<Unlocked>,
1333        current_task: &CurrentTask,
1334        file: &FileObject,
1335        cmd: RecordLockCommand,
1336        flock: uapi::flock,
1337    ) -> Result<Option<uapi::flock>, Errno> {
1338        self.ensure_rare_data().record_locks.lock(locked, current_task, file, cmd, flock)
1339    }
1340
1341    /// Release all record locks acquired by the given owner.
1342    pub fn record_lock_release(&self, owner: RecordLockOwner) {
1343        if let Some(rare_data) = self.rare_data.get() {
1344            rare_data.record_locks.release_locks(owner);
1345        }
1346    }
1347
1348    pub fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
1349        self.ops().create_dir_entry_ops()
1350    }
1351
1352    pub fn create_file_ops<L>(
1353        &self,
1354        locked: &mut Locked<L>,
1355        current_task: &CurrentTask,
1356        flags: OpenFlags,
1357    ) -> Result<Box<dyn FileOps>, Errno>
1358    where
1359        L: LockEqualOrBefore<FileOpsCore>,
1360    {
1361        let locked = locked.cast_locked::<FileOpsCore>();
1362        self.ops().create_file_ops(locked, self, current_task, flags)
1363    }
1364
1365    pub fn open(
1366        &self,
1367        locked: &mut Locked<Unlocked>,
1368        current_task: &CurrentTask,
1369        namespace_node: &NamespaceNode,
1370        flags: OpenFlags,
1371        access_check: AccessCheck,
1372    ) -> Result<Box<dyn FileOps>, Errno> {
1373        // If O_PATH is set, there is no need to create a real FileOps because
1374        // most file operations are disabled.
1375        if flags.contains(OpenFlags::PATH) {
1376            return Ok(Box::new(OPathOps::new()));
1377        }
1378
1379        let access = access_check.resolve(flags);
1380        if access.is_nontrivial() {
1381            if flags.contains(OpenFlags::NOATIME) {
1382                self.check_o_noatime_allowed(current_task)?;
1383            }
1384
1385            // `flags` doesn't contain any information about the EXEC permission. Instead the syscalls
1386            // used to execute a file (`sys_execve` and `sys_execveat`) call `open()` with the EXEC
1387            // permission request in `access`.
1388            let mut permission_flags = PermissionFlags::from(access);
1389
1390            // The `APPEND` flag exists only in `flags`, to modify the behaviour of
1391            // `PermissionFlags::WRITE`
1392            if flags.contains(OpenFlags::APPEND) {
1393                permission_flags |= security::PermissionFlags::APPEND;
1394            }
1395
1396            // TODO: https://fxbug.dev/455782510 - Remove this once non-open() checks are fully
1397            // enforced.
1398            permission_flags |= security::PermissionFlags::FOR_OPEN;
1399
1400            self.check_access(
1401                locked,
1402                current_task,
1403                &namespace_node.mount,
1404                permission_flags,
1405                CheckAccessReason::InternalPermissionChecks,
1406                namespace_node,
1407            )?;
1408        }
1409
1410        let (mode, rdev) = {
1411            // Don't hold the info lock while calling into open_device or self.ops().
1412            // TODO: The mode and rdev are immutable and shouldn't require a lock to read.
1413            let info = self.info();
1414            (info.mode, info.rdev)
1415        };
1416
1417        match mode & FileMode::IFMT {
1418            FileMode::IFCHR => {
1419                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1420                    return error!(EACCES);
1421                }
1422                current_task.kernel().open_device(
1423                    locked,
1424                    current_task,
1425                    namespace_node,
1426                    flags,
1427                    rdev,
1428                    DeviceMode::Char,
1429                )
1430            }
1431            FileMode::IFBLK => {
1432                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1433                    return error!(EACCES);
1434                }
1435                current_task.kernel().open_device(
1436                    locked,
1437                    current_task,
1438                    namespace_node,
1439                    flags,
1440                    rdev,
1441                    DeviceMode::Block,
1442                )
1443            }
1444            FileMode::IFIFO => Pipe::open(locked, current_task, self.fifo(current_task), flags),
1445            // UNIX domain sockets can't be opened.
1446            FileMode::IFSOCK => error!(ENXIO),
1447            _ => self.create_file_ops(locked, current_task, flags),
1448        }
1449    }
1450
1451    pub fn lookup<L>(
1452        &self,
1453        locked: &mut Locked<L>,
1454        current_task: &CurrentTask,
1455        mount: &MountInfo,
1456        name: &FsStr,
1457    ) -> Result<FsNodeHandle, Errno>
1458    where
1459        L: LockEqualOrBefore<FileOpsCore>,
1460    {
1461        self.check_access(
1462            locked,
1463            current_task,
1464            mount,
1465            Access::EXEC,
1466            CheckAccessReason::InternalPermissionChecks,
1467            &[Auditable::Name(name), std::panic::Location::caller().into()],
1468        )?;
1469        let locked = locked.cast_locked::<FileOpsCore>();
1470        self.ops().lookup(locked, self, current_task, name)
1471    }
1472
1473    pub fn create_node<L>(
1474        &self,
1475        locked: &mut Locked<L>,
1476        current_task: &CurrentTask,
1477        mount: &MountInfo,
1478        name: &FsStr,
1479        mut mode: FileMode,
1480        dev: DeviceId,
1481        mut owner: FsCred,
1482    ) -> Result<FsNodeHandle, Errno>
1483    where
1484        L: LockEqualOrBefore<FileOpsCore>,
1485    {
1486        assert!(
1487            !matches!(mode.fmt(), FileMode::EMPTY | FileMode::IFLNK),
1488            "create_node with missing or symlink node type"
1489        );
1490
1491        self.check_access(
1492            locked,
1493            current_task,
1494            mount,
1495            Access::WRITE,
1496            CheckAccessReason::InternalPermissionChecks,
1497            security::Auditable::Name(name),
1498        )?;
1499
1500        if mode.is_dir() {
1501            // Even though the man page for mknod(2) says that mknod "cannot be used to create
1502            // directories" in starnix the mkdir syscall (`sys_mkdirat`) ends up calling
1503            // create_node.
1504            security::check_fs_node_mkdir_access(current_task, self, mode, name)?;
1505        } else {
1506            // https://man7.org/linux/man-pages/man2/mknod.2.html says on error EPERM:
1507            //
1508            //   mode requested creation of something other than a regular
1509            //   file, FIFO (named pipe), or UNIX domain socket, and the
1510            //   caller is not privileged (Linux: does not have the
1511            //   CAP_MKNOD capability); also returned if the filesystem
1512            //   containing pathname does not support the type of node
1513            //   requested.
1514            match mode.fmt() {
1515                FileMode::IFREG | FileMode::IFIFO | FileMode::IFSOCK => (),
1516                FileMode::IFCHR if dev == DeviceId::NONE => (),
1517                _ => security::check_task_capable(current_task, CAP_MKNOD)?,
1518            }
1519
1520            if mode.is_reg() {
1521                security::check_fs_node_create_access(current_task, self, mode, name)?;
1522            } else {
1523                security::check_fs_node_mknod_access(current_task, self, mode, name, dev)?;
1524            }
1525        }
1526
1527        // Propagate sticky bit(s) from parent directory to the child.
1528        self.update_metadata_for_child(current_task, &mut mode, &mut owner);
1529
1530        // Delegate to the `ops` implementation to actually create the node.
1531        let locked = locked.cast_locked::<FileOpsCore>();
1532        let new_node = if mode.is_dir() {
1533            self.ops().mkdir(locked, self, current_task, name, mode, owner)?
1534        } else {
1535            self.ops().mknod(locked, self, current_task, name, mode, dev, owner)?
1536        };
1537
1538        // Allow the LSM to apply a security label to the new node.
1539        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;
1540
1541        Ok(new_node)
1542    }
1543
1544    pub fn create_symlink<L>(
1545        &self,
1546        locked: &mut Locked<L>,
1547        current_task: &CurrentTask,
1548        mount: &MountInfo,
1549        name: &FsStr,
1550        target: &FsStr,
1551        owner: FsCred,
1552    ) -> Result<FsNodeHandle, Errno>
1553    where
1554        L: LockEqualOrBefore<FileOpsCore>,
1555    {
1556        self.check_access(
1557            locked,
1558            current_task,
1559            mount,
1560            Access::WRITE,
1561            CheckAccessReason::InternalPermissionChecks,
1562            security::Auditable::Name(name),
1563        )?;
1564        security::check_fs_node_symlink_access(current_task, self, name, target)?;
1565
1566        let locked = locked.cast_locked::<FileOpsCore>();
1567        let new_node =
1568            self.ops().create_symlink(locked, self, current_task, name, target, owner)?;
1569
1570        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;
1571
1572        Ok(new_node)
1573    }
1574
1575    /// Requests that the LSM initialise a security label for the `new_node`, and optionally provide
1576    /// an extended attribute to write to the file to persist it.  If no LSM is enabled, no extended
1577    /// attribute returned, or if the filesystem does not support extended attributes, then the call
1578    /// returns success. All other failure modes return an `Errno` that should be early-returned.
1579    fn init_new_node_security_on_create<L>(
1580        &self,
1581        locked: &mut Locked<L>,
1582        current_task: &CurrentTask,
1583        new_node: &FsNode,
1584        name: &FsStr,
1585    ) -> Result<(), Errno>
1586    where
1587        L: LockEqualOrBefore<FileOpsCore>,
1588    {
1589        let locked = locked.cast_locked::<FileOpsCore>();
1590        security::fs_node_init_on_create(current_task, &new_node, self, name)?
1591            .map(|xattr| {
1592                match new_node.ops().set_xattr(
1593                    locked,
1594                    &new_node,
1595                    current_task,
1596                    xattr.name,
1597                    xattr.value.as_slice().into(),
1598                    XattrOp::Create,
1599                ) {
1600                    Err(e) => {
1601                        if e.code == ENOTSUP {
1602                            // This should only occur if a task has an "fscreate" context set, and
1603                            // creates a new file in a filesystem that does not support xattrs.
1604                            Ok(())
1605                        } else {
1606                            Err(e)
1607                        }
1608                    }
1609                    result => result,
1610                }
1611            })
1612            .unwrap_or_else(|| Ok(()))
1613    }
1614
1615    pub fn create_tmpfile<L>(
1616        &self,
1617        locked: &mut Locked<L>,
1618        current_task: &CurrentTask,
1619        mount: &MountInfo,
1620        mut mode: FileMode,
1621        mut owner: FsCred,
1622        link_behavior: FsNodeLinkBehavior,
1623    ) -> Result<FsNodeHandle, Errno>
1624    where
1625        L: LockEqualOrBefore<FileOpsCore>,
1626    {
1627        self.check_access(
1628            locked,
1629            current_task,
1630            mount,
1631            Access::WRITE,
1632            CheckAccessReason::InternalPermissionChecks,
1633            security::Auditable::Location(std::panic::Location::caller()),
1634        )?;
1635        self.update_metadata_for_child(current_task, &mut mode, &mut owner);
1636        let node = self.ops().create_tmpfile(self, current_task, mode, owner)?;
1637        self.init_new_node_security_on_create(locked, current_task, &node, "".into())?;
1638        if link_behavior == FsNodeLinkBehavior::Disallowed {
1639            node.ensure_rare_data().link_behavior.set(link_behavior).unwrap();
1640        }
1641        Ok(node)
1642    }
1643
1644    // This method does not attempt to update the atime of the node.
1645    // Use `NamespaceNode::readlink` which checks the mount flags and updates the atime accordingly.
1646    pub fn readlink<L>(
1647        &self,
1648        locked: &mut Locked<L>,
1649        current_task: &CurrentTask,
1650    ) -> Result<SymlinkTarget, Errno>
1651    where
1652        L: LockEqualOrBefore<FileOpsCore>,
1653    {
1654        // TODO: 378864856 - Is there a permission check here other than security checks?
1655        security::check_fs_node_read_link_access(current_task, self)?;
1656        self.ops().readlink(locked.cast_locked::<FileOpsCore>(), self, current_task)
1657    }
1658
1659    pub fn link<L>(
1660        &self,
1661        locked: &mut Locked<L>,
1662        current_task: &CurrentTask,
1663        mount: &MountInfo,
1664        name: &FsStr,
1665        child: &FsNodeHandle,
1666    ) -> Result<FsNodeHandle, Errno>
1667    where
1668        L: LockEqualOrBefore<FileOpsCore>,
1669    {
1670        self.check_access(
1671            locked,
1672            current_task,
1673            mount,
1674            Access::WRITE,
1675            CheckAccessReason::InternalPermissionChecks,
1676            security::Auditable::Location(std::panic::Location::caller()),
1677        )?;
1678
1679        if child.is_dir() {
1680            return error!(EPERM);
1681        }
1682
1683        if let Some(child_rare_data) = child.rare_data.get() {
1684            if matches!(child_rare_data.link_behavior.get(), Some(FsNodeLinkBehavior::Disallowed)) {
1685                return error!(ENOENT);
1686            }
1687        }
1688
1689        // Check that `current_task` has permission to create the hard link.
1690        //
1691        // See description of /proc/sys/fs/protected_hardlinks in
1692        // https://man7.org/linux/man-pages/man5/proc.5.html for details of the security
1693        // vulnerabilities.
1694        //
1695        let (child_uid, mode) = {
1696            let info = child.info();
1697            (info.uid, info.mode)
1698        };
1699        // Check that the the filesystem UID of the calling process (`current_task`) is the same as
1700        // the UID of the existing file. The check can be bypassed if the calling process has
1701        // `CAP_FOWNER` capability.
1702        if child_uid != current_task.current_creds().fsuid
1703            && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
1704        {
1705            // If current_task is not the user of the existing file, it needs to have read and write
1706            // access to the existing file.
1707            child
1708                .check_access(
1709                    locked,
1710                    current_task,
1711                    mount,
1712                    Access::READ | Access::WRITE,
1713                    CheckAccessReason::InternalPermissionChecks,
1714                    security::Auditable::Name(name),
1715                )
1716                .map_err(|e| {
1717                    // `check_access(..)` returns EACCES when the access rights doesn't match - change
1718                    // it to EPERM to match Linux standards.
1719                    if e == EACCES { errno!(EPERM) } else { e }
1720                })?;
1721            // There are also security issues that may arise when users link to setuid, setgid, or
1722            // special files.
1723            if mode.contains(FileMode::ISGID | FileMode::IXGRP) {
1724                return error!(EPERM);
1725            };
1726            if mode.contains(FileMode::ISUID) {
1727                return error!(EPERM);
1728            };
1729            if !mode.contains(FileMode::IFREG) {
1730                return error!(EPERM);
1731            };
1732        }
1733
1734        security::check_fs_node_link_access(current_task, self, child)?;
1735
1736        let locked = locked.cast_locked::<FileOpsCore>();
1737        self.ops().link(locked, self, current_task, name, child)?;
1738        Ok(child.clone())
1739    }
1740
1741    pub fn unlink<L>(
1742        &self,
1743        locked: &mut Locked<L>,
1744        current_task: &CurrentTask,
1745        mount: &MountInfo,
1746        name: &FsStr,
1747        child: &FsNodeHandle,
1748    ) -> Result<(), Errno>
1749    where
1750        L: LockEqualOrBefore<FileOpsCore>,
1751    {
1752        // The user must be able to search and write to the directory.
1753        self.check_access(
1754            locked,
1755            current_task,
1756            mount,
1757            Access::EXEC | Access::WRITE,
1758            CheckAccessReason::InternalPermissionChecks,
1759            security::Auditable::Name(name),
1760        )?;
1761        self.check_sticky_bit(current_task, child)?;
1762        if child.is_dir() {
1763            security::check_fs_node_rmdir_access(current_task, self, child, name)?;
1764        } else {
1765            security::check_fs_node_unlink_access(current_task, self, child, name)?;
1766        }
1767        let locked = locked.cast_locked::<FileOpsCore>();
1768        self.ops().unlink(locked, self, current_task, name, child)?;
1769        self.update_ctime_mtime();
1770        Ok(())
1771    }
1772
1773    pub fn truncate<L>(
1774        &self,
1775        locked: &mut Locked<L>,
1776        current_task: &CurrentTask,
1777        mount: &MountInfo,
1778        length: u64,
1779    ) -> Result<(), Errno>
1780    where
1781        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1782    {
1783        let mut locked = locked.cast_locked::<BeforeFsNodeAppend>();
1784        if self.is_dir() {
1785            return error!(EISDIR);
1786        }
1787        self.check_access(
1788            &mut locked,
1789            current_task,
1790            mount,
1791            Access::WRITE,
1792            CheckAccessReason::InternalPermissionChecks,
1793            security::Auditable::Location(std::panic::Location::caller()),
1794        )?;
1795
1796        let (guard, locked) = self.ops().append_lock_write(&mut locked, self, current_task)?;
1797        self.truncate_locked(locked, &guard, current_task, length)
1798    }
1799
1800    /// Avoid calling this method directly. You probably want to call `FileObject::ftruncate()`
1801    /// which will also perform all file-descriptor based verifications.
1802    pub fn ftruncate<L>(
1803        &self,
1804        locked: &mut Locked<L>,
1805        current_task: &CurrentTask,
1806        length: u64,
1807    ) -> Result<(), Errno>
1808    where
1809        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1810    {
1811        let locked = locked.cast_locked::<BeforeFsNodeAppend>();
1812
1813        if self.is_dir() {
1814            // When truncating a file descriptor, if the descriptor references a directory,
1815            // return EINVAL. This is different from the truncate() syscall which returns EISDIR.
1816            //
1817            // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#ERRORS
1818            return error!(EINVAL);
1819        }
1820
1821        // For ftruncate, we do not need to check that the file node is writable.
1822        //
1823        // The file object that calls this method must verify that the file was opened
1824        // with write permissions.
1825        //
1826        // This matters because a file could be opened with O_CREAT + O_RDWR + 0444 mode.
1827        // The file descriptor returned from such an operation can be truncated, even
1828        // though the file was created with a read-only mode.
1829        //
1830        // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#DESCRIPTION
1831        // which says:
1832        //
1833        // "With ftruncate(), the file must be open for writing; with truncate(),
1834        // the file must be writable."
1835
1836        let (guard, locked) = self.ops().append_lock_write(locked, self, current_task)?;
1837        self.truncate_locked(locked, &guard, current_task, length)
1838    }
1839
1840    // Called by `truncate` and `ftruncate` above.
1841    pub fn truncate_locked<L>(
1842        &self,
1843        locked: &mut Locked<L>,
1844        guard: &AppendLockWriteGuard<'_>,
1845        current_task: &CurrentTask,
1846        length: u64,
1847    ) -> Result<(), Errno>
1848    where
1849        L: LockEqualOrBefore<FileOpsCore>,
1850    {
1851        let locked = locked.cast_locked::<FileOpsCore>();
1852        if length > MAX_LFS_FILESIZE as u64 {
1853            return error!(EINVAL);
1854        }
1855        if length > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1856            send_standard_signal(locked, current_task, SignalInfo::kernel(SIGXFSZ));
1857            return error!(EFBIG);
1858        }
1859        self.clear_suid_and_sgid_bits(locked, current_task)?;
1860
1861        self.ops().truncate(locked, guard, self, current_task, length)?;
1862        self.update_ctime_mtime();
1863        Ok(())
1864    }
1865
1866    /// Avoid calling this method directly. You probably want to call `FileObject::fallocate()`
1867    /// which will also perform additional verifications.
1868    pub fn fallocate<L>(
1869        &self,
1870        locked: &mut Locked<L>,
1871        current_task: &CurrentTask,
1872        mode: FallocMode,
1873        offset: u64,
1874        length: u64,
1875    ) -> Result<(), Errno>
1876    where
1877        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1878    {
1879        let mut locked = locked.cast_locked::<BeforeFsNodeAppend>();
1880        let (guard, locked) = self.ops().append_lock_write(&mut locked, self, current_task)?;
1881        self.fallocate_locked(locked, &guard, current_task, mode, offset, length)
1882    }
1883
1884    pub fn fallocate_locked<L>(
1885        &self,
1886        locked: &mut Locked<L>,
1887        guard: &AppendLockWriteGuard<'_>,
1888        current_task: &CurrentTask,
1889        mode: FallocMode,
1890        offset: u64,
1891        length: u64,
1892    ) -> Result<(), Errno>
1893    where
1894        L: LockEqualOrBefore<FileOpsCore>,
1895    {
1896        let locked = locked.cast_locked::<FileOpsCore>();
1897        let allocate_size = checked_add_offset_and_length(offset as usize, length as usize)
1898            .map_err(|_| errno!(EFBIG))? as u64;
1899        if allocate_size > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1900            send_standard_signal(locked, current_task, SignalInfo::kernel(SIGXFSZ));
1901            return error!(EFBIG);
1902        }
1903
1904        self.clear_suid_and_sgid_bits(locked, current_task)?;
1905
1906        self.ops().allocate(locked, guard, self, current_task, mode, offset, length)?;
1907        self.update_ctime_mtime();
1908        Ok(())
1909    }
1910
1911    fn update_metadata_for_child(
1912        &self,
1913        current_task: &CurrentTask,
1914        mode: &mut FileMode,
1915        owner: &mut FsCred,
1916    ) {
1917        // The setgid bit on a directory causes the gid to be inherited by new children and the
1918        // setgid bit to be inherited by new child directories. See SetgidDirTest in gvisor.
1919        {
1920            let self_info = self.info();
1921            if self_info.mode.contains(FileMode::ISGID) {
1922                owner.gid = self_info.gid;
1923                if mode.is_dir() {
1924                    *mode |= FileMode::ISGID;
1925                }
1926            }
1927        }
1928
1929        if !mode.is_dir() {
1930            // https://man7.org/linux/man-pages/man7/inode.7.html says:
1931            //
1932            //   For an executable file, the set-group-ID bit causes the
1933            //   effective group ID of a process that executes the file to change
1934            //   as described in execve(2).
1935            //
1936            // We need to check whether the current task has permission to create such a file.
1937            // See a similar check in `FsNode::chmod`.
1938            let current_creds = current_task.current_creds();
1939            if owner.gid != current_creds.fsgid
1940                && !current_creds.is_in_group(owner.gid)
1941                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
1942            {
1943                *mode &= !FileMode::ISGID;
1944            }
1945        }
1946    }
1947
1948    /// Checks if O_NOATIME is allowed,
1949    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
1950        // Per open(2),
1951        //
1952        //   O_NOATIME (since Linux 2.6.8)
1953        //      ...
1954        //
1955        //      This flag can be employed only if one of the following
1956        //      conditions is true:
1957        //
1958        //      *  The effective UID of the process matches the owner UID
1959        //         of the file.
1960        //
1961        //      *  The calling process has the CAP_FOWNER capability in
1962        //         its user namespace and the owner UID of the file has a
1963        //         mapping in the namespace.
1964        if current_task.current_creds().fsuid != self.info().uid {
1965            security::check_task_capable(current_task, CAP_FOWNER)?;
1966        }
1967        Ok(())
1968    }
1969
1970    pub fn default_check_access_impl(
1971        &self,
1972        current_task: &CurrentTask,
1973        permission_flags: security::PermissionFlags,
1974        reason: CheckAccessReason,
1975        info: RwLockReadGuard<'_, FsNodeInfo>,
1976        audit_context: Auditable<'_>,
1977    ) -> Result<(), Errno> {
1978        let (node_uid, node_gid, mode) = (info.uid, info.gid, info.mode);
1979        std::mem::drop(info);
1980        if let CheckAccessReason::ChangeTimestamps { now } = reason {
1981            // To set the timestamps to the current time the caller must either have write access to
1982            // the file, be the file owner, or hold the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
1983            // To set the timestamps to other values the caller must either be the file owner or hold
1984            // the CAP_FOWNER capability.
1985            if current_task.current_creds().fsuid == node_uid {
1986                return Ok(());
1987            }
1988            if now {
1989                if security::is_task_capable_noaudit(current_task, CAP_FOWNER) {
1990                    return Ok(());
1991                }
1992            } else {
1993                security::check_task_capable(current_task, CAP_FOWNER)?;
1994                return Ok(());
1995            }
1996        }
1997        check_access(self, current_task, permission_flags, node_uid, node_gid, mode)?;
1998        security::fs_node_permission(current_task, self, permission_flags, audit_context)
1999    }
2000
2001    /// Check whether the node can be accessed in the current context with the specified access
2002    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
2003    /// owner or is in the file's group.
2004    pub fn check_access<'a, L>(
2005        &self,
2006        locked: &mut Locked<L>,
2007        current_task: &CurrentTask,
2008        mount: &MountInfo,
2009        access: impl Into<security::PermissionFlags>,
2010        reason: CheckAccessReason,
2011        audit_context: impl Into<security::Auditable<'a>>,
2012    ) -> Result<(), Errno>
2013    where
2014        L: LockEqualOrBefore<FileOpsCore>,
2015    {
2016        let mut permission_flags = access.into();
2017        if permission_flags.contains(security::PermissionFlags::WRITE)
2018            && !self.info().mode.is_special()
2019        {
2020            mount.check_readonly_filesystem()?;
2021        }
2022        if permission_flags.contains(security::PermissionFlags::EXEC) && !self.is_dir() {
2023            mount.check_noexec_filesystem()?;
2024        }
2025        if reason == CheckAccessReason::Access {
2026            permission_flags |= PermissionFlags::ACCESS;
2027        }
2028        self.ops().check_access(
2029            locked.cast_locked::<FileOpsCore>(),
2030            self,
2031            current_task,
2032            permission_flags,
2033            &self.info,
2034            reason,
2035            audit_context.into(),
2036        )
2037    }
2038
2039    /// Check whether the stick bit, `S_ISVTX`, forbids the `current_task` from removing the given
2040    /// `child`. If this node has `S_ISVTX`, then either the child must be owned by the `fsuid` of
2041    /// `current_task` or `current_task` must have `CAP_FOWNER`.
2042    pub fn check_sticky_bit(
2043        &self,
2044        current_task: &CurrentTask,
2045        child: &FsNodeHandle,
2046    ) -> Result<(), Errno> {
2047        if self.info().mode.contains(FileMode::ISVTX)
2048            && child.info().uid != current_task.current_creds().fsuid
2049        {
2050            security::check_task_capable(current_task, CAP_FOWNER)?;
2051        }
2052        Ok(())
2053    }
2054
2055    pub fn fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
2056        assert!(self.is_fifo());
2057        self.ensure_rare_data().ensure_fifo(current_task)
2058    }
2059
2060    /// Returns the UNIX domain socket bound to this node, if any.
2061    pub fn bound_socket(&self) -> Option<&SocketHandle> {
2062        if let Some(rare_data) = self.rare_data.get() { rare_data.bound_socket.get() } else { None }
2063    }
2064
    /// Register the provided socket as the UNIX domain socket bound to this node.
    ///
    /// The socket is stored in the `bound_socket` slot of this node's rare data, which is
    /// obtained through `ensure_rare_data()`.
    ///
    /// # Panics
    ///
    /// It is a fatal error to call this method again if it has already been called on this
    /// node: the method panics when storing the socket fails.
    pub fn set_bound_socket(&self, socket: SocketHandle) {
        // `set` reports failure through its result; the assertion turns a failure into a panic.
        assert!(self.ensure_rare_data().bound_socket.set(socket).is_ok());
    }
2071
2072    pub fn update_attributes<L, F>(
2073        &self,
2074        locked: &mut Locked<L>,
2075        current_task: &CurrentTask,
2076        mutator: F,
2077    ) -> Result<(), Errno>
2078    where
2079        L: LockEqualOrBefore<FileOpsCore>,
2080        F: FnOnce(&mut FsNodeInfo) -> Result<(), Errno>,
2081    {
2082        let mut info = self.info.write();
2083        let mut new_info = info.clone();
2084        mutator(&mut new_info)?;
2085
2086        let new_access = new_info.mode.user_access()
2087            | new_info.mode.group_access()
2088            | new_info.mode.other_access();
2089
2090        if new_access.intersects(Access::EXEC) {
2091            let write_guard_state = self.write_guard_state.lock();
2092            if let Ok(seals) = write_guard_state.get_seals() {
2093                if seals.contains(SealFlags::NO_EXEC) {
2094                    return error!(EPERM);
2095                }
2096            }
2097        }
2098
2099        // `mutator`s should not update the attribute change time, which is managed by this API.
2100        assert_eq!(info.time_status_change, new_info.time_status_change);
2101        if *info == new_info {
2102            return Ok(());
2103        }
2104        new_info.time_status_change = utc::utc_now();
2105
2106        let mut has = zxio_node_attr_has_t { ..Default::default() };
2107        has.modification_time = info.time_modify != new_info.time_modify;
2108        has.access_time = info.time_access != new_info.time_access;
2109        has.mode = info.mode != new_info.mode;
2110        has.uid = info.uid != new_info.uid;
2111        has.gid = info.gid != new_info.gid;
2112        has.rdev = info.rdev != new_info.rdev;
2113        has.casefold = info.casefold != new_info.casefold;
2114        has.wrapping_key_id = info.wrapping_key_id != new_info.wrapping_key_id;
2115
2116        security::check_fs_node_setattr_access(current_task, &self, &has)?;
2117
2118        // Call `update_attributes(..)` to persist the changes for the following fields.
2119        if has.modification_time
2120            || has.access_time
2121            || has.mode
2122            || has.uid
2123            || has.gid
2124            || has.rdev
2125            || has.casefold
2126            || has.wrapping_key_id
2127        {
2128            let locked = locked.cast_locked::<FileOpsCore>();
2129            self.ops().update_attributes(locked, self, current_task, &new_info, has)?;
2130        }
2131
2132        *info = new_info;
2133        Ok(())
2134    }
2135
2136    /// Set the permissions on this FsNode to the given values.
2137    ///
2138    /// Does not change the IFMT of the node.
2139    pub fn chmod<L>(
2140        &self,
2141        locked: &mut Locked<L>,
2142        current_task: &CurrentTask,
2143        mount: &MountInfo,
2144        mut mode: FileMode,
2145    ) -> Result<(), Errno>
2146    where
2147        L: LockEqualOrBefore<FileOpsCore>,
2148    {
2149        mount.check_readonly_filesystem()?;
2150        self.update_attributes(locked, current_task, |info| {
2151            let current_creds = current_task.current_creds();
2152            if info.uid != current_creds.euid {
2153                security::check_task_capable(current_task, CAP_FOWNER)?;
2154            } else if info.gid != current_creds.egid
2155                && !current_creds.is_in_group(info.gid)
2156                && mode.intersects(FileMode::ISGID)
2157                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
2158            {
2159                mode &= !FileMode::ISGID;
2160            }
2161            info.chmod(mode);
2162            Ok(())
2163        })
2164    }
2165
2166    /// Sets the owner and/or group on this FsNode.
2167    pub fn chown<L>(
2168        &self,
2169        locked: &mut Locked<L>,
2170        current_task: &CurrentTask,
2171        mount: &MountInfo,
2172        owner: Option<uid_t>,
2173        group: Option<gid_t>,
2174    ) -> Result<(), Errno>
2175    where
2176        L: LockEqualOrBefore<FileOpsCore>,
2177    {
2178        mount.check_readonly_filesystem()?;
2179        self.update_attributes(locked, current_task, |info| {
2180            if security::is_task_capable_noaudit(current_task, CAP_CHOWN) {
2181                info.chown(owner, group);
2182                return Ok(());
2183            }
2184
2185            // Nobody can change the owner.
2186            if let Some(uid) = owner {
2187                if info.uid != uid {
2188                    return error!(EPERM);
2189                }
2190            }
2191
2192            let (euid, is_in_group) = {
2193                let current_creds = current_task.current_creds();
2194                (current_creds.euid, group.map(|gid| current_creds.is_in_group(gid)))
2195            };
2196
2197            // The owner can change the group.
2198            if info.uid == euid {
2199                // To a group that it belongs.
2200                if let Some(is_in_group) = is_in_group {
2201                    if !is_in_group {
2202                        return error!(EPERM);
2203                    }
2204                }
2205                info.chown(None, group);
2206                return Ok(());
2207            }
2208
2209            // Any other user can call chown(file, -1, -1)
2210            if owner.is_some() || group.is_some() {
2211                return error!(EPERM);
2212            }
2213
2214            // But not on set-user-ID or set-group-ID files.
2215            // If we were to chown them, they would drop the set-ID bit.
2216            if info.mode.is_reg()
2217                && (info.mode.contains(FileMode::ISUID)
2218                    || info.mode.contains(FileMode::ISGID | FileMode::IXGRP))
2219            {
2220                return error!(EPERM);
2221            }
2222
2223            info.chown(None, None);
2224            Ok(())
2225        })
2226    }
2227
2228    /// Forcefully change the owner and group of this node.
2229    ///
2230    /// # Safety
2231    ///
2232    /// This function skips all the security checks and just updates the owner and group. Also, does
2233    /// not check if the filesystem is read-only and does not update the attribute change time.
2234    ///
2235    /// This function is used to set the owner and group of /proc/pid to the credentials of the
2236    /// current task. Please consider carefully whether you want to use this function for another
2237    /// purpose.
2238    pub unsafe fn force_chown(&self, creds: FsCred) {
2239        self.update_info(|info| {
2240            info.chown(Some(creds.uid), Some(creds.gid));
2241        });
2242    }
2243
2244    /// Whether this node is a regular file.
2245    pub fn is_reg(&self) -> bool {
2246        self.info().mode.is_reg()
2247    }
2248
2249    /// Whether this node is a directory.
2250    pub fn is_dir(&self) -> bool {
2251        self.info().mode.is_dir()
2252    }
2253
2254    /// Whether this node is a socket.
2255    pub fn is_sock(&self) -> bool {
2256        self.info().mode.is_sock()
2257    }
2258
2259    /// Whether this node is a FIFO.
2260    pub fn is_fifo(&self) -> bool {
2261        self.info().mode.is_fifo()
2262    }
2263
2264    /// Whether this node is a symbolic link.
2265    pub fn is_lnk(&self) -> bool {
2266        self.info().mode.is_lnk()
2267    }
2268
2269    pub fn dev(&self) -> DeviceId {
2270        self.fs().dev_id
2271    }
2272
2273    pub fn stat<L>(
2274        &self,
2275        locked: &mut Locked<L>,
2276        current_task: &CurrentTask,
2277    ) -> Result<uapi::stat, Errno>
2278    where
2279        L: LockEqualOrBefore<FileOpsCore>,
2280    {
2281        security::check_fs_node_getattr_access(current_task, self)?;
2282
2283        let info = self.fetch_and_refresh_info(locked, current_task)?;
2284
2285        let time_to_kernel_timespec_pair = |t| {
2286            let timespec { tv_sec, tv_nsec } = timespec_from_time(t);
2287            let time = tv_sec.try_into().map_err(|_| errno!(EINVAL))?;
2288            let time_nsec = tv_nsec.try_into().map_err(|_| errno!(EINVAL))?;
2289            Ok((time, time_nsec))
2290        };
2291
2292        let (st_atime, st_atime_nsec) = time_to_kernel_timespec_pair(info.time_access)?;
2293        let (st_mtime, st_mtime_nsec) = time_to_kernel_timespec_pair(info.time_modify)?;
2294        let (st_ctime, st_ctime_nsec) = time_to_kernel_timespec_pair(info.time_status_change)?;
2295
2296        Ok(uapi::stat {
2297            st_dev: self.dev().bits(),
2298            st_ino: self.ino,
2299            st_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2300            st_mode: info.mode.bits(),
2301            st_uid: info.uid,
2302            st_gid: info.gid,
2303            st_rdev: info.rdev.bits(),
2304            st_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2305            st_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2306            st_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2307            st_atime,
2308            st_atime_nsec,
2309            st_mtime,
2310            st_mtime_nsec,
2311            st_ctime,
2312            st_ctime_nsec,
2313            ..Default::default()
2314        })
2315    }
2316
2317    /// Returns the current size of the file.  This is inherently racy, so any caller that
2318    /// might want to use the value returned should hold their own locks if necessary.  For
2319    /// example, if using the value here to implement append (which is the case at the time
2320    /// of writing this comment), locks must be held to prevent the file size being changed
2321    /// concurrently.
2322    // TODO(https://fxbug.dev/454730248): This is probably the wrong way to implement O_APPEND.
2323    pub fn get_size<L>(
2324        &self,
2325        locked: &mut Locked<L>,
2326        current_task: &CurrentTask,
2327    ) -> Result<usize, Errno>
2328    where
2329        L: LockEqualOrBefore<FileOpsCore>,
2330    {
2331        self.ops().get_size(locked.cast_locked::<FileOpsCore>(), self, current_task)
2332    }
2333
2334    fn statx_timestamp_from_time(time: UtcInstant) -> statx_timestamp {
2335        let nanos = time.into_nanos();
2336        statx_timestamp {
2337            tv_sec: nanos / NANOS_PER_SECOND,
2338            tv_nsec: (nanos % NANOS_PER_SECOND) as u32,
2339            ..Default::default()
2340        }
2341    }
2342
2343    pub fn statx<L>(
2344        &self,
2345        locked: &mut Locked<L>,
2346        current_task: &CurrentTask,
2347        flags: StatxFlags,
2348        mask: u32,
2349    ) -> Result<statx, Errno>
2350    where
2351        L: LockEqualOrBefore<FileOpsCore>,
2352    {
2353        security::check_fs_node_getattr_access(current_task, self)?;
2354
2355        // Ignore mask for now and fill in all of the fields.
2356        let info = if flags.contains(StatxFlags::AT_STATX_DONT_SYNC) {
2357            self.info()
2358        } else {
2359            self.fetch_and_refresh_info(locked, current_task)?
2360        };
2361        if mask & STATX__RESERVED == STATX__RESERVED {
2362            return error!(EINVAL);
2363        }
2364
2365        track_stub!(TODO("https://fxbug.dev/302594110"), "statx attributes");
2366        let stx_mnt_id = 0;
2367        let mut stx_attributes = 0;
2368        let stx_attributes_mask = STATX_ATTR_VERITY as u64;
2369
2370        if matches!(*self.fsverity.lock(), FsVerityState::FsVerity) {
2371            stx_attributes |= STATX_ATTR_VERITY as u64;
2372        }
2373
2374        Ok(statx {
2375            stx_mask: STATX_NLINK
2376                | STATX_UID
2377                | STATX_GID
2378                | STATX_ATIME
2379                | STATX_MTIME
2380                | STATX_CTIME
2381                | STATX_INO
2382                | STATX_SIZE
2383                | STATX_BLOCKS
2384                | STATX_BASIC_STATS,
2385            stx_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2386            stx_attributes,
2387            stx_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2388            stx_uid: info.uid,
2389            stx_gid: info.gid,
2390            stx_mode: info.mode.bits().try_into().map_err(|_| errno!(EINVAL))?,
2391            stx_ino: self.ino,
2392            stx_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2393            stx_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2394            stx_attributes_mask,
2395            stx_ctime: Self::statx_timestamp_from_time(info.time_status_change),
2396            stx_mtime: Self::statx_timestamp_from_time(info.time_modify),
2397            stx_atime: Self::statx_timestamp_from_time(info.time_access),
2398
2399            stx_rdev_major: info.rdev.major(),
2400            stx_rdev_minor: info.rdev.minor(),
2401
2402            stx_dev_major: self.fs().dev_id.major(),
2403            stx_dev_minor: self.fs().dev_id.minor(),
2404            stx_mnt_id,
2405            ..Default::default()
2406        })
2407    }
2408
2409    /// Checks whether `current_task` has capabilities required for the specified `access` to the
2410    /// extended attribute `name`.
2411    fn check_xattr_access<L>(
2412        &self,
2413        locked: &mut Locked<L>,
2414        current_task: &CurrentTask,
2415        mount: &MountInfo,
2416        name: &FsStr,
2417        access: Access,
2418    ) -> Result<(), Errno>
2419    where
2420        L: LockEqualOrBefore<FileOpsCore>,
2421    {
2422        assert!(access == Access::READ || access == Access::WRITE);
2423
2424        let enodata_if_read =
2425            |e: Errno| if access == Access::READ && e.code == EPERM { errno!(ENODATA) } else { e };
2426
2427        // man xattr(7) describes the different access checks applied to each extended attribute
2428        // namespace.
2429        if name.starts_with(XATTR_USER_PREFIX.to_bytes()) {
2430            {
2431                let info = self.info();
2432                if !info.mode.is_reg() && !info.mode.is_dir() {
2433                    return Err(enodata_if_read(errno!(EPERM)));
2434                }
2435            }
2436
2437            // TODO: https://fxbug.dev/460734830 - Perform capability check(s) if file has sticky
2438            // bit set.
2439
2440            self.check_access(
2441                locked,
2442                current_task,
2443                mount,
2444                access,
2445                CheckAccessReason::InternalPermissionChecks,
2446                security::Auditable::Name(name),
2447            )?;
2448        } else if name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()) {
2449            // Trusted extended attributes require `CAP_SYS_ADMIN` to read or write.
2450            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2451        } else if name.starts_with(XATTR_SYSTEM_PREFIX.to_bytes()) {
2452            // System extended attributes have attribute-specific access policy.
2453            // TODO: https://fxbug.dev/460734830 -  Revise how system extended attributes are
2454            // access-controlled.
2455            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2456        } else if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2457            if access == Access::WRITE {
2458                // Writes require `CAP_SYS_ADMIN`, unless the LSM owning `name` specifies to skip.
2459                if !security::fs_node_xattr_skipcap(current_task, name) {
2460                    security::check_task_capable(current_task, CAP_SYS_ADMIN)
2461                        .map_err(enodata_if_read)?;
2462                }
2463            }
2464        } else {
2465            panic!("Unknown extended attribute prefix: {}", name);
2466        }
2467        Ok(())
2468    }
2469
2470    pub fn get_xattr<L>(
2471        &self,
2472        locked: &mut Locked<L>,
2473        current_task: &CurrentTask,
2474        mount: &MountInfo,
2475        name: &FsStr,
2476        max_size: usize,
2477    ) -> Result<ValueOrSize<FsString>, Errno>
2478    where
2479        L: LockEqualOrBefore<FileOpsCore>,
2480    {
2481        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2482        self.check_xattr_access(locked, current_task, mount, name, Access::READ)?;
2483
2484        // LSM access checks must be performed after discretionary checks.
2485        security::check_fs_node_getxattr_access(current_task, self, name)?;
2486
2487        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2488            // If the attribute is in the security.* domain then allow the LSM to handle the
2489            // request, or to delegate to `FsNodeOps::get_xattr()`.
2490            security::fs_node_getsecurity(locked, current_task, self, name, max_size)
2491        } else {
2492            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2493            self.ops().get_xattr(
2494                locked.cast_locked::<FileOpsCore>(),
2495                self,
2496                current_task,
2497                name,
2498                max_size,
2499            )
2500        }
2501    }
2502
2503    pub fn set_xattr<L>(
2504        &self,
2505        locked: &mut Locked<L>,
2506        current_task: &CurrentTask,
2507        mount: &MountInfo,
2508        name: &FsStr,
2509        value: &FsStr,
2510        op: XattrOp,
2511    ) -> Result<(), Errno>
2512    where
2513        L: LockEqualOrBefore<FileOpsCore>,
2514    {
2515        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2516        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2517
2518        // LSM access checks must be performed after discretionary checks.
2519        security::check_fs_node_setxattr_access(current_task, self, name, value, op)?;
2520
2521        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2522            // If the attribute is in the security.* domain then allow the LSM to handle the
2523            // request, or to delegate to `FsNodeOps::set_xattr()`.
2524            security::fs_node_setsecurity(locked, current_task, self, name, value, op)
2525        } else {
2526            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2527            self.ops().set_xattr(
2528                locked.cast_locked::<FileOpsCore>(),
2529                self,
2530                current_task,
2531                name,
2532                value,
2533                op,
2534            )
2535        }
2536    }
2537
2538    pub fn remove_xattr<L>(
2539        &self,
2540        locked: &mut Locked<L>,
2541        current_task: &CurrentTask,
2542        mount: &MountInfo,
2543        name: &FsStr,
2544    ) -> Result<(), Errno>
2545    where
2546        L: LockEqualOrBefore<FileOpsCore>,
2547    {
2548        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2549        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2550
2551        // LSM access checks must be performed after discretionary checks.
2552        security::check_fs_node_removexattr_access(current_task, self, name)?;
2553        self.ops().remove_xattr(locked.cast_locked::<FileOpsCore>(), self, current_task, name)
2554    }
2555
2556    pub fn list_xattrs<L>(
2557        &self,
2558        locked: &mut Locked<L>,
2559        current_task: &CurrentTask,
2560        max_size: usize,
2561    ) -> Result<ValueOrSize<Vec<FsString>>, Errno>
2562    where
2563        L: LockEqualOrBefore<FileOpsCore>,
2564    {
2565        security::check_fs_node_listxattr_access(current_task, self)?;
2566        Ok(self
2567            .ops()
2568            .list_xattrs(locked.cast_locked::<FileOpsCore>(), self, current_task, max_size)?
2569            .map(|mut v| {
2570                // Extended attributes may be listed even if the caller would not be able to read
2571                // (or modify) the attribute's value.
2572                // trusted.* attributes are only accessible with CAP_SYS_ADMIN and are omitted by
2573                // `listxattr()` unless the caller holds CAP_SYS_ADMIN.
2574                if !security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN) {
2575                    v.retain(|name| !name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()));
2576                }
2577                v
2578            }))
2579    }
2580
2581    /// Returns current `FsNodeInfo`.
2582    pub fn info(&self) -> RwLockReadGuard<'_, FsNodeInfo> {
2583        self.info.read()
2584    }
2585
2586    /// Refreshes the `FsNodeInfo` if necessary and returns a read guard.
2587    pub fn fetch_and_refresh_info<L>(
2588        &self,
2589        locked: &mut Locked<L>,
2590        current_task: &CurrentTask,
2591    ) -> Result<RwLockReadGuard<'_, FsNodeInfo>, Errno>
2592    where
2593        L: LockEqualOrBefore<FileOpsCore>,
2594    {
2595        self.ops().fetch_and_refresh_info(
2596            locked.cast_locked::<FileOpsCore>(),
2597            self,
2598            current_task,
2599            &self.info,
2600        )
2601    }
2602
2603    pub fn update_info<F, T>(&self, mutator: F) -> T
2604    where
2605        F: FnOnce(&mut FsNodeInfo) -> T,
2606    {
2607        let mut info = self.info.write();
2608        mutator(&mut info)
2609    }
2610
2611    /// Clear the SUID and SGID bits unless the `current_task` has `CAP_FSETID`
2612    pub fn clear_suid_and_sgid_bits<L>(
2613        &self,
2614        locked: &mut Locked<L>,
2615        current_task: &CurrentTask,
2616    ) -> Result<(), Errno>
2617    where
2618        L: LockEqualOrBefore<FileOpsCore>,
2619    {
2620        if !security::is_task_capable_noaudit(current_task, CAP_FSETID) {
2621            self.update_attributes(locked, current_task, |info| {
2622                info.clear_suid_and_sgid_bits();
2623                Ok(())
2624            })?;
2625        }
2626        Ok(())
2627    }
2628
2629    /// Update the ctime and mtime of a file to now.
2630    pub fn update_ctime_mtime(&self) {
2631        if self.fs().manages_timestamps() {
2632            return;
2633        }
2634        self.update_info(|info| {
2635            let now = utc::utc_now();
2636            info.time_status_change = now;
2637            info.time_modify = now;
2638        });
2639    }
2640
2641    /// Update the ctime of a file to now.
2642    pub fn update_ctime(&self) {
2643        if self.fs().manages_timestamps() {
2644            return;
2645        }
2646        self.update_info(|info| {
2647            let now = utc::utc_now();
2648            info.time_status_change = now;
2649        });
2650    }
2651
2652    /// Update the atime and mtime if the `current_task` has write access, is the file owner, or
2653    /// holds either the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
2654    pub fn update_atime_mtime<L>(
2655        &self,
2656        locked: &mut Locked<L>,
2657        current_task: &CurrentTask,
2658        mount: &MountInfo,
2659        atime: TimeUpdateType,
2660        mtime: TimeUpdateType,
2661    ) -> Result<(), Errno>
2662    where
2663        L: LockEqualOrBefore<FileOpsCore>,
2664    {
2665        // If the filesystem is read-only, this always fail.
2666        mount.check_readonly_filesystem()?;
2667
2668        let now = matches!((atime, mtime), (TimeUpdateType::Now, TimeUpdateType::Now));
2669        self.check_access(
2670            locked,
2671            current_task,
2672            mount,
2673            Access::WRITE,
2674            CheckAccessReason::ChangeTimestamps { now },
2675            security::Auditable::Location(std::panic::Location::caller()),
2676        )?;
2677
2678        if !matches!((atime, mtime), (TimeUpdateType::Omit, TimeUpdateType::Omit)) {
2679            // This function is called by `utimes(..)` which will update the access and
2680            // modification time. We need to call `update_attributes()` to update the mtime of
2681            // filesystems that manages file timestamps.
2682            self.update_attributes(locked, current_task, |info| {
2683                let now = utc::utc_now();
2684                let get_time = |time: TimeUpdateType| match time {
2685                    TimeUpdateType::Now => Some(now),
2686                    TimeUpdateType::Time(t) => Some(t),
2687                    TimeUpdateType::Omit => None,
2688                };
2689                if let Some(time) = get_time(atime) {
2690                    info.time_access = time;
2691                }
2692                if let Some(time) = get_time(mtime) {
2693                    info.time_modify = time;
2694                }
2695                Ok(())
2696            })?;
2697        }
2698        Ok(())
2699    }
2700
2701    /// Returns a string describing this `FsNode` in the format used by "/proc/../fd" for anonymous
2702    /// file descriptors. By default this is in the form:
2703    ///   <class>:[<node_id>]
2704    /// though `FsNodeOps` may customize this as required.
2705    pub fn internal_name(&self) -> FsString {
2706        if let Some(name) = self.ops().internal_name(self) {
2707            return name;
2708        };
2709        let class = if self.is_sock() {
2710            "socket"
2711        } else if self.is_fifo() {
2712            "pipe"
2713        } else {
2714            "file"
2715        };
2716        format!("{}:[{}]", class, self.ino).into()
2717    }
2718
2719    /// The key used to identify this node in the file system's node cache.
2720    ///
2721    /// For many file systems, this will be the same as the inode number. However, some file
2722    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
2723    pub fn node_key(&self) -> ino_t {
2724        self.ops().node_key(self)
2725    }
2726
2727    fn ensure_rare_data(&self) -> &FsNodeRareData {
2728        self.rare_data.get_or_init(|| Box::new(FsNodeRareData::default()))
2729    }
2730
2731    /// Returns the set of watchers for this node.
2732    ///
2733    /// Only call this function if you require this node to actually store a list of watchers. If
2734    /// you just wish to notify any watchers that might exist, please use `notify` instead.
2735    pub fn ensure_watchers(&self) -> &inotify::InotifyWatchers {
2736        &self.ensure_rare_data().watchers
2737    }
2738
2739    /// Notify the watchers of the given event.
2740    pub fn notify(
2741        &self,
2742        event_mask: InotifyMask,
2743        cookie: u32,
2744        name: &FsStr,
2745        mode: FileMode,
2746        is_dead: bool,
2747    ) {
2748        if let Some(rare_data) = self.rare_data.get() {
2749            rare_data.watchers.notify(event_mask, cookie, name, mode, is_dead);
2750        }
2751    }
2752
2753    /// Calls through to the filesystem to enable fs-verity on this file.
2754    pub fn enable_fsverity<L>(
2755        &self,
2756        locked: &mut Locked<L>,
2757        current_task: &CurrentTask,
2758        descriptor: &fsverity_descriptor,
2759    ) -> Result<(), Errno>
2760    where
2761        L: LockEqualOrBefore<FileOpsCore>,
2762    {
2763        let locked = locked.cast_locked::<FileOpsCore>();
2764        self.ops().enable_fsverity(locked, self, current_task, descriptor)
2765    }
2766}
2767
2768impl std::fmt::Debug for FsNode {
2769    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2770        f.debug_struct("FsNode")
2771            .field("fs", &self.fs().name())
2772            .field("info", &*self.info())
2773            .field("ops_ty", &self.ops().type_name())
2774            .finish()
2775    }
2776}
2777
2778impl Releasable for FsNode {
2779    type Context<'a> = CurrentTaskAndLocked<'a>;
2780
2781    fn release<'a>(self, context: CurrentTaskAndLocked<'a>) {
2782        let (locked, current_task) = context;
2783        if let Some(fs) = self.fs.upgrade() {
2784            fs.remove_node(&self);
2785        }
2786        if let Err(err) = self.ops.forget(
2787            locked.cast_locked::<FileOpsCore>(),
2788            current_task,
2789            self.info.into_inner(),
2790        ) {
2791            log_error!("Error on FsNodeOps::forget: {err:?}");
2792        }
2793    }
2794}
2795
2796fn check_access(
2797    fs_node: &FsNode,
2798    current_task: &CurrentTask,
2799    permission_flags: security::PermissionFlags,
2800    node_uid: uid_t,
2801    node_gid: gid_t,
2802    mode: FileMode,
2803) -> Result<(), Errno> {
2804    // Determine which of the access bits apply to the `current_task`.
2805    let (fsuid, is_in_group) = {
2806        let current_creds = current_task.current_creds();
2807        (current_creds.fsuid, current_creds.is_in_group(node_gid))
2808    };
2809    let granted = if fsuid == node_uid {
2810        mode.user_access()
2811    } else if is_in_group {
2812        mode.group_access()
2813    } else {
2814        mode.other_access()
2815    };
2816
2817    let access = permission_flags.as_access();
2818    if granted.contains(access) {
2819        return Ok(());
2820    }
2821
2822    // Callers with CAP_DAC_READ_SEARCH override can read files & directories, and traverse
2823    // directories to which they lack permission.
2824    let mut requested = access & !granted;
2825
2826    // If this check was triggered by `access()`, or a variant, then check for a `dontaudit`
2827    // statement for the `audit_access` permission for this caller & file.
2828    let have_dont_audit = OnceBool::new();
2829    let has_capability = move |current_task, capability| {
2830        let dont_audit = have_dont_audit.get_or_init(|| {
2831            permission_flags.contains(PermissionFlags::ACCESS)
2832                && security::has_dontaudit_access(current_task, fs_node)
2833        });
2834        if dont_audit {
2835            security::is_task_capable_noaudit(current_task, capability)
2836        } else {
2837            security::check_task_capable(current_task, capability).is_ok()
2838        }
2839    };
2840
2841    // CAP_DAC_READ_SEARCH allows bypass of read checks, and directory traverse (eXecute) checks.
2842    let dac_read_search_access =
2843        if mode.is_dir() { Access::READ | Access::EXEC } else { Access::READ };
2844    if dac_read_search_access.intersects(requested)
2845        && has_capability(current_task, CAP_DAC_READ_SEARCH)
2846    {
2847        requested.remove(dac_read_search_access);
2848    }
2849    if requested.is_empty() {
2850        return Ok(());
2851    }
2852
2853    // CAP_DAC_OVERRIDE allows bypass of all checks (though see the comment for file-execute).
2854    let mut dac_override_access = Access::READ | Access::WRITE;
2855    dac_override_access |= if mode.is_dir() {
2856        Access::EXEC
2857    } else {
2858        // File execute access checks may not be bypassed unless at least one executable bit is set.
2859        (mode.user_access() | mode.group_access() | mode.other_access()) & Access::EXEC
2860    };
2861    if dac_override_access.intersects(requested) && has_capability(current_task, CAP_DAC_OVERRIDE) {
2862        requested.remove(dac_override_access);
2863    }
2864    if requested.is_empty() {
2865        return Ok(());
2866    }
2867
2868    return error!(EACCES);
2869}
2870
2871#[cfg(test)]
2872mod tests {
2873    use super::*;
2874    use crate::device::mem::mem_device_init;
2875    use crate::testing::*;
2876    use crate::vfs::buffers::VecOutputBuffer;
2877    use starnix_uapi::auth::Credentials;
2878    use starnix_uapi::file_mode::mode;
2879
2880    #[::fuchsia::test]
2881    async fn open_device_file() {
2882        spawn_kernel_and_run(async |locked, current_task| {
2883            mem_device_init(locked, &*current_task).expect("mem_device_init");
2884
2885            // Create a device file that points to the `zero` device (which is automatically
2886            // registered in the kernel).
2887            current_task
2888                .fs()
2889                .root()
2890                .create_node(
2891                    locked,
2892                    &current_task,
2893                    "zero".into(),
2894                    mode!(IFCHR, 0o666),
2895                    DeviceId::ZERO,
2896                )
2897                .expect("create_node");
2898
2899            const CONTENT_LEN: usize = 10;
2900            let mut buffer = VecOutputBuffer::new(CONTENT_LEN);
2901
2902            // Read from the zero device.
2903            let device_file = current_task
2904                .open_file(locked, "zero".into(), OpenFlags::RDONLY)
2905                .expect("open device file");
2906            device_file.read(locked, &current_task, &mut buffer).expect("read from zero");
2907
2908            // Assert the contents.
2909            assert_eq!(&[0; CONTENT_LEN], buffer.data());
2910        })
2911        .await;
2912    }
2913
2914    #[::fuchsia::test]
2915    async fn node_info_is_reflected_in_stat() {
2916        spawn_kernel_and_run(async |locked, current_task| {
2917            // Create a node.
2918            let node = &current_task
2919                .fs()
2920                .root()
2921                .create_node(locked, &current_task, "zero".into(), FileMode::IFCHR, DeviceId::ZERO)
2922                .expect("create_node")
2923                .entry
2924                .node;
2925            node.update_info(|info| {
2926                info.mode = FileMode::IFSOCK;
2927                info.size = 1;
2928                info.blocks = 2;
2929                info.blksize = 4;
2930                info.uid = 9;
2931                info.gid = 10;
2932                info.link_count = 11;
2933                info.time_status_change = UtcInstant::from_nanos(1);
2934                info.time_access = UtcInstant::from_nanos(2);
2935                info.time_modify = UtcInstant::from_nanos(3);
2936                info.rdev = DeviceId::new(13, 13);
2937            });
2938            let stat = node.stat(locked, &current_task).expect("stat");
2939
2940            assert_eq!(stat.st_mode, FileMode::IFSOCK.bits());
2941            assert_eq!(stat.st_size, 1);
2942            assert_eq!(stat.st_blksize, 4);
2943            assert_eq!(stat.st_blocks, 2);
2944            assert_eq!(stat.st_uid, 9);
2945            assert_eq!(stat.st_gid, 10);
2946            assert_eq!(stat.st_nlink, 11);
2947            assert_eq!(stat.st_ctime, 0);
2948            assert_eq!(stat.st_ctime_nsec, 1);
2949            assert_eq!(stat.st_atime, 0);
2950            assert_eq!(stat.st_atime_nsec, 2);
2951            assert_eq!(stat.st_mtime, 0);
2952            assert_eq!(stat.st_mtime_nsec, 3);
2953            assert_eq!(stat.st_rdev, DeviceId::new(13, 13).bits());
2954        })
2955        .await;
2956    }
2957
2958    #[::fuchsia::test]
2959    fn test_flock_operation() {
2960        assert!(FlockOperation::from_flags(0).is_err());
2961        assert!(FlockOperation::from_flags(u32::MAX).is_err());
2962
2963        let operation1 = FlockOperation::from_flags(LOCK_SH).expect("from_flags");
2964        assert!(!operation1.is_unlock());
2965        assert!(!operation1.is_lock_exclusive());
2966        assert!(operation1.is_blocking());
2967
2968        let operation2 = FlockOperation::from_flags(LOCK_EX | LOCK_NB).expect("from_flags");
2969        assert!(!operation2.is_unlock());
2970        assert!(operation2.is_lock_exclusive());
2971        assert!(!operation2.is_blocking());
2972
2973        let operation3 = FlockOperation::from_flags(LOCK_UN).expect("from_flags");
2974        assert!(operation3.is_unlock());
2975        assert!(!operation3.is_lock_exclusive());
2976        assert!(operation3.is_blocking());
2977    }
2978
    /// Exercises `FsNode::check_access()` for a caller with uid 1, gid 2 and supplementary
    /// groups 3 and 4, against a regular file whose owner, group and permission bits are varied
    /// per case.
    // NOTE(review): the expected results imply that these credentials carry no DAC-bypassing
    // capabilities; confirm against `Credentials::with_ids`.
    #[::fuchsia::test]
    async fn test_check_access() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mut creds = Credentials::with_ids(1, 2);
            creds.groups = vec![3, 4];
            current_task.set_creds(creds);

            // Create a node.
            let node = &current_task
                .fs()
                .root()
                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceId::NONE)
                .expect("create_node")
                .entry
                .node;
            // Helper: rewrite the node's owner, group and permission bits, then check `access`
            // for the current task against a detached mount.
            let check_access = |locked: &mut Locked<Unlocked>,
                                uid: uid_t,
                                gid: gid_t,
                                perm: u32,
                                access: Access| {
                node.update_info(|info| {
                    info.mode = mode!(IFREG, perm);
                    info.uid = uid;
                    info.gid = gid;
                });
                node.check_access(
                    locked,
                    &current_task,
                    &MountInfo::detached(),
                    access,
                    CheckAccessReason::InternalPermissionChecks,
                    security::Auditable::Location(std::panic::Location::caller()),
                )
            };

            // Node owned by uid 0 / gid 0: the caller is neither owner nor group member, so the
            // owner bits do not help.
            assert_eq!(check_access(locked, 0, 0, 0o700, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 0, 0, 0o700, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 0, 0, 0o700, Access::WRITE), error!(EACCES));

            // ... and neither do the group bits.
            assert_eq!(check_access(locked, 0, 0, 0o070, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 0, 0, 0o070, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 0, 0, 0o070, Access::WRITE), error!(EACCES));

            // ... but the "other" bits do apply.
            assert_eq!(check_access(locked, 0, 0, 0o007, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 0, 0, 0o007, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 0, 0, 0o007, Access::WRITE), Ok(()));

            // Node owned by the caller (uid 1): the owner bits apply.
            assert_eq!(check_access(locked, 1, 0, 0o700, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 1, 0, 0o700, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 1, 0, 0o700, Access::WRITE), Ok(()));

            // Owner execute only.
            assert_eq!(check_access(locked, 1, 0, 0o100, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 1, 0, 0o100, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 1, 0, 0o100, Access::WRITE), error!(EACCES));

            // Owner write only.
            assert_eq!(check_access(locked, 1, 0, 0o200, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 1, 0, 0o200, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 1, 0, 0o200, Access::WRITE), Ok(()));

            // Owner read only.
            assert_eq!(check_access(locked, 1, 0, 0o400, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 1, 0, 0o400, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 1, 0, 0o400, Access::WRITE), error!(EACCES));

            // Node in the caller's gid (2) but owned by uid 0: the owner bits do not apply.
            assert_eq!(check_access(locked, 0, 2, 0o700, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 0, 2, 0o700, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 0, 2, 0o700, Access::WRITE), error!(EACCES));

            // ... whereas the group bits do.
            assert_eq!(check_access(locked, 0, 2, 0o070, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 0, 2, 0o070, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 0, 2, 0o070, Access::WRITE), Ok(()));

            // Membership via a supplementary group (3) is honoured in the same way.
            assert_eq!(check_access(locked, 0, 3, 0o070, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 0, 3, 0o070, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 0, 3, 0o070, Access::WRITE), Ok(()));
        })
        .await;
    }
3056
3057    #[::fuchsia::test]
3058    async fn set_security_xattr_fails_without_security_module_or_root() {
3059        spawn_kernel_and_run(async |locked, current_task| {
3060            let mut creds = Credentials::with_ids(1, 2);
3061            creds.groups = vec![3, 4];
3062            current_task.set_creds(creds);
3063
3064            // Create a node.
3065            let node = &current_task
3066                .fs()
3067                .root()
3068                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceId::NONE)
3069                .expect("create_node")
3070                .entry
3071                .node;
3072
3073            // Give read-write-execute access.
3074            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3075
3076            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3077            // should fail.
3078            assert_eq!(
3079                node.set_xattr(
3080                    locked,
3081                    &current_task,
3082                    &MountInfo::detached(),
3083                    "security.name".into(),
3084                    "security_label".into(),
3085                    XattrOp::Create,
3086                ),
3087                error!(EPERM)
3088            );
3089        })
3090        .await;
3091    }
3092
3093    #[::fuchsia::test]
3094    async fn set_non_user_xattr_fails_without_security_module_or_root() {
3095        spawn_kernel_and_run(async |locked, current_task| {
3096            let mut creds = Credentials::with_ids(1, 2);
3097            creds.groups = vec![3, 4];
3098            current_task.set_creds(creds);
3099
3100            // Create a node.
3101            let node = &current_task
3102                .fs()
3103                .root()
3104                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceId::NONE)
3105                .expect("create_node")
3106                .entry
3107                .node;
3108
3109            // Give read-write-execute access.
3110            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3111
3112            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3113            // should fail.
3114            assert_eq!(
3115                node.set_xattr(
3116                    locked,
3117                    &current_task,
3118                    &MountInfo::detached(),
3119                    "trusted.name".into(),
3120                    "some data".into(),
3121                    XattrOp::Create,
3122                ),
3123                error!(EPERM)
3124            );
3125        })
3126        .await;
3127    }
3128
3129    #[::fuchsia::test]
3130    async fn get_security_xattr_succeeds_without_read_access() {
3131        spawn_kernel_and_run(async |locked, current_task| {
3132            let mut creds = Credentials::with_ids(1, 2);
3133            creds.groups = vec![3, 4];
3134            current_task.set_creds(creds);
3135
3136            // Create a node.
3137            let node = &current_task
3138                .fs()
3139                .root()
3140                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceId::NONE)
3141                .expect("create_node")
3142                .entry
3143                .node;
3144
3145            // Only give read access to the root and give root access to the current task.
3146            node.update_info(|info| info.mode = mode!(IFREG, 0o100));
3147            current_task.set_creds(Credentials::with_ids(0, 0));
3148
3149            // Setting the label should succeed even without write access to the file.
3150            assert_eq!(
3151                node.set_xattr(
3152                    locked,
3153                    &current_task,
3154                    &MountInfo::detached(),
3155                    "security.name".into(),
3156                    "security_label".into(),
3157                    XattrOp::Create,
3158                ),
3159                Ok(())
3160            );
3161
3162            // Remove root access from the current task.
3163            current_task.set_creds(Credentials::with_ids(1, 1));
3164
3165            // Getting the label should succeed even without read access to the file.
3166            assert_eq!(
3167                node.get_xattr(
3168                    locked,
3169                    &current_task,
3170                    &MountInfo::detached(),
3171                    "security.name".into(),
3172                    4096
3173                ),
3174                Ok(ValueOrSize::Value("security_label".into()))
3175            );
3176        })
3177        .await;
3178    }
3179}