starnix_core/vfs/
fs_node.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::device::DeviceMode;
6use crate::mm::PAGE_SIZE;
7use crate::security::{self, Auditable, PermissionFlags};
8use crate::signals::{SignalInfo, send_standard_signal};
9use crate::task::{CurrentTask, CurrentTaskAndLocked, WaitQueue, Waiter, register_delayed_release};
10use crate::time::utc;
11use crate::vfs::fsverity::FsVerityState;
12use crate::vfs::pipe::{Pipe, PipeHandle};
13use crate::vfs::rw_queue::{RwQueue, RwQueueReadGuard};
14use crate::vfs::socket::SocketHandle;
15use crate::vfs::{
16    DefaultDirEntryOps, DirEntryOps, FileObject, FileObjectState, FileOps, FileSystem,
17    FileSystemHandle, FileWriteGuardState, FsStr, FsString, MAX_LFS_FILESIZE, MountInfo,
18    NamespaceNode, OPathOps, RecordLockCommand, RecordLockOwner, RecordLocks, WeakFileHandle,
19    checked_add_offset_and_length, inotify,
20};
21use bitflags::bitflags;
22use fuchsia_runtime::UtcInstant;
23use linux_uapi::{XATTR_SECURITY_PREFIX, XATTR_SYSTEM_PREFIX, XATTR_TRUSTED_PREFIX};
24use once_cell::race::OnceBool;
25use starnix_crypt::EncryptionKeyId;
26use starnix_lifecycle::{ObjectReleaser, ReleaserAction};
27use starnix_logging::{log_error, track_stub};
28use starnix_sync::{
29    BeforeFsNodeAppend, FileOpsCore, FsNodeAppend, LockBefore, LockEqualOrBefore, Locked, Mutex,
30    RwLock, RwLockReadGuard, Unlocked,
31};
32use starnix_types::ownership::{Releasable, ReleaseGuard};
33use starnix_types::time::{NANOS_PER_SECOND, timespec_from_time};
34use starnix_uapi::as_any::AsAny;
35use starnix_uapi::auth::{
36    CAP_CHOWN, CAP_DAC_OVERRIDE, CAP_DAC_READ_SEARCH, CAP_FOWNER, CAP_FSETID, CAP_MKNOD,
37    CAP_SYS_ADMIN, CAP_SYS_RESOURCE, FsCred, UserAndOrGroupId,
38};
39use starnix_uapi::device_type::DeviceType;
40use starnix_uapi::errors::{EACCES, ENOTSUP, EPERM, Errno};
41use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
42use starnix_uapi::inotify_mask::InotifyMask;
43use starnix_uapi::mount_flags::MountFlags;
44use starnix_uapi::open_flags::OpenFlags;
45use starnix_uapi::resource_limits::Resource;
46use starnix_uapi::seal_flags::SealFlags;
47use starnix_uapi::signals::SIGXFSZ;
48use starnix_uapi::{
49    FALLOC_FL_COLLAPSE_RANGE, FALLOC_FL_INSERT_RANGE, FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE,
50    FALLOC_FL_UNSHARE_RANGE, FALLOC_FL_ZERO_RANGE, LOCK_EX, LOCK_NB, LOCK_SH, LOCK_UN,
51    STATX__RESERVED, STATX_ATIME, STATX_ATTR_VERITY, STATX_BASIC_STATS, STATX_BLOCKS, STATX_CTIME,
52    STATX_GID, STATX_INO, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_UID, XATTR_USER_PREFIX,
53    errno, error, fsverity_descriptor, gid_t, ino_t, statx, statx_timestamp, timespec, uapi, uid_t,
54};
55use std::sync::atomic::Ordering;
56use std::sync::{Arc, OnceLock, Weak};
57use syncio::zxio_node_attr_has_t;
58
/// Whether an [`FsNode`] may be linked into a directory.
///
/// The default is [`FsNodeLinkBehavior::Allowed`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum FsNodeLinkBehavior {
    /// The node may be linked into a directory.
    #[default]
    Allowed,
    /// The node may not be linked into a directory.
    Disallowed,
}
70
/// Evidence that the append lock of a node is held for reading.
///
/// Passed to `FsNodeOps::truncate` and `FsNodeOps::allocate`.
pub enum AppendLockGuard<'a> {
    /// A read guard that was freshly acquired on the node's append lock.
    Read(RwQueueReadGuard<'a, FsNodeAppend>),
    /// A borrow of a guard that an outer caller already holds.
    AlreadyLocked(&'a AppendLockGuard<'a>),
}
75
/// Abstracts over how the append lock of a node is obtained, so that the same code path can be
/// used whether or not the caller already holds the lock.
///
/// `L` is the lock level the caller is at when `lock` is invoked.
pub trait AppendLockStrategy<L> {
    /// Helper method for acquiring append lock in `truncate`/`allocate`. Acquires the lock when
    /// it's not already acquired.
    ///
    /// On success, returns the guard together with a `Locked<FileOpsCore>` token to use while the
    /// guard is alive.
    fn lock<'a>(
        &'a self,
        locked: &'a mut Locked<L>,
        current_task: &CurrentTask,
        node: &'a FsNode,
    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FileOpsCore>), Errno>;
}
85
/// [`AppendLockStrategy`] that really takes the node's append lock for reading, by calling
/// `FsNodeOps::append_lock_read`.
struct RealAppendLockStrategy {}
87
88impl AppendLockStrategy<BeforeFsNodeAppend> for RealAppendLockStrategy {
89    fn lock<'a>(
90        &'a self,
91        locked: &'a mut Locked<BeforeFsNodeAppend>,
92        current_task: &CurrentTask,
93        node: &'a FsNode,
94    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FileOpsCore>), Errno> {
95        let (guard, new_locked) = node.ops().append_lock_read(locked, node, current_task)?;
96        Ok((AppendLockGuard::Read(guard), new_locked.cast_locked()))
97    }
98}
99
/// [`AppendLockStrategy`] for callers that already hold the append lock: instead of locking
/// again, it hands out a borrow of the existing guard.
pub struct AlreadyLockedAppendLockStrategy<'a> {
    // Keep the reference to the guard, which will be returned in subsequent attempts to acquire this lock.
    guard: &'a AppendLockGuard<'a>,
}

impl<'a> AlreadyLockedAppendLockStrategy<'a> {
    /// Creates a strategy that reuses `guard` whenever the lock is requested.
    pub fn new(guard: &'a AppendLockGuard<'a>) -> Self {
        Self { guard }
    }
}
110
111impl AppendLockStrategy<FileOpsCore> for AlreadyLockedAppendLockStrategy<'_> {
112    fn lock<'a>(
113        &'a self,
114        locked: &'a mut Locked<FileOpsCore>,
115        _current_task: &CurrentTask,
116        _node: &'a FsNode,
117    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FileOpsCore>), Errno> {
118        Ok((AppendLockGuard::AlreadyLocked(self.guard), locked.cast_locked::<FileOpsCore>()))
119    }
120}
121
/// A node in a file system: its inode number, the file-system-specific operations, its
/// stat-like information and the locks that guard operations on it.
pub struct FsNode {
    /// The inode number for this FsNode.
    pub ino: ino_t,

    /// The FsNodeOps for this FsNode.
    ///
    /// The FsNodeOps are implemented by the individual file systems to provide
    /// specific behaviors for this FsNode.
    ops: Box<dyn FsNodeOps>,

    /// The FileSystem that owns this FsNode's tree.
    ///
    /// Held weakly, so this node does not keep the file system alive.
    fs: Weak<FileSystem>,

    /// A RwLock to synchronize append operations for this node.
    ///
    /// FileObjects writing with O_APPEND should grab a write() lock on this
    /// field to ensure they operate sequentially. FileObjects writing without
    /// O_APPEND should grab read() lock so that they can operate in parallel.
    pub append_lock: RwQueue<FsNodeAppend>,

    /// Mutable information about this node.
    ///
    /// This data is used to populate the uapi::stat structure.
    info: RwLock<FsNodeInfo>,

    /// Data associated with an FsNode that is rarely needed.
    ///
    /// Allocated lazily, the first time it is needed.
    rare_data: OnceLock<Box<FsNodeRareData>>,

    /// Tracks lock state for this file.
    pub write_guard_state: Mutex<FileWriteGuardState>,

    /// Cached FsVerity state associated with this node.
    pub fsverity: Mutex<FsVerityState>,

    /// The security state associated with this node. Must always be acquired last
    /// relative to other `FsNode` locks.
    pub security_state: security::FsNodeState,
}
160
/// State that only a minority of nodes ever need (FIFOs, bound sockets, file locks, inotify
/// watchers, ...). Stored behind `FsNode::rare_data`.
#[derive(Default)]
struct FsNodeRareData {
    /// The pipe located at this node, if any.
    ///
    /// Used if, and only if, the node has a mode of FileMode::IFIFO.
    fifo: OnceLock<PipeHandle>,

    /// The UNIX domain socket bound to this node, if any.
    bound_socket: OnceLock<SocketHandle>,

    /// Information about the locking information on this node.
    ///
    /// No other lock on this object may be taken while this lock is held.
    flock_info: Mutex<FlockInfo>,

    /// Records locks associated with this node.
    record_locks: RecordLocks,

    /// Whether this node can be linked into a directory.
    ///
    /// Only set for nodes created with `O_TMPFILE`.
    link_behavior: OnceLock<FsNodeLinkBehavior>,

    /// Inotify watchers on this node. See inotify(7).
    watchers: inotify::InotifyWatchers,
}
187
188impl FsNodeRareData {
189    fn ensure_fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
190        self.fifo.get_or_init(|| {
191            let mut default_pipe_capacity = (*PAGE_SIZE * 16) as usize;
192            if !security::is_task_capable_noaudit(current_task, CAP_SYS_RESOURCE) {
193                let kernel = current_task.kernel();
194                let max_size = kernel.system_limits.pipe_max_size.load(Ordering::Relaxed);
195                default_pipe_capacity = std::cmp::min(default_pipe_capacity, max_size);
196            }
197            Pipe::new(default_pipe_capacity)
198        })
199    }
200}
201
/// Marker type (it has no variants) that selects how an [`FsNode`] is released.
pub enum FsNodeReleaserAction {}
impl ReleaserAction<FsNode> for FsNodeReleaserAction {
    /// Hands the node to `register_delayed_release` rather than releasing it on the spot.
    fn release(fs_node: ReleaseGuard<FsNode>) {
        register_delayed_release(fs_node);
    }
}
/// An [`FsNode`] wrapped so that it is released through [`FsNodeReleaserAction`].
pub type FsNodeReleaser = ObjectReleaser<FsNode, FsNodeReleaserAction>;
/// Strong, shared handle to an [`FsNode`].
pub type FsNodeHandle = Arc<FsNodeReleaser>;
/// Weak counterpart of [`FsNodeHandle`].
pub type WeakFsNodeHandle = Weak<FsNodeReleaser>;
211
/// Mutable, stat-like metadata of an `FsNode` (mode, ownership, sizes and timestamps).
#[derive(Debug, Default, Clone, PartialEq)]
pub struct FsNodeInfo {
    /// File type and permission bits.
    pub mode: FileMode,
    /// Number of links to the node; `FsNodeInfo::new` starts directories at 2 and everything
    /// else at 1.
    pub link_count: usize,
    /// Owning user.
    pub uid: uid_t,
    /// Owning group.
    pub gid: gid_t,
    /// Device identifier of the node.
    // NOTE(review): presumably only meaningful for device nodes — confirm.
    pub rdev: DeviceType,
    /// Size of the node's content.
    pub size: usize,
    /// Block size; multiplied by `blocks` in `storage_size`.
    pub blksize: usize,
    /// Number of blocks of size `blksize` used by the node.
    pub blocks: usize,
    /// Time of the last status change.
    pub time_status_change: UtcInstant,
    /// Time of the last access.
    pub time_access: UtcInstant,
    /// Time of the last modification.
    pub time_modify: UtcInstant,
    // NOTE(review): presumably whether name lookups under this node are case-insensitive —
    // confirm against the users of this field.
    pub casefold: bool,
    // If this node is fscrypt encrypted, stores the id of the user wrapping key used to encrypt it.
    pub wrapping_key_id: Option<[u8; 16]>,
    // Used to indicate to filesystems that manage timestamps that an access has occurred and to
    // update the node's atime.
    // This only impacts accesses within Starnix. Most Fuchsia programs are not expected to maintain
    // access times. If the file handle is transferred out of Starnix, there may be inconsistencies.
    pub pending_time_access_update: bool,
}
234
235impl FsNodeInfo {
236    pub fn new(mode: FileMode, owner: FsCred) -> Self {
237        let now = utc::utc_now();
238        Self {
239            mode,
240            link_count: if mode.is_dir() { 2 } else { 1 },
241            uid: owner.uid,
242            gid: owner.gid,
243            blksize: DEFAULT_BYTES_PER_BLOCK,
244            time_status_change: now,
245            time_access: now,
246            time_modify: now,
247            ..Default::default()
248        }
249    }
250
251    pub fn storage_size(&self) -> usize {
252        self.blksize.saturating_mul(self.blocks)
253    }
254
255    pub fn chmod(&mut self, mode: FileMode) {
256        self.mode = (self.mode & !FileMode::PERMISSIONS) | (mode & FileMode::PERMISSIONS);
257    }
258
259    pub fn chown(&mut self, owner: Option<uid_t>, group: Option<gid_t>) {
260        if let Some(owner) = owner {
261            self.uid = owner;
262        }
263        if let Some(group) = group {
264            self.gid = group;
265        }
266        // Clear the setuid and setgid bits if the file is executable and a regular file.
267        if self.mode.is_reg() {
268            self.mode &= !FileMode::ISUID;
269            self.clear_sgid_bit();
270        }
271    }
272
273    fn clear_sgid_bit(&mut self) {
274        // If the group execute bit is not set, the setgid bit actually indicates mandatory
275        // locking and should not be cleared.
276        if self.mode.intersects(FileMode::IXGRP) {
277            self.mode &= !FileMode::ISGID;
278        }
279    }
280
281    fn clear_suid_and_sgid_bits(&mut self) {
282        self.mode &= !FileMode::ISUID;
283        self.clear_sgid_bit();
284    }
285
286    pub fn cred(&self) -> FsCred {
287        FsCred { uid: self.uid, gid: self.gid }
288    }
289
290    pub fn suid_and_sgid(
291        &self,
292        current_task: &CurrentTask,
293        fs_node: &FsNode,
294    ) -> Result<UserAndOrGroupId, Errno> {
295        let uid = self.mode.contains(FileMode::ISUID).then_some(self.uid);
296
297        // See <https://man7.org/linux/man-pages/man7/inode.7.html>:
298        //
299        //   For an executable file, the set-group-ID bit causes the
300        //   effective group ID of a process that executes the file to change
301        //   as described in execve(2).  For a file that does not have the
302        //   group execution bit (S_IXGRP) set, the set-group-ID bit indicates
303        //   mandatory file/record locking.
304        let gid = self.mode.contains(FileMode::ISGID | FileMode::IXGRP).then_some(self.gid);
305
306        let maybe_set_id = UserAndOrGroupId { uid, gid };
307        if maybe_set_id.is_some() {
308            // Check that uid and gid actually have execute access before
309            // returning them as the SUID or SGID.
310            check_access(
311                fs_node,
312                current_task,
313                security::PermissionFlags::EXEC,
314                self.uid,
315                self.gid,
316                self.mode,
317            )?;
318        }
319        Ok(maybe_set_id)
320    }
321}
322
/// State of the flock(2) advisory lock of a node. See `FileObject::flock`.
#[derive(Default)]
struct FlockInfo {
    /// Whether the node is currently locked. The meaning of the different values are:
    /// - `None`: The node is not locked.
    /// - `Some(false)`: The node is locked non-exclusively.
    /// - `Some(true)`: The node is locked exclusively.
    locked_exclusive: Option<bool>,
    /// The FileObjects that hold the lock.
    locking_handles: Vec<WeakFileHandle>,
    /// The queue used to notify tasks waiting on the lock.
    wait_queue: WaitQueue,
}
335
336impl FlockInfo {
337    /// Removes all file handle not holding `predicate` from the list of object holding the lock. If
338    /// this empties the list, unlocks the node and notifies all waiting processes.
339    pub fn retain<F>(&mut self, predicate: F)
340    where
341        F: Fn(&FileObject) -> bool,
342    {
343        if !self.locking_handles.is_empty() {
344            self.locking_handles
345                .retain(|w| if let Some(fh) = w.upgrade() { predicate(&fh) } else { false });
346            if self.locking_handles.is_empty() {
347                self.locked_exclusive = None;
348                self.wait_queue.notify_all();
349            }
350        }
351    }
352}
353
/// Default block size, in bytes, that `FsNodeInfo::new` assigns to `FsNodeInfo::blksize`.
// NOTE(review): the previous comment said "`st_blksize` is measured in units of 512 bytes";
// stat(2) describes `st_blocks`, not `st_blksize`, as counted in 512-byte units — confirm intent.
pub const DEFAULT_BYTES_PER_BLOCK: usize = 512;
356
/// A validated flock(2) operation; build it with `FlockOperation::from_flags`.
pub struct FlockOperation {
    /// The raw `LOCK_*` bits supplied by the caller.
    operation: u32,
}
360
361impl FlockOperation {
362    pub fn from_flags(operation: u32) -> Result<Self, Errno> {
363        if operation & !(LOCK_SH | LOCK_EX | LOCK_UN | LOCK_NB) != 0 {
364            return error!(EINVAL);
365        }
366        if [LOCK_SH, LOCK_EX, LOCK_UN].iter().filter(|&&o| operation & o == o).count() != 1 {
367            return error!(EINVAL);
368        }
369        Ok(Self { operation })
370    }
371
372    pub fn is_unlock(&self) -> bool {
373        self.operation & LOCK_UN > 0
374    }
375
376    pub fn is_lock_exclusive(&self) -> bool {
377        self.operation & LOCK_EX > 0
378    }
379
380    pub fn is_blocking(&self) -> bool {
381        self.operation & LOCK_NB == 0
382    }
383}
384
385impl FileObject {
386    /// Advisory locking.
387    ///
388    /// See flock(2).
389    pub fn flock(
390        &self,
391        locked: &mut Locked<Unlocked>,
392        current_task: &CurrentTask,
393        operation: FlockOperation,
394    ) -> Result<(), Errno> {
395        if self.flags().contains(OpenFlags::PATH) {
396            return error!(EBADF);
397        }
398        loop {
399            let mut flock_info = self.name.entry.node.ensure_rare_data().flock_info.lock();
400            if operation.is_unlock() {
401                flock_info.retain(|fh| !std::ptr::eq(fh, self));
402                return Ok(());
403            }
404            // Operation is a locking operation.
405            // 1. File is not locked
406            if flock_info.locked_exclusive.is_none() {
407                flock_info.locked_exclusive = Some(operation.is_lock_exclusive());
408                flock_info.locking_handles.push(self.weak_handle.clone());
409                return Ok(());
410            }
411
412            let file_lock_is_exclusive = flock_info.locked_exclusive == Some(true);
413            let fd_has_lock = flock_info
414                .locking_handles
415                .iter()
416                .find_map(|w| {
417                    w.upgrade().and_then(|fh| {
418                        if std::ptr::eq(&fh as &FileObject, self) { Some(()) } else { None }
419                    })
420                })
421                .is_some();
422
423            // 2. File is locked, but fd already have a lock
424            if fd_has_lock {
425                if operation.is_lock_exclusive() == file_lock_is_exclusive {
426                    // Correct lock is already held, return.
427                    return Ok(());
428                } else {
429                    // Incorrect lock is held. Release the lock and loop back to try to reacquire
430                    // it. flock doesn't guarantee atomic lock type switching.
431                    flock_info.retain(|fh| !std::ptr::eq(fh, self));
432                    continue;
433                }
434            }
435
436            // 3. File is locked, and fd doesn't have a lock.
437            if !file_lock_is_exclusive && !operation.is_lock_exclusive() {
438                // The lock is not exclusive, let's grab it.
439                flock_info.locking_handles.push(self.weak_handle.clone());
440                return Ok(());
441            }
442
443            // 4. The operation cannot be done at this time.
444            if !operation.is_blocking() {
445                return error!(EAGAIN);
446            }
447
448            // Register a waiter to be notified when the lock is released. Release the lock on
449            // FlockInfo, and wait.
450            let waiter = Waiter::new();
451            flock_info.wait_queue.wait_async(&waiter);
452            std::mem::drop(flock_info);
453            waiter.wait(locked, current_task)?;
454        }
455    }
456}
457
// The inner mod is required because bitflags cannot pass the attribute through to the single
// variant, and attributes cannot be applied to macro invocations.
mod inner_flags {
    // Part of the code for the AT_STATX_SYNC_AS_STAT case that's produced by the macro triggers the
    // lint, but as a whole, the produced code is still correct.
    #![allow(clippy::bad_bit_mask)] // TODO(b/303500202) Remove once addressed in bitflags.
    use super::{bitflags, uapi};

    // Flag bits modelled by `StatxFlags`; each constant mirrors the `uapi` constant of the same
    // name.
    bitflags! {
        #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub struct StatxFlags: u32 {
            const AT_SYMLINK_NOFOLLOW = uapi::AT_SYMLINK_NOFOLLOW;
            const AT_EMPTY_PATH = uapi::AT_EMPTY_PATH;
            const AT_NO_AUTOMOUNT = uapi::AT_NO_AUTOMOUNT;
            const AT_STATX_SYNC_AS_STAT = uapi::AT_STATX_SYNC_AS_STAT;
            const AT_STATX_FORCE_SYNC = uapi::AT_STATX_FORCE_SYNC;
            const AT_STATX_DONT_SYNC = uapi::AT_STATX_DONT_SYNC;
            // NOTE(review): unlike the `AT_*` entries this is a `STATX_ATTR_*` constant —
            // confirm that it is meant to be part of this flag set.
            const STATX_ATTR_VERITY = uapi::STATX_ATTR_VERITY;
        }
    }
}

// Re-exported so that users can name `StatxFlags` without going through `inner_flags`.
pub use inner_flags::StatxFlags;
481
/// The kind of child that an unlink operation is expected to remove.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum UnlinkKind {
    /// Unlink a directory.
    Directory,

    /// Unlink a non-directory.
    NonDirectory,
}
490
/// What a symlink points to, as returned by `FsNodeOps::readlink`.
pub enum SymlinkTarget {
    /// The target is given as a path.
    Path(FsString),
    /// The target is given directly as a node in the namespace.
    Node(NamespaceNode),
}
495
496#[derive(Clone, Copy, PartialEq, Eq)]
497pub enum XattrOp {
498    /// Set the value of the extended attribute regardless of whether it exists.
499    Set,
500    /// Create a new extended attribute. Fail if it already exists.
501    Create,
502    /// Replace the value of the extended attribute. Fail if it doesn't exist.
503    Replace,
504}
505
506impl XattrOp {
507    pub fn into_flags(self) -> u32 {
508        match self {
509            Self::Set => 0,
510            Self::Create => uapi::XATTR_CREATE,
511            Self::Replace => uapi::XATTR_REPLACE,
512        }
513    }
514}
515
/// Holds either a value or the size required to contain it.
#[derive(Clone, Debug, PartialEq)]
pub enum ValueOrSize<T> {
    /// The value itself.
    Value(T),
    /// The size needed to hold the value.
    Size(usize),
}

impl<T> ValueOrSize<T> {
    /// Transforms a contained value with `f`; a `Size` is passed through untouched.
    pub fn map<F, U>(self, f: F) -> ValueOrSize<U>
    where
        F: FnOnce(T) -> U,
    {
        match self {
            ValueOrSize::Value(value) => ValueOrSize::Value(f(value)),
            ValueOrSize::Size(size) => ValueOrSize::Size(size),
        }
    }

    /// Returns the contained value.
    ///
    /// # Panics
    ///
    /// Panics if `self` is a `Size`.
    #[cfg(test)]
    pub fn unwrap(self) -> T {
        let ValueOrSize::Value(value) = self else {
            panic!("Unwrap ValueOrSize that is a Size")
        };
        value
    }
}

impl<T> From<T> for ValueOrSize<T> {
    /// Wraps `t` as a `Value`.
    fn from(t: T) -> Self {
        ValueOrSize::Value(t)
    }
}
548
549#[derive(Copy, Clone, Eq, PartialEq, Debug)]
550pub enum FallocMode {
551    Allocate { keep_size: bool },
552    PunchHole,
553    Collapse,
554    Zero { keep_size: bool },
555    InsertRange,
556    UnshareRange,
557}
558
559impl FallocMode {
560    pub fn from_bits(mode: u32) -> Option<Self> {
561        // `fallocate()` allows only the following values for `mode`.
562        if mode == 0 {
563            Some(Self::Allocate { keep_size: false })
564        } else if mode == FALLOC_FL_KEEP_SIZE {
565            Some(Self::Allocate { keep_size: true })
566        } else if mode == FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE {
567            Some(Self::PunchHole)
568        } else if mode == FALLOC_FL_COLLAPSE_RANGE {
569            Some(Self::Collapse)
570        } else if mode == FALLOC_FL_ZERO_RANGE {
571            Some(Self::Zero { keep_size: false })
572        } else if mode == FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE {
573            Some(Self::Zero { keep_size: true })
574        } else if mode == FALLOC_FL_INSERT_RANGE {
575            Some(Self::InsertRange)
576        } else if mode == FALLOC_FL_UNSHARE_RANGE {
577            Some(Self::UnshareRange)
578        } else {
579            None
580        }
581    }
582}
583
/// Why an access check is being performed; handed to `FsNodeOps::check_access`.
// NOTE(review): the variant descriptions below are inferred from the variant names — confirm
// against the call sites.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum CheckAccessReason {
    /// Presumably an explicit access query such as access(2).
    Access,
    /// Presumably a change of working directory.
    Chdir,
    /// Presumably a change of root directory.
    Chroot,
    /// Presumably the execution of a file.
    Exec,
    /// A timestamp change. `now` presumably says whether the timestamps are being set to the
    /// current time.
    ChangeTimestamps { now: bool },
    /// Presumably any other permission check made internally.
    InternalPermissionChecks,
}
593
594pub trait FsNodeOps: Send + Sync + AsAny + 'static {
595    /// Delegate the access check to the node.
596    fn check_access(
597        &self,
598        _locked: &mut Locked<FileOpsCore>,
599        node: &FsNode,
600        current_task: &CurrentTask,
601        access: security::PermissionFlags,
602        info: &RwLock<FsNodeInfo>,
603        reason: CheckAccessReason,
604        audit_context: security::Auditable<'_>,
605    ) -> Result<(), Errno> {
606        node.default_check_access_impl(current_task, access, reason, info.read(), audit_context)
607    }
608
609    /// Build the [`DirEntryOps`] for a new [`DirEntry`] that will be associated
610    /// to this node.
611    fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
612        Box::new(DefaultDirEntryOps)
613    }
614
615    /// Build the `FileOps` for the file associated to this node.
616    ///
617    /// The returned FileOps will be used to create a FileObject, which might
618    /// be assigned an FdNumber.
619    fn create_file_ops(
620        &self,
621        locked: &mut Locked<FileOpsCore>,
622        node: &FsNode,
623        _current_task: &CurrentTask,
624        flags: OpenFlags,
625    ) -> Result<Box<dyn FileOps>, Errno>;
626
627    /// Find an existing child node and populate the child parameter. Return the node.
628    ///
629    /// The child parameter is an empty node. Operations other than initialize may panic before
630    /// initialize is called.
631    fn lookup(
632        &self,
633        _locked: &mut Locked<FileOpsCore>,
634        _node: &FsNode,
635        _current_task: &CurrentTask,
636        name: &FsStr,
637    ) -> Result<FsNodeHandle, Errno> {
638        // The default implementation here is suitable for filesystems that have permanent entries;
639        // entries that already exist will get found in the cache and shouldn't get this far.
640        error!(ENOENT, format!("looking for {name}"))
641    }
642
643    /// Create and return the given child node.
644    ///
645    /// The mode field of the FsNodeInfo indicates what kind of child to
646    /// create.
647    ///
648    /// This function is never called with FileMode::IFDIR. The mkdir function
649    /// is used to create directories instead.
650    fn mknod(
651        &self,
652        locked: &mut Locked<FileOpsCore>,
653        _node: &FsNode,
654        _current_task: &CurrentTask,
655        _name: &FsStr,
656        _mode: FileMode,
657        _dev: DeviceType,
658        _owner: FsCred,
659    ) -> Result<FsNodeHandle, Errno>;
660
661    /// Create and return the given child node as a subdirectory.
662    fn mkdir(
663        &self,
664        locked: &mut Locked<FileOpsCore>,
665        _node: &FsNode,
666        _current_task: &CurrentTask,
667        _name: &FsStr,
668        _mode: FileMode,
669        _owner: FsCred,
670    ) -> Result<FsNodeHandle, Errno>;
671
672    /// Creates a symlink with the given `target` path.
673    fn create_symlink(
674        &self,
675        locked: &mut Locked<FileOpsCore>,
676        _node: &FsNode,
677        _current_task: &CurrentTask,
678        _name: &FsStr,
679        _target: &FsStr,
680        _owner: FsCred,
681    ) -> Result<FsNodeHandle, Errno>;
682
683    /// Creates an anonymous file.
684    ///
685    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
686    ///
687    /// Used by O_TMPFILE.
688    fn create_tmpfile(
689        &self,
690        _node: &FsNode,
691        _current_task: &CurrentTask,
692        _mode: FileMode,
693        _owner: FsCred,
694    ) -> Result<FsNodeHandle, Errno> {
695        error!(EOPNOTSUPP)
696    }
697
698    /// Reads the symlink from this node.
699    fn readlink(
700        &self,
701        _locked: &mut Locked<FileOpsCore>,
702        _node: &FsNode,
703        _current_task: &CurrentTask,
704    ) -> Result<SymlinkTarget, Errno> {
705        error!(EINVAL)
706    }
707
708    /// Create a hard link with the given name to the given child.
709    fn link(
710        &self,
711        _locked: &mut Locked<FileOpsCore>,
712        _node: &FsNode,
713        _current_task: &CurrentTask,
714        _name: &FsStr,
715        _child: &FsNodeHandle,
716    ) -> Result<(), Errno> {
717        error!(EPERM)
718    }
719
720    /// Remove the child with the given name, if the child exists.
721    ///
722    /// The UnlinkKind parameter indicates whether the caller intends to unlink
723    /// a directory or a non-directory child.
724    fn unlink(
725        &self,
726        locked: &mut Locked<FileOpsCore>,
727        _node: &FsNode,
728        _current_task: &CurrentTask,
729        _name: &FsStr,
730        _child: &FsNodeHandle,
731    ) -> Result<(), Errno>;
732
733    /// Acquire the necessary append lock for the operations that depend on them.
734    /// Should be done before calling `allocate` or `truncate` to avoid lock ordering issues.
735    fn append_lock_read<'a>(
736        &'a self,
737        locked: &'a mut Locked<BeforeFsNodeAppend>,
738        node: &'a FsNode,
739        current_task: &CurrentTask,
740    ) -> Result<(RwQueueReadGuard<'a, FsNodeAppend>, &'a mut Locked<FsNodeAppend>), Errno> {
741        return node.append_lock.read_and(locked, current_task);
742    }
743
744    /// Change the length of the file.
745    fn truncate(
746        &self,
747        _locked: &mut Locked<FileOpsCore>,
748        _guard: &AppendLockGuard<'_>,
749        _node: &FsNode,
750        _current_task: &CurrentTask,
751        _length: u64,
752    ) -> Result<(), Errno> {
753        error!(EINVAL)
754    }
755
756    /// Manipulate allocated disk space for the file.
757    fn allocate(
758        &self,
759        _locked: &mut Locked<FileOpsCore>,
760        _guard: &AppendLockGuard<'_>,
761        _node: &FsNode,
762        _current_task: &CurrentTask,
763        _mode: FallocMode,
764        _offset: u64,
765        _length: u64,
766    ) -> Result<(), Errno> {
767        error!(EINVAL)
768    }
769
770    /// Update the supplied info with initial state (e.g. size) for the node.
771    ///
772    /// FsNode calls this method when created, to allow the FsNodeOps to
773    /// set appropriate initial values in the FsNodeInfo.
774    fn initial_info(&self, _info: &mut FsNodeInfo) {}
775
776    /// Update node.info as needed.
777    ///
778    /// FsNode calls this method before converting the FsNodeInfo struct into
779    /// the uapi::stat struct to give the file system a chance to update this data
780    /// before it is used by clients.
781    ///
782    /// File systems that keep the FsNodeInfo up-to-date do not need to
783    /// override this function.
784    ///
785    /// Return a read guard for the updated information.
786    fn fetch_and_refresh_info<'a>(
787        &self,
788        _locked: &mut Locked<FileOpsCore>,
789        _node: &FsNode,
790        _current_task: &CurrentTask,
791        info: &'a RwLock<FsNodeInfo>,
792    ) -> Result<RwLockReadGuard<'a, FsNodeInfo>, Errno> {
793        Ok(info.read())
794    }
795
796    /// Update node attributes persistently.
797    fn update_attributes(
798        &self,
799        _locked: &mut Locked<FileOpsCore>,
800        _current_task: &CurrentTask,
801        _info: &FsNodeInfo,
802        _has: zxio_node_attr_has_t,
803    ) -> Result<(), Errno> {
804        Ok(())
805    }
806
807    /// Get an extended attribute on the node.
808    ///
809    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
810    /// instead return the size of the attribute, and can return an ERANGE error if max_size is not
811    /// 0, and lesser than the required size.
812    fn get_xattr(
813        &self,
814        _locked: &mut Locked<FileOpsCore>,
815        _node: &FsNode,
816        _current_task: &CurrentTask,
817        _name: &FsStr,
818        _max_size: usize,
819    ) -> Result<ValueOrSize<FsString>, Errno> {
820        error!(ENOTSUP)
821    }
822
823    /// Set an extended attribute on the node.
824    fn set_xattr(
825        &self,
826        _locked: &mut Locked<FileOpsCore>,
827        _node: &FsNode,
828        _current_task: &CurrentTask,
829        _name: &FsStr,
830        _value: &FsStr,
831        _op: XattrOp,
832    ) -> Result<(), Errno> {
833        error!(ENOTSUP)
834    }
835
836    fn remove_xattr(
837        &self,
838        _locked: &mut Locked<FileOpsCore>,
839        _node: &FsNode,
840        _current_task: &CurrentTask,
841        _name: &FsStr,
842    ) -> Result<(), Errno> {
843        error!(ENOTSUP)
844    }
845
846    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
847    /// instead return the size of the 0 separated string needed to represent the value, and can
848    /// return an ERANGE error if max_size is not 0, and lesser than the required size.
849    fn list_xattrs(
850        &self,
851        _locked: &mut Locked<FileOpsCore>,
852        _node: &FsNode,
853        _current_task: &CurrentTask,
854        _max_size: usize,
855    ) -> Result<ValueOrSize<Vec<FsString>>, Errno> {
856        error!(ENOTSUP)
857    }
858
859    /// Called when the FsNode is freed by the Kernel.
860    fn forget(
861        self: Box<Self>,
862        _locked: &mut Locked<FileOpsCore>,
863        _current_task: &CurrentTask,
864        _info: FsNodeInfo,
865    ) -> Result<(), Errno> {
866        Ok(())
867    }
868
869    ////////////////////
870    // FS-Verity operations
871
872    /// Marks that FS-Verity is being built. Writes fsverity descriptor and merkle tree, the latter
873    /// computed by the filesystem.
874    /// This should ensure there are no writable file handles. Returns EEXIST if the file was
875    /// already fsverity-enabled. Returns EBUSY if this ioctl was already running on this file.
876    fn enable_fsverity(&self, _descriptor: &fsverity_descriptor) -> Result<(), Errno> {
877        error!(ENOTSUP)
878    }
879
    /// Reads the fsverity descriptor if the node is fsverity-enabled; otherwise implementations
    /// return ENODATA.
    ///
    /// The default implementation always fails with `ENOTSUP`.
    fn get_fsverity_descriptor(&self, _log_blocksize: u8) -> Result<fsverity_descriptor, Errno> {
        error!(ENOTSUP)
    }
884
    /// Returns a descriptive name for this node, suitable to report to userspace in situations
    /// where the node's path is unavailable (e.g. because it is anonymous, and has no path).
    /// If no name is returned then a default name of the form "<class>:[<node_id>]" will be used.
    ///
    /// The default implementation returns `None`.
    fn internal_name(&self, _node: &FsNode) -> Option<FsString> {
        None
    }
891
    /// The key used to identify this node in the file system's node cache.
    ///
    /// For many file systems, this will be the same as the inode number. However, some file
    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
    ///
    /// The default implementation returns the node's inode number (`node.ino`).
    fn node_key(&self, node: &FsNode) -> ino_t {
        node.ino
    }
899}
900
901impl<T> From<T> for Box<dyn FsNodeOps>
902where
903    T: FsNodeOps,
904{
905    fn from(ops: T) -> Box<dyn FsNodeOps> {
906        Box::new(ops)
907    }
908}
909
/// Implements [`FsNodeOps`] methods in a way that makes sense for symlinks.
/// You must implement [`FsNodeOps::readlink`].
///
/// Expands to everything [`fs_node_impl_not_dir`] provides, plus a `create_file_ops` that
/// asserts the node is a symlink and then panics: the generated `create_file_ops` is never
/// expected to be reached.
///
/// All types in the generated signatures are fully qualified so that the macro can be used
/// without importing anything at the call site.
#[macro_export]
macro_rules! fs_node_impl_symlink {
    () => {
        $crate::vfs::fs_node_impl_not_dir!();

        fn create_file_ops(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _flags: starnix_uapi::open_flags::OpenFlags,
        ) -> Result<Box<dyn $crate::vfs::FileOps>, starnix_uapi::errors::Errno> {
            assert!(node.is_lnk());
            unreachable!("Symlink nodes cannot be opened.");
        }
    };
}
929
/// Implements the mutating directory methods of [`FsNodeOps`] for a read-only directory.
///
/// The generated `check_access` rejects any request that includes write access with `EROFS`
/// and otherwise defers to `FsNode::default_check_access_impl`. The generated `mkdir`, `mknod`,
/// `create_symlink`, `link` and `unlink` always fail with `EROFS`.
///
/// All types in the generated signatures are fully qualified so that the macro can be used
/// without importing anything at the call site.
#[macro_export]
macro_rules! fs_node_impl_dir_readonly {
    () => {
        fn check_access(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            current_task: &$crate::task::CurrentTask,
            permission_flags: $crate::security::PermissionFlags,
            info: &starnix_sync::RwLock<$crate::vfs::FsNodeInfo>,
            reason: $crate::vfs::CheckAccessReason,
            audit_context: $crate::security::Auditable<'_>,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            // Writes are refused up front; everything else gets the default permission checks.
            let access = permission_flags.as_access();
            if access.contains(starnix_uapi::file_mode::Access::WRITE) {
                return starnix_uapi::error!(
                    EROFS,
                    format!("check_access failed: read-only directory")
                );
            }
            node.default_check_access_impl(
                current_task,
                permission_flags,
                reason,
                info.read(),
                audit_context,
            )
        }

        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mkdir failed: {:?}", name))
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_type::DeviceType,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mknod failed: {:?}", name))
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("symlink failed: {:?}", name))
        }

        fn link(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("link failed: {:?}", name))
        }

        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("unlink failed: {:?}", name))
        }
    };
}
1019
/// Trait that objects can implement if they need to handle extended attribute storage. Allows
/// delegating extended attribute operations in [`FsNodeOps`] to another object.
///
/// Compared to the corresponding [`FsNodeOps`] methods, these take neither the node, the current
/// task nor a size hint, and return plain values rather than `ValueOrSize`.
///
/// See [`fs_node_impl_xattr_delegate`] for usage details.
pub trait XattrStorage {
    /// Delegate for [`FsNodeOps::get_xattr`]. Returns the value stored under `name`.
    fn get_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<FsString, Errno>;

    /// Delegate for [`FsNodeOps::set_xattr`]. `op` is forwarded unchanged from the caller.
    fn set_xattr(
        &self,
        locked: &mut Locked<FileOpsCore>,
        name: &FsStr,
        value: &FsStr,
        op: XattrOp,
    ) -> Result<(), Errno>;

    /// Delegate for [`FsNodeOps::remove_xattr`].
    fn remove_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<(), Errno>;

    /// Delegate for [`FsNodeOps::list_xattrs`]. Returns the attribute names.
    fn list_xattrs(&self, locked: &mut Locked<FileOpsCore>) -> Result<Vec<FsString>, Errno>;
}
1043
/// Implements extended attribute ops for [`FsNodeOps`] by delegating to another object which
/// implements the [`XattrStorage`] trait or a similar interface. For example:
///
/// ```
/// struct Xattrs {}
///
/// impl XattrStorage for Xattrs {
///     // implement XattrStorage
/// }
///
/// struct Node {
///     xattrs: Xattrs
/// }
///
/// impl FsNodeOps for Node {
///     // Delegate extended attribute ops in FsNodeOps to self.xattrs
///     fs_node_impl_xattr_delegate!(self, self.xattrs);
///
///     // add other FsNodeOps impls here
/// }
/// ```
///
/// The node, the current task and the size hint received by the generated methods are not
/// forwarded to the delegate. The values returned by the delegate's `get_xattr` and
/// `list_xattrs` are converted into `ValueOrSize` with `.into()`.
///
/// All types in the generated signatures are fully qualified so that the macro can be used
/// without importing anything at the call site.
#[macro_export]
macro_rules! fs_node_impl_xattr_delegate {
    ($self:ident, $delegate:expr) => {
        fn get_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _size: usize,
        ) -> Result<$crate::vfs::ValueOrSize<$crate::vfs::FsString>, starnix_uapi::errors::Errno> {
            Ok($delegate.get_xattr(locked, name)?.into())
        }

        fn set_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            value: &$crate::vfs::FsStr,
            op: $crate::vfs::XattrOp,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.set_xattr(locked, name, value, op)
        }

        fn remove_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.remove_xattr(locked, name)
        }

        fn list_xattrs(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _size: usize,
        ) -> Result<
            $crate::vfs::ValueOrSize<Vec<$crate::vfs::FsString>>,
            starnix_uapi::errors::Errno,
        > {
            Ok($delegate.list_xattrs(locked)?.into())
        }
    };
}
1112
/// Provides `ENOTDIR` implementations for the [`FsNodeOps`] methods that are only meaningful on
/// directories: `lookup`, `mkdir`, `mknod`, `create_symlink` and `unlink`.
#[macro_export]
macro_rules! fs_node_impl_not_dir {
    () => {
        // Name resolution below a non-directory is impossible.
        fn lookup(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        // Child creation: a non-directory cannot hold entries of any kind.
        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_type::DeviceType,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }

        // Child removal: there are no entries to remove.
        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(ENOTDIR)
        }
    };
}
1176
/// Describes how a timestamp should be updated.
// NOTE(review): the variants look like they mirror the `UTIME_NOW` / `UTIME_OMIT` /
// explicit-time cases of utimensat(2) — confirm against the callers.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TimeUpdateType {
    /// Presumably: set the timestamp to the current time.
    Now,
    /// Presumably: leave the timestamp unchanged.
    Omit,
    /// Carries an explicit UTC instant, presumably the value to store.
    Time(UtcInstant),
}
1183
1184// Public re-export of macros allows them to be used like regular rust items.
1185pub use {
1186    fs_node_impl_dir_readonly, fs_node_impl_not_dir, fs_node_impl_symlink,
1187    fs_node_impl_xattr_delegate,
1188};
1189
1190pub struct SpecialNode;
1191
1192impl FsNodeOps for SpecialNode {
1193    fs_node_impl_not_dir!();
1194
1195    fn create_file_ops(
1196        &self,
1197        _locked: &mut Locked<FileOpsCore>,
1198        _node: &FsNode,
1199        _current_task: &CurrentTask,
1200        _flags: OpenFlags,
1201    ) -> Result<Box<dyn FileOps>, Errno> {
1202        unreachable!("Special nodes cannot be opened.");
1203    }
1204}
1205
1206impl FsNode {
1207    /// Create a node without inserting it into the FileSystem node cache.
1208    ///
1209    /// This is usually not what you want!
1210    /// Only use if you're also using get_or_create_node, like ext4.
1211    pub fn new_uncached(
1212        ino: ino_t,
1213        ops: impl Into<Box<dyn FsNodeOps>>,
1214        fs: &FileSystemHandle,
1215        info: FsNodeInfo,
1216    ) -> FsNodeHandle {
1217        let ops = ops.into();
1218        FsNodeHandle::new(Self::new_internal(ino, ops, Arc::downgrade(fs), info).into())
1219    }
1220
    /// Assembles an `FsNode` from its parts.
    ///
    /// `ops.initial_info` is given the chance to adjust `info` before it is stored. All other
    /// fields start out with their default value, except `fsverity` which starts as
    /// `FsVerityState::None`.
    ///
    /// In test and debug builds, every lock of the new node is additionally acquired once, in a
    /// fixed order, before the node is returned.
    fn new_internal(
        ino: ino_t,
        ops: Box<dyn FsNodeOps>,
        fs: Weak<FileSystem>,
        info: FsNodeInfo,
    ) -> Self {
        // Allow the FsNodeOps to populate initial info.
        let info = {
            let mut info = info;
            ops.initial_info(&mut info);
            info
        };

        // The linter will fail in non test mode as it will not see the lock check.
        #[allow(clippy::let_and_return)]
        {
            let result = Self {
                ino,
                ops,
                fs,
                info: RwLock::new(info),
                append_lock: Default::default(),
                rare_data: Default::default(),
                write_guard_state: Default::default(),
                fsverity: Mutex::new(FsVerityState::None),
                security_state: Default::default(),
            };
            // Lock check: take each of the node's locks once, in the order below. The guards
            // are only held until the end of this block.
            // NOTE(review): presumably this registers the expected acquisition order with a
            // lock-order checker — confirm.
            #[cfg(any(test, debug_assertions))]
            {
                #[allow(
                    clippy::undocumented_unsafe_blocks,
                    reason = "Force documented unsafe blocks in Starnix"
                )]
                let locked = unsafe { Unlocked::new() };
                let _l1 = result.append_lock.read_for_lock_ordering(locked);
                let _l2 = result.info.read();
                let _l3 = result.write_guard_state.lock();
                let _l4 = result.fsverity.lock();
                // TODO(https://fxbug.dev/367585803): Add lock levels to SELinux implementation.
                let _l5 = result.security_state.lock();
            }
            result
        }
    }
1265
1266    pub fn fs(&self) -> FileSystemHandle {
1267        self.fs.upgrade().expect("FileSystem did not live long enough")
1268    }
1269
1270    pub fn ops(&self) -> &dyn FsNodeOps {
1271        self.ops.as_ref()
1272    }
1273
1274    /// Returns an error if this node is encrypted and locked. Does not require
1275    /// fetch_and_refresh_info because FS_IOC_SET_ENCRYPTION_POLICY updates info and once a node is
1276    /// encrypted, it remains encrypted forever.
1277    pub fn fail_if_locked(&self, _current_task: &CurrentTask) -> Result<(), Errno> {
1278        let node_info = self.info();
1279        if let Some(wrapping_key_id) = node_info.wrapping_key_id {
1280            let crypt_service = self.fs().crypt_service().ok_or_else(|| errno!(ENOKEY))?;
1281            if !crypt_service.contains_key(EncryptionKeyId::from(wrapping_key_id)) {
1282                return error!(ENOKEY);
1283            }
1284        }
1285        Ok(())
1286    }
1287
1288    /// Returns the `FsNode`'s `FsNodeOps` as a `&T`, or `None` if the downcast fails.
1289    pub fn downcast_ops<T>(&self) -> Option<&T>
1290    where
1291        T: 'static,
1292    {
1293        self.ops().as_any().downcast_ref::<T>()
1294    }
1295
1296    pub fn on_file_closed(&self, file: &FileObjectState) {
1297        if let Some(rare_data) = self.rare_data.get() {
1298            let mut flock_info = rare_data.flock_info.lock();
1299            // This function will drop the flock from `file` because the `WeakFileHandle` for
1300            // `file` will no longer upgrade to an `FileHandle`.
1301            flock_info.retain(|_| true);
1302        }
1303        self.record_lock_release(RecordLockOwner::FileObject(file.id));
1304    }
1305
1306    pub fn record_lock(
1307        &self,
1308        locked: &mut Locked<Unlocked>,
1309        current_task: &CurrentTask,
1310        file: &FileObject,
1311        cmd: RecordLockCommand,
1312        flock: uapi::flock,
1313    ) -> Result<Option<uapi::flock>, Errno> {
1314        self.ensure_rare_data().record_locks.lock(locked, current_task, file, cmd, flock)
1315    }
1316
1317    /// Release all record locks acquired by the given owner.
1318    pub fn record_lock_release(&self, owner: RecordLockOwner) {
1319        if let Some(rare_data) = self.rare_data.get() {
1320            rare_data.record_locks.release_locks(owner);
1321        }
1322    }
1323
1324    pub fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
1325        self.ops().create_dir_entry_ops()
1326    }
1327
1328    pub fn create_file_ops<L>(
1329        &self,
1330        locked: &mut Locked<L>,
1331        current_task: &CurrentTask,
1332        flags: OpenFlags,
1333    ) -> Result<Box<dyn FileOps>, Errno>
1334    where
1335        L: LockEqualOrBefore<FileOpsCore>,
1336    {
1337        let locked = locked.cast_locked::<FileOpsCore>();
1338        self.ops().create_file_ops(locked, self, current_task, flags)
1339    }
1340
1341    pub fn open(
1342        &self,
1343        locked: &mut Locked<Unlocked>,
1344        current_task: &CurrentTask,
1345        namespace_node: &NamespaceNode,
1346        flags: OpenFlags,
1347        access_check: AccessCheck,
1348    ) -> Result<Box<dyn FileOps>, Errno> {
1349        // If O_PATH is set, there is no need to create a real FileOps because
1350        // most file operations are disabled.
1351        if flags.contains(OpenFlags::PATH) {
1352            return Ok(Box::new(OPathOps::new()));
1353        }
1354
1355        let access = access_check.resolve(flags);
1356        if access.is_nontrivial() {
1357            if flags.contains(OpenFlags::NOATIME) {
1358                self.check_o_noatime_allowed(current_task)?;
1359            }
1360
1361            // `flags` doesn't contain any information about the EXEC permission. Instead the syscalls
1362            // used to execute a file (`sys_execve` and `sys_execveat`) call `open()` with the EXEC
1363            // permission request in `access`.
1364            let mut permission_flags = PermissionFlags::from(access);
1365
1366            // The `APPEND` flag exists only in `flags`, to modify the behaviour of
1367            // `PermissionFlags::WRITE`
1368            if flags.contains(OpenFlags::APPEND) {
1369                permission_flags |= security::PermissionFlags::APPEND;
1370            }
1371
1372            // TODO: https://fxbug.dev/455782510 - Remove this once non-open() checks are fully
1373            // enforced.
1374            permission_flags |= security::PermissionFlags::FOR_OPEN;
1375
1376            self.check_access(
1377                locked,
1378                current_task,
1379                &namespace_node.mount,
1380                permission_flags,
1381                CheckAccessReason::InternalPermissionChecks,
1382                namespace_node,
1383            )?;
1384        }
1385
1386        let (mode, rdev) = {
1387            // Don't hold the info lock while calling into open_device or self.ops().
1388            // TODO: The mode and rdev are immutable and shouldn't require a lock to read.
1389            let info = self.info();
1390            (info.mode, info.rdev)
1391        };
1392
1393        match mode & FileMode::IFMT {
1394            FileMode::IFCHR => {
1395                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1396                    return error!(EACCES);
1397                }
1398                current_task.kernel().open_device(
1399                    locked,
1400                    current_task,
1401                    namespace_node,
1402                    flags,
1403                    rdev,
1404                    DeviceMode::Char,
1405                )
1406            }
1407            FileMode::IFBLK => {
1408                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1409                    return error!(EACCES);
1410                }
1411                current_task.kernel().open_device(
1412                    locked,
1413                    current_task,
1414                    namespace_node,
1415                    flags,
1416                    rdev,
1417                    DeviceMode::Block,
1418                )
1419            }
1420            FileMode::IFIFO => Pipe::open(locked, current_task, self.fifo(current_task), flags),
1421            // UNIX domain sockets can't be opened.
1422            FileMode::IFSOCK => error!(ENXIO),
1423            _ => self.create_file_ops(locked, current_task, flags),
1424        }
1425    }
1426
1427    pub fn lookup<L>(
1428        &self,
1429        locked: &mut Locked<L>,
1430        current_task: &CurrentTask,
1431        mount: &MountInfo,
1432        name: &FsStr,
1433    ) -> Result<FsNodeHandle, Errno>
1434    where
1435        L: LockEqualOrBefore<FileOpsCore>,
1436    {
1437        self.check_access(
1438            locked,
1439            current_task,
1440            mount,
1441            Access::EXEC,
1442            CheckAccessReason::InternalPermissionChecks,
1443            &[Auditable::Name(name), std::panic::Location::caller().into()],
1444        )?;
1445        let locked = locked.cast_locked::<FileOpsCore>();
1446        self.ops().lookup(locked, self, current_task, name)
1447    }
1448
1449    pub fn create_node<L>(
1450        &self,
1451        locked: &mut Locked<L>,
1452        current_task: &CurrentTask,
1453        mount: &MountInfo,
1454        name: &FsStr,
1455        mut mode: FileMode,
1456        dev: DeviceType,
1457        mut owner: FsCred,
1458    ) -> Result<FsNodeHandle, Errno>
1459    where
1460        L: LockEqualOrBefore<FileOpsCore>,
1461    {
1462        assert!(mode & FileMode::IFMT != FileMode::EMPTY, "mknod called without node type.");
1463        self.check_access(
1464            locked,
1465            current_task,
1466            mount,
1467            Access::WRITE,
1468            CheckAccessReason::InternalPermissionChecks,
1469            security::Auditable::Name(name),
1470        )?;
1471        if mode.is_reg() {
1472            security::check_fs_node_create_access(current_task, self, mode, name)?;
1473        } else if mode.is_dir() {
1474            // Even though the man page for mknod(2) says that mknod "cannot be used to create
1475            // directories" in starnix the mkdir syscall (`sys_mkdirat`) ends up calling
1476            //create_node.
1477            security::check_fs_node_mkdir_access(current_task, self, mode, name)?;
1478        } else if !matches!(
1479            mode.fmt(),
1480            FileMode::IFCHR | FileMode::IFBLK | FileMode::IFIFO | FileMode::IFSOCK
1481        ) {
1482            security::check_fs_node_mknod_access(current_task, self, mode, name, dev)?;
1483        }
1484
1485        self.update_metadata_for_child(current_task, &mut mode, &mut owner);
1486
1487        let new_node = if mode.is_dir() {
1488            let locked = locked.cast_locked::<FileOpsCore>();
1489            self.ops().mkdir(locked, self, current_task, name, mode, owner)?
1490        } else {
1491            // https://man7.org/linux/man-pages/man2/mknod.2.html says on error EPERM:
1492            //
1493            //   mode requested creation of something other than a regular
1494            //   file, FIFO (named pipe), or UNIX domain socket, and the
1495            //   caller is not privileged (Linux: does not have the
1496            //   CAP_MKNOD capability); also returned if the filesystem
1497            //   containing pathname does not support the type of node
1498            //   requested.
1499            if !matches!(mode.fmt(), FileMode::IFREG | FileMode::IFIFO | FileMode::IFSOCK) {
1500                security::check_task_capable(current_task, CAP_MKNOD)?;
1501            }
1502            let locked = locked.cast_locked::<FileOpsCore>();
1503            self.ops().mknod(locked, self, current_task, name, mode, dev, owner)?
1504        };
1505
1506        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;
1507
1508        Ok(new_node)
1509    }
1510
1511    pub fn create_symlink<L>(
1512        &self,
1513        locked: &mut Locked<L>,
1514        current_task: &CurrentTask,
1515        mount: &MountInfo,
1516        name: &FsStr,
1517        target: &FsStr,
1518        owner: FsCred,
1519    ) -> Result<FsNodeHandle, Errno>
1520    where
1521        L: LockEqualOrBefore<FileOpsCore>,
1522    {
1523        self.check_access(
1524            locked,
1525            current_task,
1526            mount,
1527            Access::WRITE,
1528            CheckAccessReason::InternalPermissionChecks,
1529            security::Auditable::Name(name),
1530        )?;
1531        security::check_fs_node_symlink_access(current_task, self, name, target)?;
1532
1533        let locked = locked.cast_locked::<FileOpsCore>();
1534        let new_node =
1535            self.ops().create_symlink(locked, self, current_task, name, target, owner)?;
1536
1537        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;
1538
1539        Ok(new_node)
1540    }
1541
1542    /// Requests that the LSM initialise a security label for the `new_node`, and optionally provide
1543    /// an extended attribute to write to the file to persist it.  If no LSM is enabled, no extended
1544    /// attribute returned, or if the filesystem does not support extended attributes, then the call
1545    /// returns success. All other failure modes return an `Errno` that should be early-returned.
1546    fn init_new_node_security_on_create<L>(
1547        &self,
1548        locked: &mut Locked<L>,
1549        current_task: &CurrentTask,
1550        new_node: &FsNode,
1551        name: &FsStr,
1552    ) -> Result<(), Errno>
1553    where
1554        L: LockEqualOrBefore<FileOpsCore>,
1555    {
1556        let locked = locked.cast_locked::<FileOpsCore>();
1557        security::fs_node_init_on_create(current_task, &new_node, self, name)?
1558            .map(|xattr| {
1559                match new_node.ops().set_xattr(
1560                    locked,
1561                    &new_node,
1562                    current_task,
1563                    xattr.name,
1564                    xattr.value.as_slice().into(),
1565                    XattrOp::Create,
1566                ) {
1567                    Err(e) => {
1568                        if e.code == ENOTSUP {
1569                            // This should only occur if a task has an "fscreate" context set, and
1570                            // creates a new file in a filesystem that does not support xattrs.
1571                            Ok(())
1572                        } else {
1573                            Err(e)
1574                        }
1575                    }
1576                    result => result,
1577                }
1578            })
1579            .unwrap_or_else(|| Ok(()))
1580    }
1581
1582    pub fn create_tmpfile<L>(
1583        &self,
1584        locked: &mut Locked<L>,
1585        current_task: &CurrentTask,
1586        mount: &MountInfo,
1587        mut mode: FileMode,
1588        mut owner: FsCred,
1589        link_behavior: FsNodeLinkBehavior,
1590    ) -> Result<FsNodeHandle, Errno>
1591    where
1592        L: LockEqualOrBefore<FileOpsCore>,
1593    {
1594        self.check_access(
1595            locked,
1596            current_task,
1597            mount,
1598            Access::WRITE,
1599            CheckAccessReason::InternalPermissionChecks,
1600            security::Auditable::Location(std::panic::Location::caller()),
1601        )?;
1602        self.update_metadata_for_child(current_task, &mut mode, &mut owner);
1603        let node = self.ops().create_tmpfile(self, current_task, mode, owner)?;
1604        self.init_new_node_security_on_create(locked, current_task, &node, "".into())?;
1605        if link_behavior == FsNodeLinkBehavior::Disallowed {
1606            node.ensure_rare_data().link_behavior.set(link_behavior).unwrap();
1607        }
1608        Ok(node)
1609    }
1610
1611    // This method does not attempt to update the atime of the node.
1612    // Use `NamespaceNode::readlink` which checks the mount flags and updates the atime accordingly.
1613    pub fn readlink<L>(
1614        &self,
1615        locked: &mut Locked<L>,
1616        current_task: &CurrentTask,
1617    ) -> Result<SymlinkTarget, Errno>
1618    where
1619        L: LockEqualOrBefore<FileOpsCore>,
1620    {
1621        // TODO: 378864856 - Is there a permission check here other than security checks?
1622        security::check_fs_node_read_link_access(current_task, self)?;
1623        self.ops().readlink(locked.cast_locked::<FileOpsCore>(), self, current_task)
1624    }
1625
1626    pub fn link<L>(
1627        &self,
1628        locked: &mut Locked<L>,
1629        current_task: &CurrentTask,
1630        mount: &MountInfo,
1631        name: &FsStr,
1632        child: &FsNodeHandle,
1633    ) -> Result<FsNodeHandle, Errno>
1634    where
1635        L: LockEqualOrBefore<FileOpsCore>,
1636    {
1637        self.check_access(
1638            locked,
1639            current_task,
1640            mount,
1641            Access::WRITE,
1642            CheckAccessReason::InternalPermissionChecks,
1643            security::Auditable::Location(std::panic::Location::caller()),
1644        )?;
1645
1646        if child.is_dir() {
1647            return error!(EPERM);
1648        }
1649
1650        if let Some(child_rare_data) = child.rare_data.get() {
1651            if matches!(child_rare_data.link_behavior.get(), Some(FsNodeLinkBehavior::Disallowed)) {
1652                return error!(ENOENT);
1653            }
1654        }
1655
1656        // Check that `current_task` has permission to create the hard link.
1657        //
1658        // See description of /proc/sys/fs/protected_hardlinks in
1659        // https://man7.org/linux/man-pages/man5/proc.5.html for details of the security
1660        // vulnerabilities.
1661        //
1662        let (child_uid, mode) = {
1663            let info = child.info();
1664            (info.uid, info.mode)
1665        };
1666        // Check that the the filesystem UID of the calling process (`current_task`) is the same as
1667        // the UID of the existing file. The check can be bypassed if the calling process has
1668        // `CAP_FOWNER` capability.
1669        if child_uid != current_task.current_creds().fsuid
1670            && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
1671        {
1672            // If current_task is not the user of the existing file, it needs to have read and write
1673            // access to the existing file.
1674            child
1675                .check_access(
1676                    locked,
1677                    current_task,
1678                    mount,
1679                    Access::READ | Access::WRITE,
1680                    CheckAccessReason::InternalPermissionChecks,
1681                    security::Auditable::Name(name),
1682                )
1683                .map_err(|e| {
                    // `check_access(..)` returns EACCES when the access rights don't match - change
                    // it to EPERM to match Linux standards.
1686                    if e == EACCES { errno!(EPERM) } else { e }
1687                })?;
1688            // There are also security issues that may arise when users link to setuid, setgid, or
1689            // special files.
1690            if mode.contains(FileMode::ISGID | FileMode::IXGRP) {
1691                return error!(EPERM);
1692            };
1693            if mode.contains(FileMode::ISUID) {
1694                return error!(EPERM);
1695            };
1696            if !mode.contains(FileMode::IFREG) {
1697                return error!(EPERM);
1698            };
1699        }
1700
1701        security::check_fs_node_link_access(current_task, self, child)?;
1702
1703        let locked = locked.cast_locked::<FileOpsCore>();
1704        self.ops().link(locked, self, current_task, name, child)?;
1705        Ok(child.clone())
1706    }
1707
1708    pub fn unlink<L>(
1709        &self,
1710        locked: &mut Locked<L>,
1711        current_task: &CurrentTask,
1712        mount: &MountInfo,
1713        name: &FsStr,
1714        child: &FsNodeHandle,
1715    ) -> Result<(), Errno>
1716    where
1717        L: LockEqualOrBefore<FileOpsCore>,
1718    {
1719        // The user must be able to search and write to the directory.
1720        self.check_access(
1721            locked,
1722            current_task,
1723            mount,
1724            Access::EXEC | Access::WRITE,
1725            CheckAccessReason::InternalPermissionChecks,
1726            security::Auditable::Name(name),
1727        )?;
1728        self.check_sticky_bit(current_task, child)?;
1729        if child.is_dir() {
1730            security::check_fs_node_rmdir_access(current_task, self, child, name)?;
1731        } else {
1732            security::check_fs_node_unlink_access(current_task, self, child, name)?;
1733        }
1734        let locked = locked.cast_locked::<FileOpsCore>();
1735        self.ops().unlink(locked, self, current_task, name, child)?;
1736        self.update_ctime_mtime();
1737        Ok(())
1738    }
1739
1740    pub fn truncate<L>(
1741        &self,
1742        locked: &mut Locked<L>,
1743        current_task: &CurrentTask,
1744        mount: &MountInfo,
1745        length: u64,
1746    ) -> Result<(), Errno>
1747    where
1748        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1749    {
1750        self.truncate_with_strategy(locked, RealAppendLockStrategy {}, current_task, mount, length)
1751    }
1752
1753    pub fn truncate_with_strategy<L, M>(
1754        &self,
1755        locked: &mut Locked<L>,
1756        strategy: impl AppendLockStrategy<M>,
1757        current_task: &CurrentTask,
1758        mount: &MountInfo,
1759        length: u64,
1760    ) -> Result<(), Errno>
1761    where
1762        M: LockEqualOrBefore<FileOpsCore>,
1763        L: LockEqualOrBefore<M>,
1764    {
1765        if self.is_dir() {
1766            return error!(EISDIR);
1767        }
1768
1769        {
1770            let locked = locked.cast_locked::<M>();
1771            self.check_access(
1772                locked,
1773                current_task,
1774                mount,
1775                Access::WRITE,
1776                CheckAccessReason::InternalPermissionChecks,
1777                security::Auditable::Location(std::panic::Location::caller()),
1778            )?;
1779        }
1780
1781        self.truncate_common(locked, strategy, current_task, length)
1782    }
1783
1784    /// Avoid calling this method directly. You probably want to call `FileObject::ftruncate()`
1785    /// which will also perform all file-descriptor based verifications.
1786    pub fn ftruncate<L>(
1787        &self,
1788        locked: &mut Locked<L>,
1789        current_task: &CurrentTask,
1790        length: u64,
1791    ) -> Result<(), Errno>
1792    where
1793        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1794    {
1795        if self.is_dir() {
1796            // When truncating a file descriptor, if the descriptor references a directory,
1797            // return EINVAL. This is different from the truncate() syscall which returns EISDIR.
1798            //
1799            // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#ERRORS
1800            return error!(EINVAL);
1801        }
1802
1803        // For ftruncate, we do not need to check that the file node is writable.
1804        //
1805        // The file object that calls this method must verify that the file was opened
1806        // with write permissions.
1807        //
1808        // This matters because a file could be opened with O_CREAT + O_RDWR + 0444 mode.
1809        // The file descriptor returned from such an operation can be truncated, even
1810        // though the file was created with a read-only mode.
1811        //
1812        // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#DESCRIPTION
1813        // which says:
1814        //
1815        // "With ftruncate(), the file must be open for writing; with truncate(),
1816        // the file must be writable."
1817
1818        self.truncate_common(locked, RealAppendLockStrategy {}, current_task, length)
1819    }
1820
1821    // Called by `truncate` and `ftruncate` above.
1822    fn truncate_common<L, M>(
1823        &self,
1824        locked: &mut Locked<L>,
1825        strategy: impl AppendLockStrategy<M>,
1826        current_task: &CurrentTask,
1827        length: u64,
1828    ) -> Result<(), Errno>
1829    where
1830        M: LockEqualOrBefore<FileOpsCore>,
1831        L: LockEqualOrBefore<M>,
1832    {
1833        if length > MAX_LFS_FILESIZE as u64 {
1834            return error!(EINVAL);
1835        }
1836        {
1837            let locked = locked.cast_locked::<M>().cast_locked::<FileOpsCore>();
1838            if length > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1839                send_standard_signal(locked, current_task, SignalInfo::default(SIGXFSZ));
1840                return error!(EFBIG);
1841            }
1842        }
1843        let locked = locked.cast_locked::<M>();
1844        self.clear_suid_and_sgid_bits(locked, current_task)?;
1845        // We have to take the append lock since otherwise it would be possible to truncate and for
1846        // an append to continue using the old size.
1847        let (guard, locked) = strategy.lock(locked, current_task, self)?;
1848        self.ops().truncate(locked, &guard, self, current_task, length)?;
1849        self.update_ctime_mtime();
1850        Ok(())
1851    }
1852
1853    /// Avoid calling this method directly. You probably want to call `FileObject::fallocate()`
1854    /// which will also perform additional verifications.
1855    pub fn fallocate<L>(
1856        &self,
1857        locked: &mut Locked<L>,
1858        current_task: &CurrentTask,
1859        mode: FallocMode,
1860        offset: u64,
1861        length: u64,
1862    ) -> Result<(), Errno>
1863    where
1864        L: LockBefore<BeforeFsNodeAppend>,
1865    {
1866        self.fallocate_with_strategy(
1867            locked,
1868            RealAppendLockStrategy {},
1869            current_task,
1870            mode,
1871            offset,
1872            length,
1873        )
1874    }
1875
1876    pub fn fallocate_with_strategy<L, M>(
1877        &self,
1878        locked: &mut Locked<L>,
1879        strategy: impl AppendLockStrategy<M>,
1880        current_task: &CurrentTask,
1881        mode: FallocMode,
1882        offset: u64,
1883        length: u64,
1884    ) -> Result<(), Errno>
1885    where
1886        M: LockEqualOrBefore<FileOpsCore>,
1887        L: LockEqualOrBefore<M>,
1888    {
1889        let allocate_size = checked_add_offset_and_length(offset as usize, length as usize)
1890            .map_err(|_| errno!(EFBIG))? as u64;
1891        {
1892            let locked = locked.cast_locked::<M>().cast_locked::<FileOpsCore>();
1893            if allocate_size > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1894                send_standard_signal(locked, current_task, SignalInfo::default(SIGXFSZ));
1895                return error!(EFBIG);
1896            }
1897        }
1898
1899        let locked = locked.cast_locked::<M>();
1900        self.clear_suid_and_sgid_bits(locked, current_task)?;
1901        let (guard, locked) = strategy.lock(locked, current_task, self)?;
1902        self.ops().allocate(locked, &guard, self, current_task, mode, offset, length)?;
1903        self.update_ctime_mtime();
1904        Ok(())
1905    }
1906
1907    fn update_metadata_for_child(
1908        &self,
1909        current_task: &CurrentTask,
1910        mode: &mut FileMode,
1911        owner: &mut FsCred,
1912    ) {
1913        // The setgid bit on a directory causes the gid to be inherited by new children and the
1914        // setgid bit to be inherited by new child directories. See SetgidDirTest in gvisor.
1915        {
1916            let self_info = self.info();
1917            if self_info.mode.contains(FileMode::ISGID) {
1918                owner.gid = self_info.gid;
1919                if mode.is_dir() {
1920                    *mode |= FileMode::ISGID;
1921                }
1922            }
1923        }
1924
1925        if !mode.is_dir() {
1926            // https://man7.org/linux/man-pages/man7/inode.7.html says:
1927            //
1928            //   For an executable file, the set-group-ID bit causes the
1929            //   effective group ID of a process that executes the file to change
1930            //   as described in execve(2).
1931            //
1932            // We need to check whether the current task has permission to create such a file.
1933            // See a similar check in `FsNode::chmod`.
1934            let current_creds = current_task.current_creds();
1935            if owner.gid != current_creds.fsgid
1936                && !current_creds.is_in_group(owner.gid)
1937                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
1938            {
1939                *mode &= !FileMode::ISGID;
1940            }
1941        }
1942    }
1943
1944    /// Checks if O_NOATIME is allowed,
1945    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
1946        // Per open(2),
1947        //
1948        //   O_NOATIME (since Linux 2.6.8)
1949        //      ...
1950        //
1951        //      This flag can be employed only if one of the following
1952        //      conditions is true:
1953        //
1954        //      *  The effective UID of the process matches the owner UID
1955        //         of the file.
1956        //
1957        //      *  The calling process has the CAP_FOWNER capability in
1958        //         its user namespace and the owner UID of the file has a
1959        //         mapping in the namespace.
1960        if current_task.current_creds().fsuid != self.info().uid {
1961            security::check_task_capable(current_task, CAP_FOWNER)?;
1962        }
1963        Ok(())
1964    }
1965
1966    pub fn default_check_access_impl(
1967        &self,
1968        current_task: &CurrentTask,
1969        permission_flags: security::PermissionFlags,
1970        reason: CheckAccessReason,
1971        info: RwLockReadGuard<'_, FsNodeInfo>,
1972        audit_context: Auditable<'_>,
1973    ) -> Result<(), Errno> {
1974        let (node_uid, node_gid, mode) = (info.uid, info.gid, info.mode);
1975        std::mem::drop(info);
1976        if let CheckAccessReason::ChangeTimestamps { now } = reason {
1977            // To set the timestamps to the current time the caller must either have write access to
1978            // the file, be the file owner, or hold the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
1979            // To set the timestamps to other values the caller must either be the file owner or hold
1980            // the CAP_FOWNER capability.
1981            if current_task.current_creds().fsuid == node_uid {
1982                return Ok(());
1983            }
1984            if now {
1985                if security::is_task_capable_noaudit(current_task, CAP_FOWNER) {
1986                    return Ok(());
1987                }
1988            } else {
1989                security::check_task_capable(current_task, CAP_FOWNER)?;
1990                return Ok(());
1991            }
1992        }
1993        check_access(self, current_task, permission_flags, node_uid, node_gid, mode)?;
1994        security::fs_node_permission(current_task, self, permission_flags, audit_context)
1995    }
1996
1997    /// Check whether the node can be accessed in the current context with the specified access
1998    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
1999    /// owner or is in the file's group.
2000    pub fn check_access<'a, L>(
2001        &self,
2002        locked: &mut Locked<L>,
2003        current_task: &CurrentTask,
2004        mount: &MountInfo,
2005        access: impl Into<security::PermissionFlags>,
2006        reason: CheckAccessReason,
2007        audit_context: impl Into<security::Auditable<'a>>,
2008    ) -> Result<(), Errno>
2009    where
2010        L: LockEqualOrBefore<FileOpsCore>,
2011    {
2012        let mut permission_flags = access.into();
2013        if permission_flags.contains(security::PermissionFlags::WRITE) {
2014            mount.check_readonly_filesystem()?;
2015        }
2016        if permission_flags.contains(security::PermissionFlags::EXEC) && !self.is_dir() {
2017            mount.check_noexec_filesystem()?;
2018        }
2019        if reason == CheckAccessReason::Access {
2020            permission_flags |= PermissionFlags::ACCESS;
2021        }
2022        self.ops().check_access(
2023            locked.cast_locked::<FileOpsCore>(),
2024            self,
2025            current_task,
2026            permission_flags,
2027            &self.info,
2028            reason,
2029            audit_context.into(),
2030        )
2031    }
2032
2033    /// Check whether the stick bit, `S_ISVTX`, forbids the `current_task` from removing the given
2034    /// `child`. If this node has `S_ISVTX`, then either the child must be owned by the `fsuid` of
2035    /// `current_task` or `current_task` must have `CAP_FOWNER`.
2036    pub fn check_sticky_bit(
2037        &self,
2038        current_task: &CurrentTask,
2039        child: &FsNodeHandle,
2040    ) -> Result<(), Errno> {
2041        if self.info().mode.contains(FileMode::ISVTX)
2042            && child.info().uid != current_task.current_creds().fsuid
2043        {
2044            security::check_task_capable(current_task, CAP_FOWNER)?;
2045        }
2046        Ok(())
2047    }
2048
2049    pub fn fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
2050        assert!(self.is_fifo());
2051        self.ensure_rare_data().ensure_fifo(current_task)
2052    }
2053
2054    /// Returns the UNIX domain socket bound to this node, if any.
2055    pub fn bound_socket(&self) -> Option<&SocketHandle> {
2056        if let Some(rare_data) = self.rare_data.get() { rare_data.bound_socket.get() } else { None }
2057    }
2058
2059    /// Register the provided socket as the UNIX domain socket bound to this node.
2060    ///
2061    /// It is a fatal error to call this method again if it has already been called on this node.
2062    pub fn set_bound_socket(&self, socket: SocketHandle) {
2063        assert!(self.ensure_rare_data().bound_socket.set(socket).is_ok());
2064    }
2065
2066    pub fn update_attributes<L, F>(
2067        &self,
2068        locked: &mut Locked<L>,
2069        current_task: &CurrentTask,
2070        mutator: F,
2071    ) -> Result<(), Errno>
2072    where
2073        L: LockEqualOrBefore<FileOpsCore>,
2074        F: FnOnce(&mut FsNodeInfo) -> Result<(), Errno>,
2075    {
2076        let mut info = self.info.write();
2077        let mut new_info = info.clone();
2078        mutator(&mut new_info)?;
2079
2080        let new_access = new_info.mode.user_access()
2081            | new_info.mode.group_access()
2082            | new_info.mode.other_access();
2083
2084        if new_access.intersects(Access::EXEC) {
2085            let write_guard_state = self.write_guard_state.lock();
2086            if let Ok(seals) = write_guard_state.get_seals() {
2087                if seals.contains(SealFlags::NO_EXEC) {
2088                    return error!(EPERM);
2089                }
2090            }
2091        }
2092
2093        // `mutator`s should not update the attribute change time, which is managed by this API.
2094        assert_eq!(info.time_status_change, new_info.time_status_change);
2095        if *info == new_info {
2096            return Ok(());
2097        }
2098        new_info.time_status_change = utc::utc_now();
2099
2100        let mut has = zxio_node_attr_has_t { ..Default::default() };
2101        has.modification_time = info.time_modify != new_info.time_modify;
2102        has.access_time = info.time_access != new_info.time_access;
2103        has.mode = info.mode != new_info.mode;
2104        has.uid = info.uid != new_info.uid;
2105        has.gid = info.gid != new_info.gid;
2106        has.rdev = info.rdev != new_info.rdev;
2107        has.casefold = info.casefold != new_info.casefold;
2108        has.wrapping_key_id = info.wrapping_key_id != new_info.wrapping_key_id;
2109
2110        security::check_fs_node_setattr_access(current_task, &self, &has)?;
2111
2112        // Call `update_attributes(..)` to persist the changes for the following fields.
2113        if has.modification_time
2114            || has.access_time
2115            || has.mode
2116            || has.uid
2117            || has.gid
2118            || has.rdev
2119            || has.casefold
2120            || has.wrapping_key_id
2121        {
2122            let locked = locked.cast_locked::<FileOpsCore>();
2123            self.ops().update_attributes(locked, current_task, &new_info, has)?;
2124        }
2125
2126        *info = new_info;
2127        Ok(())
2128    }
2129
2130    /// Set the permissions on this FsNode to the given values.
2131    ///
2132    /// Does not change the IFMT of the node.
2133    pub fn chmod<L>(
2134        &self,
2135        locked: &mut Locked<L>,
2136        current_task: &CurrentTask,
2137        mount: &MountInfo,
2138        mut mode: FileMode,
2139    ) -> Result<(), Errno>
2140    where
2141        L: LockEqualOrBefore<FileOpsCore>,
2142    {
2143        mount.check_readonly_filesystem()?;
2144        self.update_attributes(locked, current_task, |info| {
2145            let current_creds = current_task.current_creds();
2146            if info.uid != current_creds.euid {
2147                security::check_task_capable(current_task, CAP_FOWNER)?;
2148            } else if info.gid != current_creds.egid
2149                && !current_creds.is_in_group(info.gid)
2150                && mode.intersects(FileMode::ISGID)
2151                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
2152            {
2153                mode &= !FileMode::ISGID;
2154            }
2155            info.chmod(mode);
2156            Ok(())
2157        })
2158    }
2159
2160    /// Sets the owner and/or group on this FsNode.
2161    pub fn chown<L>(
2162        &self,
2163        locked: &mut Locked<L>,
2164        current_task: &CurrentTask,
2165        mount: &MountInfo,
2166        owner: Option<uid_t>,
2167        group: Option<gid_t>,
2168    ) -> Result<(), Errno>
2169    where
2170        L: LockEqualOrBefore<FileOpsCore>,
2171    {
2172        mount.check_readonly_filesystem()?;
2173        self.update_attributes(locked, current_task, |info| {
2174            if security::is_task_capable_noaudit(current_task, CAP_CHOWN) {
2175                info.chown(owner, group);
2176                return Ok(());
2177            }
2178
2179            // Nobody can change the owner.
2180            if let Some(uid) = owner {
2181                if info.uid != uid {
2182                    return error!(EPERM);
2183                }
2184            }
2185
2186            let (euid, is_in_group) = {
2187                let current_creds = current_task.current_creds();
2188                (current_creds.euid, group.map(|gid| current_creds.is_in_group(gid)))
2189            };
2190
2191            // The owner can change the group.
2192            if info.uid == euid {
2193                // To a group that it belongs.
2194                if let Some(is_in_group) = is_in_group {
2195                    if !is_in_group {
2196                        return error!(EPERM);
2197                    }
2198                }
2199                info.chown(None, group);
2200                return Ok(());
2201            }
2202
2203            // Any other user can call chown(file, -1, -1)
2204            if owner.is_some() || group.is_some() {
2205                return error!(EPERM);
2206            }
2207
2208            // But not on set-user-ID or set-group-ID files.
2209            // If we were to chown them, they would drop the set-ID bit.
2210            if info.mode.is_reg()
2211                && (info.mode.contains(FileMode::ISUID)
2212                    || info.mode.contains(FileMode::ISGID | FileMode::IXGRP))
2213            {
2214                return error!(EPERM);
2215            }
2216
2217            info.chown(None, None);
2218            Ok(())
2219        })
2220    }
2221
2222    /// Forcefully change the owner and group of this node.
2223    ///
2224    /// # Safety
2225    ///
2226    /// This function skips all the security checks and just updates the owner and group. Also, does
2227    /// not check if the filesystem is read-only and does not update the attribute change time.
2228    ///
2229    /// This function is used to set the owner and group of /proc/pid to the credentials of the
2230    /// current task. Please consider carefully whether you want to use this function for another
2231    /// purpose.
2232    pub unsafe fn force_chown(&self, creds: FsCred) {
2233        self.update_info(|info| {
2234            info.chown(Some(creds.uid), Some(creds.gid));
2235        });
2236    }
2237
2238    /// Whether this node is a regular file.
2239    pub fn is_reg(&self) -> bool {
2240        self.info().mode.is_reg()
2241    }
2242
2243    /// Whether this node is a directory.
2244    pub fn is_dir(&self) -> bool {
2245        self.info().mode.is_dir()
2246    }
2247
2248    /// Whether this node is a socket.
2249    pub fn is_sock(&self) -> bool {
2250        self.info().mode.is_sock()
2251    }
2252
2253    /// Whether this node is a FIFO.
2254    pub fn is_fifo(&self) -> bool {
2255        self.info().mode.is_fifo()
2256    }
2257
2258    /// Whether this node is a symbolic link.
2259    pub fn is_lnk(&self) -> bool {
2260        self.info().mode.is_lnk()
2261    }
2262
2263    pub fn dev(&self) -> DeviceType {
2264        self.fs().dev_id
2265    }
2266
2267    pub fn stat<L>(
2268        &self,
2269        locked: &mut Locked<L>,
2270        current_task: &CurrentTask,
2271    ) -> Result<uapi::stat, Errno>
2272    where
2273        L: LockEqualOrBefore<FileOpsCore>,
2274    {
2275        security::check_fs_node_getattr_access(current_task, self)?;
2276
2277        let info = self.fetch_and_refresh_info(locked, current_task)?;
2278
2279        let time_to_kernel_timespec_pair = |t| {
2280            let timespec { tv_sec, tv_nsec } = timespec_from_time(t);
2281            let time = tv_sec.try_into().map_err(|_| errno!(EINVAL))?;
2282            let time_nsec = tv_nsec.try_into().map_err(|_| errno!(EINVAL))?;
2283            Ok((time, time_nsec))
2284        };
2285
2286        let (st_atime, st_atime_nsec) = time_to_kernel_timespec_pair(info.time_access)?;
2287        let (st_mtime, st_mtime_nsec) = time_to_kernel_timespec_pair(info.time_modify)?;
2288        let (st_ctime, st_ctime_nsec) = time_to_kernel_timespec_pair(info.time_status_change)?;
2289
2290        Ok(uapi::stat {
2291            st_dev: self.dev().bits(),
2292            st_ino: self.ino,
2293            st_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2294            st_mode: info.mode.bits(),
2295            st_uid: info.uid,
2296            st_gid: info.gid,
2297            st_rdev: info.rdev.bits(),
2298            st_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2299            st_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2300            st_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2301            st_atime,
2302            st_atime_nsec,
2303            st_mtime,
2304            st_mtime_nsec,
2305            st_ctime,
2306            st_ctime_nsec,
2307            ..Default::default()
2308        })
2309    }
2310
2311    // TODO(https://fxbug.dev/454730248): This is probably the wrong way to implement O_APPEND.
2312    pub fn get_size<L>(
2313        &self,
2314        locked: &mut Locked<L>,
2315        current_task: &CurrentTask,
2316    ) -> Result<usize, Errno>
2317    where
2318        L: LockEqualOrBefore<FileOpsCore>,
2319    {
2320        let info = self.fetch_and_refresh_info(locked, current_task)?;
2321        Ok(info.size.try_into().map_err(|_| errno!(EINVAL))?)
2322    }
2323
2324    fn statx_timestamp_from_time(time: UtcInstant) -> statx_timestamp {
2325        let nanos = time.into_nanos();
2326        statx_timestamp {
2327            tv_sec: nanos / NANOS_PER_SECOND,
2328            tv_nsec: (nanos % NANOS_PER_SECOND) as u32,
2329            ..Default::default()
2330        }
2331    }
2332
2333    pub fn statx<L>(
2334        &self,
2335        locked: &mut Locked<L>,
2336        current_task: &CurrentTask,
2337        flags: StatxFlags,
2338        mask: u32,
2339    ) -> Result<statx, Errno>
2340    where
2341        L: LockEqualOrBefore<FileOpsCore>,
2342    {
2343        security::check_fs_node_getattr_access(current_task, self)?;
2344
2345        // Ignore mask for now and fill in all of the fields.
2346        let info = if flags.contains(StatxFlags::AT_STATX_DONT_SYNC) {
2347            self.info()
2348        } else {
2349            self.fetch_and_refresh_info(locked, current_task)?
2350        };
2351        if mask & STATX__RESERVED == STATX__RESERVED {
2352            return error!(EINVAL);
2353        }
2354
2355        track_stub!(TODO("https://fxbug.dev/302594110"), "statx attributes");
2356        let stx_mnt_id = 0;
2357        let mut stx_attributes = 0;
2358        let stx_attributes_mask = STATX_ATTR_VERITY as u64;
2359
2360        if matches!(*self.fsverity.lock(), FsVerityState::FsVerity) {
2361            stx_attributes |= STATX_ATTR_VERITY as u64;
2362        }
2363
2364        Ok(statx {
2365            stx_mask: STATX_NLINK
2366                | STATX_UID
2367                | STATX_GID
2368                | STATX_ATIME
2369                | STATX_MTIME
2370                | STATX_CTIME
2371                | STATX_INO
2372                | STATX_SIZE
2373                | STATX_BLOCKS
2374                | STATX_BASIC_STATS,
2375            stx_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2376            stx_attributes,
2377            stx_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2378            stx_uid: info.uid,
2379            stx_gid: info.gid,
2380            stx_mode: info.mode.bits().try_into().map_err(|_| errno!(EINVAL))?,
2381            stx_ino: self.ino,
2382            stx_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2383            stx_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2384            stx_attributes_mask,
2385            stx_ctime: Self::statx_timestamp_from_time(info.time_status_change),
2386            stx_mtime: Self::statx_timestamp_from_time(info.time_modify),
2387            stx_atime: Self::statx_timestamp_from_time(info.time_access),
2388
2389            stx_rdev_major: info.rdev.major(),
2390            stx_rdev_minor: info.rdev.minor(),
2391
2392            stx_dev_major: self.fs().dev_id.major(),
2393            stx_dev_minor: self.fs().dev_id.minor(),
2394            stx_mnt_id,
2395            ..Default::default()
2396        })
2397    }
2398
2399    /// Checks whether `current_task` has capabilities required for the specified `access` to the
2400    /// extended attribute `name`.
2401    fn check_xattr_access<L>(
2402        &self,
2403        locked: &mut Locked<L>,
2404        current_task: &CurrentTask,
2405        mount: &MountInfo,
2406        name: &FsStr,
2407        access: Access,
2408    ) -> Result<(), Errno>
2409    where
2410        L: LockEqualOrBefore<FileOpsCore>,
2411    {
2412        assert!(access == Access::READ || access == Access::WRITE);
2413
2414        let enodata_if_read =
2415            |e: Errno| if access == Access::READ && e.code == EPERM { errno!(ENODATA) } else { e };
2416
2417        // man xattr(7) describes the different access checks applied to each extended attribute
2418        // namespace.
2419        if name.starts_with(XATTR_USER_PREFIX.to_bytes()) {
2420            {
2421                let info = self.info();
2422                if !info.mode.is_reg() && !info.mode.is_dir() {
2423                    return Err(enodata_if_read(errno!(EPERM)));
2424                }
2425            }
2426
2427            // TODO: https://fxbug.dev/460734830 - Perform capability check(s) if file has sticky
2428            // bit set.
2429
2430            self.check_access(
2431                locked,
2432                current_task,
2433                mount,
2434                access,
2435                CheckAccessReason::InternalPermissionChecks,
2436                security::Auditable::Name(name),
2437            )?;
2438        } else if name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()) {
2439            // Trusted extended attributes require `CAP_SYS_ADMIN` to read or write.
2440            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2441        } else if name.starts_with(XATTR_SYSTEM_PREFIX.to_bytes()) {
2442            // System extended attributes have attribute-specific access policy.
2443            // TODO: https://fxbug.dev/460734830 -  Revise how system extended attributes are
2444            // access-controlled.
2445            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2446        } else if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2447            if access == Access::WRITE {
2448                // Writes require `CAP_SYS_ADMIN`, unless the LSM owning `name` specifies to skip.
2449                if !security::fs_node_xattr_skipcap(current_task, name) {
2450                    security::check_task_capable(current_task, CAP_SYS_ADMIN)
2451                        .map_err(enodata_if_read)?;
2452                }
2453            }
2454        } else {
2455            panic!("Unknown extended attribute prefix: {}", name);
2456        }
2457        Ok(())
2458    }
2459
2460    pub fn get_xattr<L>(
2461        &self,
2462        locked: &mut Locked<L>,
2463        current_task: &CurrentTask,
2464        mount: &MountInfo,
2465        name: &FsStr,
2466        max_size: usize,
2467    ) -> Result<ValueOrSize<FsString>, Errno>
2468    where
2469        L: LockEqualOrBefore<FileOpsCore>,
2470    {
2471        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2472        self.check_xattr_access(locked, current_task, mount, name, Access::READ)?;
2473
2474        // LSM access checks must be performed after discretionary checks.
2475        security::check_fs_node_getxattr_access(current_task, self, name)?;
2476
2477        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2478            // If the attribute is in the security.* domain then allow the LSM to handle the
2479            // request, or to delegate to `FsNodeOps::get_xattr()`.
2480            security::fs_node_getsecurity(locked, current_task, self, name, max_size)
2481        } else {
2482            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2483            self.ops().get_xattr(
2484                locked.cast_locked::<FileOpsCore>(),
2485                self,
2486                current_task,
2487                name,
2488                max_size,
2489            )
2490        }
2491    }
2492
2493    pub fn set_xattr<L>(
2494        &self,
2495        locked: &mut Locked<L>,
2496        current_task: &CurrentTask,
2497        mount: &MountInfo,
2498        name: &FsStr,
2499        value: &FsStr,
2500        op: XattrOp,
2501    ) -> Result<(), Errno>
2502    where
2503        L: LockEqualOrBefore<FileOpsCore>,
2504    {
2505        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2506        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2507
2508        // LSM access checks must be performed after discretionary checks.
2509        security::check_fs_node_setxattr_access(current_task, self, name, value, op)?;
2510
2511        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2512            // If the attribute is in the security.* domain then allow the LSM to handle the
2513            // request, or to delegate to `FsNodeOps::set_xattr()`.
2514            security::fs_node_setsecurity(locked, current_task, self, name, value, op)
2515        } else {
2516            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2517            self.ops().set_xattr(
2518                locked.cast_locked::<FileOpsCore>(),
2519                self,
2520                current_task,
2521                name,
2522                value,
2523                op,
2524            )
2525        }
2526    }
2527
2528    pub fn remove_xattr<L>(
2529        &self,
2530        locked: &mut Locked<L>,
2531        current_task: &CurrentTask,
2532        mount: &MountInfo,
2533        name: &FsStr,
2534    ) -> Result<(), Errno>
2535    where
2536        L: LockEqualOrBefore<FileOpsCore>,
2537    {
2538        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2539        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2540
2541        // LSM access checks must be performed after discretionary checks.
2542        security::check_fs_node_removexattr_access(current_task, self, name)?;
2543        self.ops().remove_xattr(locked.cast_locked::<FileOpsCore>(), self, current_task, name)
2544    }
2545
2546    pub fn list_xattrs<L>(
2547        &self,
2548        locked: &mut Locked<L>,
2549        current_task: &CurrentTask,
2550        max_size: usize,
2551    ) -> Result<ValueOrSize<Vec<FsString>>, Errno>
2552    where
2553        L: LockEqualOrBefore<FileOpsCore>,
2554    {
2555        security::check_fs_node_listxattr_access(current_task, self)?;
2556        Ok(self
2557            .ops()
2558            .list_xattrs(locked.cast_locked::<FileOpsCore>(), self, current_task, max_size)?
2559            .map(|mut v| {
2560                // Extended attributes may be listed even if the caller would not be able to read
2561                // (or modify) the attribute's value.
2562                // trusted.* attributes are only accessible with CAP_SYS_ADMIN and are omitted by
2563                // `listxattr()` unless the caller holds CAP_SYS_ADMIN.
2564                if !security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN) {
2565                    v.retain(|name| !name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()));
2566                }
2567                v
2568            }))
2569    }
2570
    /// Returns a read guard over the node's current `FsNodeInfo`.
    ///
    /// This only takes the read side of the `info` lock; nothing is refreshed from the node's
    /// ops. Use `fetch_and_refresh_info()` when the info may need refreshing first.
    pub fn info(&self) -> RwLockReadGuard<'_, FsNodeInfo> {
        self.info.read()
    }
2575
    /// Refreshes the `FsNodeInfo` if necessary and returns a read guard.
    ///
    /// The work is delegated entirely to the node's ops, which are handed the `info` lock
    /// itself; whether and how the cached info is refreshed is decided by that implementation.
    /// Any error reported by the ops is returned to the caller.
    pub fn fetch_and_refresh_info<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<RwLockReadGuard<'_, FsNodeInfo>, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.ops().fetch_and_refresh_info(
            locked.cast_locked::<FileOpsCore>(),
            self,
            current_task,
            &self.info,
        )
    }
2592
2593    pub fn update_info<F, T>(&self, mutator: F) -> T
2594    where
2595        F: FnOnce(&mut FsNodeInfo) -> T,
2596    {
2597        let mut info = self.info.write();
2598        mutator(&mut info)
2599    }
2600
2601    /// Clear the SUID and SGID bits unless the `current_task` has `CAP_FSETID`
2602    pub fn clear_suid_and_sgid_bits<L>(
2603        &self,
2604        locked: &mut Locked<L>,
2605        current_task: &CurrentTask,
2606    ) -> Result<(), Errno>
2607    where
2608        L: LockEqualOrBefore<FileOpsCore>,
2609    {
2610        if !security::is_task_capable_noaudit(current_task, CAP_FSETID) {
2611            self.update_attributes(locked, current_task, |info| {
2612                info.clear_suid_and_sgid_bits();
2613                Ok(())
2614            })?;
2615        }
2616        Ok(())
2617    }
2618
2619    /// Update the ctime and mtime of a file to now.
2620    pub fn update_ctime_mtime(&self) {
2621        if self.fs().manages_timestamps() {
2622            return;
2623        }
2624        self.update_info(|info| {
2625            let now = utc::utc_now();
2626            info.time_status_change = now;
2627            info.time_modify = now;
2628        });
2629    }
2630
2631    /// Update the ctime of a file to now.
2632    pub fn update_ctime(&self) {
2633        if self.fs().manages_timestamps() {
2634            return;
2635        }
2636        self.update_info(|info| {
2637            let now = utc::utc_now();
2638            info.time_status_change = now;
2639        });
2640    }
2641
2642    /// Update the atime and mtime if the `current_task` has write access, is the file owner, or
2643    /// holds either the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
2644    pub fn update_atime_mtime<L>(
2645        &self,
2646        locked: &mut Locked<L>,
2647        current_task: &CurrentTask,
2648        mount: &MountInfo,
2649        atime: TimeUpdateType,
2650        mtime: TimeUpdateType,
2651    ) -> Result<(), Errno>
2652    where
2653        L: LockEqualOrBefore<FileOpsCore>,
2654    {
2655        // If the filesystem is read-only, this always fail.
2656        mount.check_readonly_filesystem()?;
2657
2658        let now = matches!((atime, mtime), (TimeUpdateType::Now, TimeUpdateType::Now));
2659        self.check_access(
2660            locked,
2661            current_task,
2662            mount,
2663            Access::WRITE,
2664            CheckAccessReason::ChangeTimestamps { now },
2665            security::Auditable::Location(std::panic::Location::caller()),
2666        )?;
2667
2668        if !matches!((atime, mtime), (TimeUpdateType::Omit, TimeUpdateType::Omit)) {
2669            // This function is called by `utimes(..)` which will update the access and
2670            // modification time. We need to call `update_attributes()` to update the mtime of
2671            // filesystems that manages file timestamps.
2672            self.update_attributes(locked, current_task, |info| {
2673                let now = utc::utc_now();
2674                let get_time = |time: TimeUpdateType| match time {
2675                    TimeUpdateType::Now => Some(now),
2676                    TimeUpdateType::Time(t) => Some(t),
2677                    TimeUpdateType::Omit => None,
2678                };
2679                if let Some(time) = get_time(atime) {
2680                    info.time_access = time;
2681                }
2682                if let Some(time) = get_time(mtime) {
2683                    info.time_modify = time;
2684                }
2685                Ok(())
2686            })?;
2687        }
2688        Ok(())
2689    }
2690
2691    /// Returns a string describing this `FsNode` in the format used by "/proc/../fd" for anonymous
2692    /// file descriptors. By default this is in the form:
2693    ///   <class>:[<node_id>]
2694    /// though `FsNodeOps` may customize this as required.
2695    pub fn internal_name(&self) -> FsString {
2696        if let Some(name) = self.ops().internal_name(self) {
2697            return name;
2698        };
2699        let class = if self.is_sock() {
2700            "socket"
2701        } else if self.is_fifo() {
2702            "pipe"
2703        } else {
2704            "file"
2705        };
2706        format!("{}:[{}]", class, self.ino).into()
2707    }
2708
    /// The key used to identify this node in the file system's node cache.
    ///
    /// For many file systems, this will be the same as the inode number. However, some file
    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
    ///
    /// The value is not stored on the `FsNode`; it is obtained from the node's ops on each call.
    pub fn node_key(&self) -> ino_t {
        self.ops().node_key(self)
    }
2716
    /// Returns the node's `FsNodeRareData`, initializing it with a boxed default value if it
    /// has not been initialized yet.
    fn ensure_rare_data(&self) -> &FsNodeRareData {
        self.rare_data.get_or_init(|| Box::new(FsNodeRareData::default()))
    }
2720
    /// Returns the set of watchers for this node.
    ///
    /// Only call this function if you require this node to actually store a list of watchers. If
    /// you just wish to notify any watchers that might exist, please use `notify` instead.
    ///
    /// The watchers live in the node's rare data, which this call initializes if needed.
    pub fn ensure_watchers(&self) -> &inotify::InotifyWatchers {
        &self.ensure_rare_data().watchers
    }
2728
2729    /// Notify the watchers of the given event.
2730    pub fn notify(
2731        &self,
2732        event_mask: InotifyMask,
2733        cookie: u32,
2734        name: &FsStr,
2735        mode: FileMode,
2736        is_dead: bool,
2737    ) {
2738        if let Some(rare_data) = self.rare_data.get() {
2739            rare_data.watchers.notify(event_mask, cookie, name, mode, is_dead);
2740        }
2741    }
2742}
2743
2744impl std::fmt::Debug for FsNode {
2745    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2746        f.debug_struct("FsNode")
2747            .field("fs", &self.fs().name())
2748            .field("info", &*self.info())
2749            .field("ops_ty", &self.ops().type_name())
2750            .finish()
2751    }
2752}
2753
2754impl Releasable for FsNode {
2755    type Context<'a> = CurrentTaskAndLocked<'a>;
2756
2757    fn release<'a>(self, context: CurrentTaskAndLocked<'a>) {
2758        let (locked, current_task) = context;
2759        if let Some(fs) = self.fs.upgrade() {
2760            fs.remove_node(&self);
2761        }
2762        if let Err(err) = self.ops.forget(
2763            locked.cast_locked::<FileOpsCore>(),
2764            current_task,
2765            self.info.into_inner(),
2766        ) {
2767            log_error!("Error on FsNodeOps::forget: {err:?}");
2768        }
2769    }
2770}
2771
2772fn check_access(
2773    fs_node: &FsNode,
2774    current_task: &CurrentTask,
2775    permission_flags: security::PermissionFlags,
2776    node_uid: uid_t,
2777    node_gid: gid_t,
2778    mode: FileMode,
2779) -> Result<(), Errno> {
2780    // Determine which of the access bits apply to the `current_task`.
2781    let (fsuid, is_in_group) = {
2782        let current_creds = current_task.current_creds();
2783        (current_creds.fsuid, current_creds.is_in_group(node_gid))
2784    };
2785    let granted = if fsuid == node_uid {
2786        mode.user_access()
2787    } else if is_in_group {
2788        mode.group_access()
2789    } else {
2790        mode.other_access()
2791    };
2792
2793    let access = permission_flags.as_access();
2794    if granted.contains(access) {
2795        return Ok(());
2796    }
2797
2798    // Callers with CAP_DAC_READ_SEARCH override can read files & directories, and traverse
2799    // directories to which they lack permission.
2800    let mut requested = access & !granted;
2801
2802    // If this check was triggered by `access()`, or a variant, then check for a `dontaudit`
2803    // statement for the `audit_access` permission for this caller & file.
2804    let have_dont_audit = OnceBool::new();
2805    let has_capability = move |current_task, capability| {
2806        let dont_audit = have_dont_audit.get_or_init(|| {
2807            permission_flags.contains(PermissionFlags::ACCESS)
2808                && security::has_dontaudit_access(current_task, fs_node)
2809        });
2810        if dont_audit {
2811            security::is_task_capable_noaudit(current_task, capability)
2812        } else {
2813            security::check_task_capable(current_task, capability).is_ok()
2814        }
2815    };
2816
2817    // CAP_DAC_READ_SEARCH allows bypass of read checks, and directory traverse (eXecute) checks.
2818    let dac_read_search_access =
2819        if mode.is_dir() { Access::READ | Access::EXEC } else { Access::READ };
2820    if dac_read_search_access.intersects(requested)
2821        && has_capability(current_task, CAP_DAC_READ_SEARCH)
2822    {
2823        requested.remove(dac_read_search_access);
2824    }
2825    if requested.is_empty() {
2826        return Ok(());
2827    }
2828
2829    // CAP_DAC_OVERRIDE allows bypass of all checks (though see the comment for file-execute).
2830    let mut dac_override_access = Access::READ | Access::WRITE;
2831    dac_override_access |= if mode.is_dir() {
2832        Access::EXEC
2833    } else {
2834        // File execute access checks may not be bypassed unless at least one executable bit is set.
2835        (mode.user_access() | mode.group_access() | mode.other_access()) & Access::EXEC
2836    };
2837    if dac_override_access.intersects(requested) && has_capability(current_task, CAP_DAC_OVERRIDE) {
2838        requested.remove(dac_override_access);
2839    }
2840    if requested.is_empty() {
2841        return Ok(());
2842    }
2843
2844    return error!(EACCES);
2845}
2846
2847#[cfg(test)]
2848mod tests {
2849    use super::*;
2850    use crate::device::mem::mem_device_init;
2851    use crate::testing::*;
2852    use crate::vfs::buffers::VecOutputBuffer;
2853    use starnix_uapi::auth::Credentials;
2854    use starnix_uapi::file_mode::mode;
2855
2856    #[::fuchsia::test]
2857    async fn open_device_file() {
2858        spawn_kernel_and_run(async |locked, current_task| {
2859            mem_device_init(locked, &*current_task).expect("mem_device_init");
2860
2861            // Create a device file that points to the `zero` device (which is automatically
2862            // registered in the kernel).
2863            current_task
2864                .fs()
2865                .root()
2866                .create_node(
2867                    locked,
2868                    &current_task,
2869                    "zero".into(),
2870                    mode!(IFCHR, 0o666),
2871                    DeviceType::ZERO,
2872                )
2873                .expect("create_node");
2874
2875            const CONTENT_LEN: usize = 10;
2876            let mut buffer = VecOutputBuffer::new(CONTENT_LEN);
2877
2878            // Read from the zero device.
2879            let device_file = current_task
2880                .open_file(locked, "zero".into(), OpenFlags::RDONLY)
2881                .expect("open device file");
2882            device_file.read(locked, &current_task, &mut buffer).expect("read from zero");
2883
2884            // Assert the contents.
2885            assert_eq!(&[0; CONTENT_LEN], buffer.data());
2886        })
2887        .await;
2888    }
2889
2890    #[::fuchsia::test]
2891    async fn node_info_is_reflected_in_stat() {
2892        spawn_kernel_and_run(async |locked, current_task| {
2893            // Create a node.
2894            let node = &current_task
2895                .fs()
2896                .root()
2897                .create_node(
2898                    locked,
2899                    &current_task,
2900                    "zero".into(),
2901                    FileMode::IFCHR,
2902                    DeviceType::ZERO,
2903                )
2904                .expect("create_node")
2905                .entry
2906                .node;
2907            node.update_info(|info| {
2908                info.mode = FileMode::IFSOCK;
2909                info.size = 1;
2910                info.blocks = 2;
2911                info.blksize = 4;
2912                info.uid = 9;
2913                info.gid = 10;
2914                info.link_count = 11;
2915                info.time_status_change = UtcInstant::from_nanos(1);
2916                info.time_access = UtcInstant::from_nanos(2);
2917                info.time_modify = UtcInstant::from_nanos(3);
2918                info.rdev = DeviceType::new(13, 13);
2919            });
2920            let stat = node.stat(locked, &current_task).expect("stat");
2921
2922            assert_eq!(stat.st_mode, FileMode::IFSOCK.bits());
2923            assert_eq!(stat.st_size, 1);
2924            assert_eq!(stat.st_blksize, 4);
2925            assert_eq!(stat.st_blocks, 2);
2926            assert_eq!(stat.st_uid, 9);
2927            assert_eq!(stat.st_gid, 10);
2928            assert_eq!(stat.st_nlink, 11);
2929            assert_eq!(stat.st_ctime, 0);
2930            assert_eq!(stat.st_ctime_nsec, 1);
2931            assert_eq!(stat.st_atime, 0);
2932            assert_eq!(stat.st_atime_nsec, 2);
2933            assert_eq!(stat.st_mtime, 0);
2934            assert_eq!(stat.st_mtime_nsec, 3);
2935            assert_eq!(stat.st_rdev, DeviceType::new(13, 13).bits());
2936        })
2937        .await;
2938    }
2939
2940    #[::fuchsia::test]
2941    fn test_flock_operation() {
2942        assert!(FlockOperation::from_flags(0).is_err());
2943        assert!(FlockOperation::from_flags(u32::MAX).is_err());
2944
2945        let operation1 = FlockOperation::from_flags(LOCK_SH).expect("from_flags");
2946        assert!(!operation1.is_unlock());
2947        assert!(!operation1.is_lock_exclusive());
2948        assert!(operation1.is_blocking());
2949
2950        let operation2 = FlockOperation::from_flags(LOCK_EX | LOCK_NB).expect("from_flags");
2951        assert!(!operation2.is_unlock());
2952        assert!(operation2.is_lock_exclusive());
2953        assert!(!operation2.is_blocking());
2954
2955        let operation3 = FlockOperation::from_flags(LOCK_UN).expect("from_flags");
2956        assert!(operation3.is_unlock());
2957        assert!(!operation3.is_lock_exclusive());
2958        assert!(operation3.is_blocking());
2959    }
2960
2961    #[::fuchsia::test]
2962    async fn test_check_access() {
2963        spawn_kernel_and_run(async |locked, current_task| {
2964            let mut creds = Credentials::with_ids(1, 2);
2965            creds.groups = vec![3, 4];
2966            current_task.set_creds(creds);
2967
2968            // Create a node.
2969            let node = &current_task
2970                .fs()
2971                .root()
2972                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
2973                .expect("create_node")
2974                .entry
2975                .node;
2976            let check_access = |locked: &mut Locked<Unlocked>,
2977                                uid: uid_t,
2978                                gid: gid_t,
2979                                perm: u32,
2980                                access: Access| {
2981                node.update_info(|info| {
2982                    info.mode = mode!(IFREG, perm);
2983                    info.uid = uid;
2984                    info.gid = gid;
2985                });
2986                node.check_access(
2987                    locked,
2988                    &current_task,
2989                    &MountInfo::detached(),
2990                    access,
2991                    CheckAccessReason::InternalPermissionChecks,
2992                    security::Auditable::Location(std::panic::Location::caller()),
2993                )
2994            };
2995
2996            assert_eq!(check_access(locked, 0, 0, 0o700, Access::EXEC), error!(EACCES));
2997            assert_eq!(check_access(locked, 0, 0, 0o700, Access::READ), error!(EACCES));
2998            assert_eq!(check_access(locked, 0, 0, 0o700, Access::WRITE), error!(EACCES));
2999
3000            assert_eq!(check_access(locked, 0, 0, 0o070, Access::EXEC), error!(EACCES));
3001            assert_eq!(check_access(locked, 0, 0, 0o070, Access::READ), error!(EACCES));
3002            assert_eq!(check_access(locked, 0, 0, 0o070, Access::WRITE), error!(EACCES));
3003
3004            assert_eq!(check_access(locked, 0, 0, 0o007, Access::EXEC), Ok(()));
3005            assert_eq!(check_access(locked, 0, 0, 0o007, Access::READ), Ok(()));
3006            assert_eq!(check_access(locked, 0, 0, 0o007, Access::WRITE), Ok(()));
3007
3008            assert_eq!(check_access(locked, 1, 0, 0o700, Access::EXEC), Ok(()));
3009            assert_eq!(check_access(locked, 1, 0, 0o700, Access::READ), Ok(()));
3010            assert_eq!(check_access(locked, 1, 0, 0o700, Access::WRITE), Ok(()));
3011
3012            assert_eq!(check_access(locked, 1, 0, 0o100, Access::EXEC), Ok(()));
3013            assert_eq!(check_access(locked, 1, 0, 0o100, Access::READ), error!(EACCES));
3014            assert_eq!(check_access(locked, 1, 0, 0o100, Access::WRITE), error!(EACCES));
3015
3016            assert_eq!(check_access(locked, 1, 0, 0o200, Access::EXEC), error!(EACCES));
3017            assert_eq!(check_access(locked, 1, 0, 0o200, Access::READ), error!(EACCES));
3018            assert_eq!(check_access(locked, 1, 0, 0o200, Access::WRITE), Ok(()));
3019
3020            assert_eq!(check_access(locked, 1, 0, 0o400, Access::EXEC), error!(EACCES));
3021            assert_eq!(check_access(locked, 1, 0, 0o400, Access::READ), Ok(()));
3022            assert_eq!(check_access(locked, 1, 0, 0o400, Access::WRITE), error!(EACCES));
3023
3024            assert_eq!(check_access(locked, 0, 2, 0o700, Access::EXEC), error!(EACCES));
3025            assert_eq!(check_access(locked, 0, 2, 0o700, Access::READ), error!(EACCES));
3026            assert_eq!(check_access(locked, 0, 2, 0o700, Access::WRITE), error!(EACCES));
3027
3028            assert_eq!(check_access(locked, 0, 2, 0o070, Access::EXEC), Ok(()));
3029            assert_eq!(check_access(locked, 0, 2, 0o070, Access::READ), Ok(()));
3030            assert_eq!(check_access(locked, 0, 2, 0o070, Access::WRITE), Ok(()));
3031
3032            assert_eq!(check_access(locked, 0, 3, 0o070, Access::EXEC), Ok(()));
3033            assert_eq!(check_access(locked, 0, 3, 0o070, Access::READ), Ok(()));
3034            assert_eq!(check_access(locked, 0, 3, 0o070, Access::WRITE), Ok(()));
3035        })
3036        .await;
3037    }
3038
3039    #[::fuchsia::test]
3040    async fn set_security_xattr_fails_without_security_module_or_root() {
3041        spawn_kernel_and_run(async |locked, current_task| {
3042            let mut creds = Credentials::with_ids(1, 2);
3043            creds.groups = vec![3, 4];
3044            current_task.set_creds(creds);
3045
3046            // Create a node.
3047            let node = &current_task
3048                .fs()
3049                .root()
3050                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
3051                .expect("create_node")
3052                .entry
3053                .node;
3054
3055            // Give read-write-execute access.
3056            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3057
3058            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3059            // should fail.
3060            assert_eq!(
3061                node.set_xattr(
3062                    locked,
3063                    &current_task,
3064                    &MountInfo::detached(),
3065                    "security.name".into(),
3066                    "security_label".into(),
3067                    XattrOp::Create,
3068                ),
3069                error!(EPERM)
3070            );
3071        })
3072        .await;
3073    }
3074
3075    #[::fuchsia::test]
3076    async fn set_non_user_xattr_fails_without_security_module_or_root() {
3077        spawn_kernel_and_run(async |locked, current_task| {
3078            let mut creds = Credentials::with_ids(1, 2);
3079            creds.groups = vec![3, 4];
3080            current_task.set_creds(creds);
3081
3082            // Create a node.
3083            let node = &current_task
3084                .fs()
3085                .root()
3086                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
3087                .expect("create_node")
3088                .entry
3089                .node;
3090
3091            // Give read-write-execute access.
3092            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3093
3094            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3095            // should fail.
3096            assert_eq!(
3097                node.set_xattr(
3098                    locked,
3099                    &current_task,
3100                    &MountInfo::detached(),
3101                    "trusted.name".into(),
3102                    "some data".into(),
3103                    XattrOp::Create,
3104                ),
3105                error!(EPERM)
3106            );
3107        })
3108        .await;
3109    }
3110
3111    #[::fuchsia::test]
3112    async fn get_security_xattr_succeeds_without_read_access() {
3113        spawn_kernel_and_run(async |locked, current_task| {
3114            let mut creds = Credentials::with_ids(1, 2);
3115            creds.groups = vec![3, 4];
3116            current_task.set_creds(creds);
3117
3118            // Create a node.
3119            let node = &current_task
3120                .fs()
3121                .root()
3122                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
3123                .expect("create_node")
3124                .entry
3125                .node;
3126
3127            // Only give read access to the root and give root access to the current task.
3128            node.update_info(|info| info.mode = mode!(IFREG, 0o100));
3129            current_task.set_creds(Credentials::with_ids(0, 0));
3130
3131            // Setting the label should succeed even without write access to the file.
3132            assert_eq!(
3133                node.set_xattr(
3134                    locked,
3135                    &current_task,
3136                    &MountInfo::detached(),
3137                    "security.name".into(),
3138                    "security_label".into(),
3139                    XattrOp::Create,
3140                ),
3141                Ok(())
3142            );
3143
3144            // Remove root access from the current task.
3145            current_task.set_creds(Credentials::with_ids(1, 1));
3146
3147            // Getting the label should succeed even without read access to the file.
3148            assert_eq!(
3149                node.get_xattr(
3150                    locked,
3151                    &current_task,
3152                    &MountInfo::detached(),
3153                    "security.name".into(),
3154                    4096
3155                ),
3156                Ok(ValueOrSize::Value("security_label".into()))
3157            );
3158        })
3159        .await;
3160    }
3161}