starnix_core/vfs/
fs_node.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::device::DeviceMode;
6use crate::mm::PAGE_SIZE;
7use crate::security::{self, Auditable, PermissionFlags};
8use crate::signals::{SignalInfo, send_standard_signal};
9use crate::task::{CurrentTask, CurrentTaskAndLocked, WaitQueue, Waiter, register_delayed_release};
10use crate::time::utc;
11use crate::vfs::fsverity::FsVerityState;
12use crate::vfs::pipe::{Pipe, PipeHandle};
13use crate::vfs::rw_queue::{RwQueue, RwQueueReadGuard};
14use crate::vfs::socket::SocketHandle;
15use crate::vfs::{
16    DefaultDirEntryOps, DirEntryOps, FileObject, FileObjectState, FileOps, FileSystem,
17    FileSystemHandle, FileWriteGuardState, FsStr, FsString, MAX_LFS_FILESIZE, MountInfo,
18    NamespaceNode, OPathOps, RecordLockCommand, RecordLockOwner, RecordLocks, WeakFileHandle,
19    checked_add_offset_and_length, inotify,
20};
21use bitflags::bitflags;
22use fuchsia_runtime::UtcInstant;
23use linux_uapi::{XATTR_SECURITY_PREFIX, XATTR_SYSTEM_PREFIX, XATTR_TRUSTED_PREFIX};
24use once_cell::race::OnceBool;
25use starnix_crypt::EncryptionKeyId;
26use starnix_lifecycle::{ObjectReleaser, ReleaserAction};
27use starnix_logging::{log_error, track_stub};
28use starnix_sync::{
29    BeforeFsNodeAppend, FileOpsCore, FsNodeAppend, LockBefore, LockEqualOrBefore, Locked, Mutex,
30    RwLock, RwLockReadGuard, Unlocked,
31};
32use starnix_types::ownership::{Releasable, ReleaseGuard};
33use starnix_types::time::{NANOS_PER_SECOND, timespec_from_time};
34use starnix_uapi::as_any::AsAny;
35use starnix_uapi::auth::{
36    CAP_CHOWN, CAP_DAC_OVERRIDE, CAP_DAC_READ_SEARCH, CAP_FOWNER, CAP_FSETID, CAP_MKNOD,
37    CAP_SYS_ADMIN, CAP_SYS_RESOURCE, FsCred, UserAndOrGroupId,
38};
39use starnix_uapi::device_type::DeviceType;
40use starnix_uapi::errors::{EACCES, ENOTSUP, EPERM, Errno};
41use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
42use starnix_uapi::inotify_mask::InotifyMask;
43use starnix_uapi::mount_flags::MountFlags;
44use starnix_uapi::open_flags::OpenFlags;
45use starnix_uapi::resource_limits::Resource;
46use starnix_uapi::seal_flags::SealFlags;
47use starnix_uapi::signals::SIGXFSZ;
48use starnix_uapi::{
49    FALLOC_FL_COLLAPSE_RANGE, FALLOC_FL_INSERT_RANGE, FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE,
50    FALLOC_FL_UNSHARE_RANGE, FALLOC_FL_ZERO_RANGE, LOCK_EX, LOCK_NB, LOCK_SH, LOCK_UN,
51    STATX__RESERVED, STATX_ATIME, STATX_ATTR_VERITY, STATX_BASIC_STATS, STATX_BLOCKS, STATX_CTIME,
52    STATX_GID, STATX_INO, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_UID, XATTR_USER_PREFIX,
53    errno, error, fsverity_descriptor, gid_t, ino_t, statx, statx_timestamp, timespec, uapi, uid_t,
54};
55use std::sync::atomic::Ordering;
56use std::sync::{Arc, OnceLock, Weak};
57use syncio::zxio_node_attr_has_t;
58
/// Whether a node can be linked into a directory.
///
/// The default is [`FsNodeLinkBehavior::Allowed`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum FsNodeLinkBehavior {
    /// The node may be linked into a directory. This is the default.
    #[default]
    Allowed,
    /// The node must not be linked into a directory.
    Disallowed,
}
70
/// Guard handed to operations that must run with a node's append lock held.
pub enum AppendLockGuard<'a> {
    /// A read guard on the append lock, acquired for this operation.
    Read(RwQueueReadGuard<'a, FsNodeAppend>),
    /// A reference to a guard that an outer caller is already holding.
    AlreadyLocked(&'a AppendLockGuard<'a>),
}
75
/// Abstraction over how the append lock of a node is obtained, parameterized by the lock level
/// `L` the caller currently holds.
pub trait AppendLockStrategy<L> {
    /// Helper method for acquiring append lock in `truncate`/`allocate`. Acquires the lock when it's not already acquired.
    ///
    /// On success, returns the guard together with a `Locked<FileOpsCore>` token to use for the
    /// rest of the operation.
    fn lock<'a>(
        &'a self,
        locked: &'a mut Locked<L>,
        current_task: &CurrentTask,
        node: &'a FsNode,
    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FileOpsCore>), Errno>;
}
85
86struct RealAppendLockStrategy {}
87
88impl AppendLockStrategy<BeforeFsNodeAppend> for RealAppendLockStrategy {
89    fn lock<'a>(
90        &'a self,
91        locked: &'a mut Locked<BeforeFsNodeAppend>,
92        current_task: &CurrentTask,
93        node: &'a FsNode,
94    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FileOpsCore>), Errno> {
95        let (guard, new_locked) = node.ops().append_lock_read(locked, node, current_task)?;
96        Ok((AppendLockGuard::Read(guard), new_locked.cast_locked()))
97    }
98}
99
100pub struct AlreadyLockedAppendLockStrategy<'a> {
101    // Keep the reference to the guard, which will be returned in subsequent attempts to acquire this lock.
102    guard: &'a AppendLockGuard<'a>,
103}
104
105impl<'a> AlreadyLockedAppendLockStrategy<'a> {
106    pub fn new(guard: &'a AppendLockGuard<'a>) -> Self {
107        Self { guard }
108    }
109}
110
111impl AppendLockStrategy<FileOpsCore> for AlreadyLockedAppendLockStrategy<'_> {
112    fn lock<'a>(
113        &'a self,
114        locked: &'a mut Locked<FileOpsCore>,
115        _current_task: &CurrentTask,
116        _node: &'a FsNode,
117    ) -> Result<(AppendLockGuard<'a>, &'a mut Locked<FileOpsCore>), Errno> {
118        Ok((AppendLockGuard::AlreadyLocked(self.guard), locked.cast_locked::<FileOpsCore>()))
119    }
120}
121
/// A node of a file system, identified by its inode number.
///
/// File-system specific behavior is supplied through the boxed [`FsNodeOps`]; the remaining
/// fields hold the locks and state attached to the node.
pub struct FsNode {
    /// The inode number for this FsNode.
    pub ino: ino_t,

    /// The FsNodeOps for this FsNode.
    ///
    /// The FsNodeOps are implemented by the individual file systems to provide
    /// specific behaviors for this FsNode.
    ops: Box<dyn FsNodeOps>,

    /// The FileSystem that owns this FsNode's tree.
    fs: Weak<FileSystem>,

    /// A RwLock to synchronize append operations for this node.
    ///
    /// FileObjects writing with O_APPEND should grab a write() lock on this
    /// field to ensure they operate sequentially. FileObjects writing without
    /// O_APPEND should grab read() lock so that they can operate in parallel.
    pub append_lock: RwQueue<FsNodeAppend>,

    /// Mutable information about this node.
    ///
    /// This data is used to populate the uapi::stat structure.
    info: RwLock<FsNodeInfo>,

    /// Data associated with an FsNode that is rarely needed.
    rare_data: OnceLock<Box<FsNodeRareData>>,

    /// Tracks lock state for this file.
    pub write_guard_state: Mutex<FileWriteGuardState>,

    /// Cached FsVerity state associated with this node.
    pub fsverity: Mutex<FsVerityState>,

    /// The security state associated with this node. Must always be acquired last
    /// relative to other `FsNode` locks.
    pub security_state: security::FsNodeState,
}
160
/// State of an `FsNode` that is rarely needed; stored boxed behind `FsNode::rare_data`.
#[derive(Default)]
struct FsNodeRareData {
    /// The pipe located at this node, if any.
    ///
    /// Used if, and only if, the node has a mode of FileMode::IFIFO.
    fifo: OnceLock<PipeHandle>,

    /// The UNIX domain socket bound to this node, if any.
    bound_socket: OnceLock<SocketHandle>,

    /// Information about the locking information on this node.
    ///
    /// No other lock on this object may be taken while this lock is held.
    flock_info: Mutex<FlockInfo>,

    /// Records locks associated with this node.
    record_locks: RecordLocks,

    /// Whether this node can be linked into a directory.
    ///
    /// Only set for nodes created with `O_TMPFILE`.
    link_behavior: OnceLock<FsNodeLinkBehavior>,

    /// Inotify watchers on this node. See inotify(7).
    watchers: inotify::InotifyWatchers,
}
187
188impl FsNodeRareData {
189    fn ensure_fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
190        self.fifo.get_or_init(|| {
191            let mut default_pipe_capacity = (*PAGE_SIZE * 16) as usize;
192            if !security::is_task_capable_noaudit(current_task, CAP_SYS_RESOURCE) {
193                let kernel = current_task.kernel();
194                let max_size = kernel.system_limits.pipe_max_size.load(Ordering::Relaxed);
195                default_pipe_capacity = std::cmp::min(default_pipe_capacity, max_size);
196            }
197            Pipe::new(default_pipe_capacity)
198        })
199    }
200}
201
/// Release policy for `FsNode`: the node is handed to `register_delayed_release`.
pub enum FsNodeReleaserAction {}
impl ReleaserAction<FsNode> for FsNodeReleaserAction {
    /// Forwards the node being released to `register_delayed_release`.
    fn release(fs_node: ReleaseGuard<FsNode>) {
        register_delayed_release(fs_node);
    }
}
/// An `FsNode` wrapped in an `ObjectReleaser` that applies `FsNodeReleaserAction`.
pub type FsNodeReleaser = ObjectReleaser<FsNode, FsNodeReleaserAction>;
/// Strong, shared handle to an `FsNode`.
pub type FsNodeHandle = Arc<FsNodeReleaser>;
/// Weak counterpart of `FsNodeHandle`.
pub type WeakFsNodeHandle = Weak<FsNodeReleaser>;
211
/// Mutable attributes of an `FsNode` (mode, ownership, sizes, timestamps, ...).
///
/// `FsNode` keeps one of these behind a lock and uses it to populate the `uapi::stat` structure.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct FsNodeInfo {
    pub mode: FileMode,
    pub link_count: usize,
    pub uid: uid_t,
    pub gid: gid_t,
    pub rdev: DeviceType,
    pub size: usize,
    pub blksize: usize,
    pub blocks: usize,
    pub time_status_change: UtcInstant,
    pub time_access: UtcInstant,
    pub time_modify: UtcInstant,
    pub casefold: bool,
    // If this node is fscrypt encrypted, stores the id of the user wrapping key used to encrypt it.
    pub wrapping_key_id: Option<[u8; 16]>,
    // Used to indicate to filesystems that manage timestamps that an access has occurred and to
    // update the node's atime.
    // This only impacts accesses within Starnix. Most Fuchsia programs are not expected to maintain
    // access times. If the file handle is transferred out of Starnix, there may be inconsistencies.
    pub pending_time_access_update: bool,
}
234
235impl FsNodeInfo {
236    pub fn new(mode: FileMode, owner: FsCred) -> Self {
237        let now = utc::utc_now();
238        Self {
239            mode,
240            link_count: if mode.is_dir() { 2 } else { 1 },
241            uid: owner.uid,
242            gid: owner.gid,
243            blksize: DEFAULT_BYTES_PER_BLOCK,
244            time_status_change: now,
245            time_access: now,
246            time_modify: now,
247            ..Default::default()
248        }
249    }
250
251    pub fn storage_size(&self) -> usize {
252        self.blksize.saturating_mul(self.blocks)
253    }
254
255    pub fn chmod(&mut self, mode: FileMode) {
256        self.mode = (self.mode & !FileMode::PERMISSIONS) | (mode & FileMode::PERMISSIONS);
257    }
258
259    pub fn chown(&mut self, owner: Option<uid_t>, group: Option<gid_t>) {
260        if let Some(owner) = owner {
261            self.uid = owner;
262        }
263        if let Some(group) = group {
264            self.gid = group;
265        }
266        // Clear the setuid and setgid bits if the file is executable and a regular file.
267        if self.mode.is_reg() {
268            self.mode &= !FileMode::ISUID;
269            self.clear_sgid_bit();
270        }
271    }
272
273    fn clear_sgid_bit(&mut self) {
274        // If the group execute bit is not set, the setgid bit actually indicates mandatory
275        // locking and should not be cleared.
276        if self.mode.intersects(FileMode::IXGRP) {
277            self.mode &= !FileMode::ISGID;
278        }
279    }
280
281    fn clear_suid_and_sgid_bits(&mut self) {
282        self.mode &= !FileMode::ISUID;
283        self.clear_sgid_bit();
284    }
285
286    pub fn cred(&self) -> FsCred {
287        FsCred { uid: self.uid, gid: self.gid }
288    }
289
290    pub fn suid_and_sgid(
291        &self,
292        current_task: &CurrentTask,
293        fs_node: &FsNode,
294    ) -> Result<UserAndOrGroupId, Errno> {
295        let uid = self.mode.contains(FileMode::ISUID).then_some(self.uid);
296
297        // See <https://man7.org/linux/man-pages/man7/inode.7.html>:
298        //
299        //   For an executable file, the set-group-ID bit causes the
300        //   effective group ID of a process that executes the file to change
301        //   as described in execve(2).  For a file that does not have the
302        //   group execution bit (S_IXGRP) set, the set-group-ID bit indicates
303        //   mandatory file/record locking.
304        let gid = self.mode.contains(FileMode::ISGID | FileMode::IXGRP).then_some(self.gid);
305
306        let maybe_set_id = UserAndOrGroupId { uid, gid };
307        if maybe_set_id.is_some() {
308            // Check that uid and gid actually have execute access before
309            // returning them as the SUID or SGID.
310            check_access(
311                fs_node,
312                current_task,
313                security::PermissionFlags::EXEC,
314                self.uid,
315                self.gid,
316                self.mode,
317            )?;
318        }
319        Ok(maybe_set_id)
320    }
321}
322
323#[derive(Default)]
324struct FlockInfo {
325    /// Whether the node is currently locked. The meaning of the different values are:
326    /// - `None`: The node is not locked.
327    /// - `Some(false)`: The node is locked non exclusively.
328    /// - `Some(true)`: The node is locked exclusively.
329    locked_exclusive: Option<bool>,
330    /// The FileObject that hold the lock.
331    locking_handles: Vec<WeakFileHandle>,
332    /// The queue to notify process waiting on the lock.
333    wait_queue: WaitQueue,
334}
335
336impl FlockInfo {
337    /// Removes all file handle not holding `predicate` from the list of object holding the lock. If
338    /// this empties the list, unlocks the node and notifies all waiting processes.
339    pub fn retain<F>(&mut self, predicate: F)
340    where
341        F: Fn(&FileObject) -> bool,
342    {
343        if !self.locking_handles.is_empty() {
344            self.locking_handles
345                .retain(|w| if let Some(fh) = w.upgrade() { predicate(&fh) } else { false });
346            if self.locking_handles.is_empty() {
347                self.locked_exclusive = None;
348                self.wait_queue.notify_all();
349            }
350        }
351    }
352}
353
/// Default block size in bytes; `FsNodeInfo::new` stores it in `FsNodeInfo::blksize`.
///
/// `st_blksize` is measured in units of 512 bytes.
pub const DEFAULT_BYTES_PER_BLOCK: usize = 512;
356
357pub struct FlockOperation {
358    operation: u32,
359}
360
361impl FlockOperation {
362    pub fn from_flags(operation: u32) -> Result<Self, Errno> {
363        if operation & !(LOCK_SH | LOCK_EX | LOCK_UN | LOCK_NB) != 0 {
364            return error!(EINVAL);
365        }
366        if [LOCK_SH, LOCK_EX, LOCK_UN].iter().filter(|&&o| operation & o == o).count() != 1 {
367            return error!(EINVAL);
368        }
369        Ok(Self { operation })
370    }
371
372    pub fn is_unlock(&self) -> bool {
373        self.operation & LOCK_UN > 0
374    }
375
376    pub fn is_lock_exclusive(&self) -> bool {
377        self.operation & LOCK_EX > 0
378    }
379
380    pub fn is_blocking(&self) -> bool {
381        self.operation & LOCK_NB == 0
382    }
383}
384
impl FileObject {
    /// Advisory locking.
    ///
    /// See flock(2).
    ///
    /// Applies `operation` to the flock state stored in the rare data of this file's node.
    ///
    /// # Errors
    ///
    /// - `EBADF` if this file was opened with `OpenFlags::PATH`.
    /// - `EAGAIN` if the lock cannot be granted right now and the operation is non-blocking.
    /// - Any error returned while waiting for the lock to be released.
    pub fn flock(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        operation: FlockOperation,
    ) -> Result<(), Errno> {
        if self.flags().contains(OpenFlags::PATH) {
            return error!(EBADF);
        }
        // Each iteration re-acquires the flock state and re-evaluates the request from scratch.
        loop {
            let mut flock_info = self.name.entry.node.ensure_rare_data().flock_info.lock();
            if operation.is_unlock() {
                // Drop this file object from the holders; `retain` unlocks the node and wakes
                // waiters if no holder remains.
                flock_info.retain(|fh| !std::ptr::eq(fh, self));
                return Ok(());
            }
            // Operation is a locking operation.
            // 1. File is not locked
            if flock_info.locked_exclusive.is_none() {
                flock_info.locked_exclusive = Some(operation.is_lock_exclusive());
                flock_info.locking_handles.push(self.weak_handle.clone());
                return Ok(());
            }

            let file_lock_is_exclusive = flock_info.locked_exclusive == Some(true);
            // Holders are identified by pointer equality with this `FileObject`.
            let fd_has_lock = flock_info
                .locking_handles
                .iter()
                .find_map(|w| {
                    w.upgrade().and_then(|fh| {
                        if std::ptr::eq(&fh as &FileObject, self) { Some(()) } else { None }
                    })
                })
                .is_some();

            // 2. File is locked, but fd already have a lock
            if fd_has_lock {
                if operation.is_lock_exclusive() == file_lock_is_exclusive {
                    // Correct lock is already held, return.
                    return Ok(());
                } else {
                    // Incorrect lock is held. Release the lock and loop back to try to reacquire
                    // it. flock doesn't guarantee atomic lock type switching.
                    flock_info.retain(|fh| !std::ptr::eq(fh, self));
                    continue;
                }
            }

            // 3. File is locked, and fd doesn't have a lock.
            if !file_lock_is_exclusive && !operation.is_lock_exclusive() {
                // The lock is not exclusive, let's grab it.
                flock_info.locking_handles.push(self.weak_handle.clone());
                return Ok(());
            }

            // 4. The operation cannot be done at this time.
            if !operation.is_blocking() {
                return error!(EAGAIN);
            }

            // Register a waiter to be notified when the lock is released. Release the lock on
            // FlockInfo, and wait.
            let waiter = Waiter::new();
            flock_info.wait_queue.wait_async(&waiter);
            std::mem::drop(flock_info);
            waiter.wait(locked, current_task)?;
        }
    }
}
457
// The inner mod is required because bitflags cannot pass the attribute through to the single
// variant, and attributes cannot be applied to macro invocations.
mod inner_flags {
    // Part of the code for the AT_STATX_SYNC_AS_STAT case that's produced by the macro triggers the
    // lint, but as a whole, the produced code is still correct.
    #![allow(clippy::bad_bit_mask)] // TODO(b/303500202) Remove once addressed in bitflags.
    use super::{bitflags, uapi};

    // Each flag takes the value of the `uapi` constant of the same name.
    bitflags! {
        #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub struct StatxFlags: u32 {
            const AT_SYMLINK_NOFOLLOW = uapi::AT_SYMLINK_NOFOLLOW;
            const AT_EMPTY_PATH = uapi::AT_EMPTY_PATH;
            const AT_NO_AUTOMOUNT = uapi::AT_NO_AUTOMOUNT;
            const AT_STATX_SYNC_AS_STAT = uapi::AT_STATX_SYNC_AS_STAT;
            const AT_STATX_FORCE_SYNC = uapi::AT_STATX_FORCE_SYNC;
            const AT_STATX_DONT_SYNC = uapi::AT_STATX_DONT_SYNC;
            const STATX_ATTR_VERITY = uapi::STATX_ATTR_VERITY;
        }
    }
}

// Re-export so that `StatxFlags` can be used without naming the helper module.
pub use inner_flags::StatxFlags;
481
/// The kind of child an unlink operation is expected to remove.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum UnlinkKind {
    /// Unlink a directory.
    Directory,

    /// Unlink a non-directory.
    NonDirectory,
}
490
/// The target of a symlink, as returned by `FsNodeOps::readlink`.
pub enum SymlinkTarget {
    /// The target expressed as a path string.
    Path(FsString),
    /// The target expressed directly as a `NamespaceNode`.
    Node(NamespaceNode),
}
495
496#[derive(Clone, Copy, PartialEq, Eq)]
497pub enum XattrOp {
498    /// Set the value of the extended attribute regardless of whether it exists.
499    Set,
500    /// Create a new extended attribute. Fail if it already exists.
501    Create,
502    /// Replace the value of the extended attribute. Fail if it doesn't exist.
503    Replace,
504}
505
506impl XattrOp {
507    pub fn into_flags(self) -> u32 {
508        match self {
509            Self::Set => 0,
510            Self::Create => uapi::XATTR_CREATE,
511            Self::Replace => uapi::XATTR_REPLACE,
512        }
513    }
514}
515
/// Either a value, or the size required to contain it.
#[derive(Clone, Debug, PartialEq)]
pub enum ValueOrSize<T> {
    /// The value itself.
    Value(T),
    /// The number of bytes needed to hold the value.
    Size(usize),
}

impl<T> ValueOrSize<T> {
    /// Applies `f` to a contained value; a `Size` is passed through unchanged.
    pub fn map<F, U>(self, f: F) -> ValueOrSize<U>
    where
        F: FnOnce(T) -> U,
    {
        match self {
            ValueOrSize::Value(value) => ValueOrSize::Value(f(value)),
            ValueOrSize::Size(size) => ValueOrSize::Size(size),
        }
    }

    /// Test-only accessor for the contained value.
    ///
    /// # Panics
    ///
    /// Panics if `self` is a `Size`.
    #[cfg(test)]
    pub fn unwrap(self) -> T {
        if let ValueOrSize::Value(value) = self {
            value
        } else {
            panic!("Unwrap ValueOrSize that is a Size")
        }
    }
}

impl<T> From<T> for ValueOrSize<T> {
    /// Wraps `t` as `ValueOrSize::Value`.
    fn from(t: T) -> Self {
        ValueOrSize::Value(t)
    }
}
548
549#[derive(Copy, Clone, Eq, PartialEq, Debug)]
550pub enum FallocMode {
551    Allocate { keep_size: bool },
552    PunchHole,
553    Collapse,
554    Zero { keep_size: bool },
555    InsertRange,
556    UnshareRange,
557}
558
559impl FallocMode {
560    pub fn from_bits(mode: u32) -> Option<Self> {
561        // `fallocate()` allows only the following values for `mode`.
562        if mode == 0 {
563            Some(Self::Allocate { keep_size: false })
564        } else if mode == FALLOC_FL_KEEP_SIZE {
565            Some(Self::Allocate { keep_size: true })
566        } else if mode == FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE {
567            Some(Self::PunchHole)
568        } else if mode == FALLOC_FL_COLLAPSE_RANGE {
569            Some(Self::Collapse)
570        } else if mode == FALLOC_FL_ZERO_RANGE {
571            Some(Self::Zero { keep_size: false })
572        } else if mode == FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE {
573            Some(Self::Zero { keep_size: true })
574        } else if mode == FALLOC_FL_INSERT_RANGE {
575            Some(Self::InsertRange)
576        } else if mode == FALLOC_FL_UNSHARE_RANGE {
577            Some(Self::UnshareRange)
578        } else {
579            None
580        }
581    }
582}
583
/// Why an access check is being performed; passed as `reason` to `FsNodeOps::check_access`.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum CheckAccessReason {
    Access,
    Chdir,
    Chroot,
    Exec,
    ChangeTimestamps { now: bool },
    InternalPermissionChecks,
}
593
594pub trait FsNodeOps: Send + Sync + AsAny + 'static {
595    /// Delegate the access check to the node.
596    fn check_access(
597        &self,
598        _locked: &mut Locked<FileOpsCore>,
599        node: &FsNode,
600        current_task: &CurrentTask,
601        access: security::PermissionFlags,
602        info: &RwLock<FsNodeInfo>,
603        reason: CheckAccessReason,
604        audit_context: security::Auditable<'_>,
605    ) -> Result<(), Errno> {
606        node.default_check_access_impl(current_task, access, reason, info.read(), audit_context)
607    }
608
609    /// Build the [`DirEntryOps`] for a new [`DirEntry`] that will be associated
610    /// to this node.
611    fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
612        Box::new(DefaultDirEntryOps)
613    }
614
615    /// Build the `FileOps` for the file associated to this node.
616    ///
617    /// The returned FileOps will be used to create a FileObject, which might
618    /// be assigned an FdNumber.
619    fn create_file_ops(
620        &self,
621        locked: &mut Locked<FileOpsCore>,
622        node: &FsNode,
623        _current_task: &CurrentTask,
624        flags: OpenFlags,
625    ) -> Result<Box<dyn FileOps>, Errno>;
626
627    /// Find an existing child node and populate the child parameter. Return the node.
628    ///
629    /// The child parameter is an empty node. Operations other than initialize may panic before
630    /// initialize is called.
631    fn lookup(
632        &self,
633        _locked: &mut Locked<FileOpsCore>,
634        _node: &FsNode,
635        _current_task: &CurrentTask,
636        name: &FsStr,
637    ) -> Result<FsNodeHandle, Errno> {
638        // The default implementation here is suitable for filesystems that have permanent entries;
639        // entries that already exist will get found in the cache and shouldn't get this far.
640        error!(ENOENT, format!("looking for {name}"))
641    }
642
643    /// Create and return the given child node.
644    ///
645    /// The mode field of the FsNodeInfo indicates what kind of child to
646    /// create.
647    ///
648    /// This function is never called with FileMode::IFDIR. The mkdir function
649    /// is used to create directories instead.
650    fn mknod(
651        &self,
652        locked: &mut Locked<FileOpsCore>,
653        _node: &FsNode,
654        _current_task: &CurrentTask,
655        _name: &FsStr,
656        _mode: FileMode,
657        _dev: DeviceType,
658        _owner: FsCred,
659    ) -> Result<FsNodeHandle, Errno>;
660
661    /// Create and return the given child node as a subdirectory.
662    fn mkdir(
663        &self,
664        locked: &mut Locked<FileOpsCore>,
665        _node: &FsNode,
666        _current_task: &CurrentTask,
667        _name: &FsStr,
668        _mode: FileMode,
669        _owner: FsCred,
670    ) -> Result<FsNodeHandle, Errno>;
671
672    /// Creates a symlink with the given `target` path.
673    fn create_symlink(
674        &self,
675        locked: &mut Locked<FileOpsCore>,
676        _node: &FsNode,
677        _current_task: &CurrentTask,
678        _name: &FsStr,
679        _target: &FsStr,
680        _owner: FsCred,
681    ) -> Result<FsNodeHandle, Errno>;
682
683    /// Creates an anonymous file.
684    ///
685    /// The FileMode::IFMT of the FileMode is always FileMode::IFREG.
686    ///
687    /// Used by O_TMPFILE.
688    fn create_tmpfile(
689        &self,
690        _node: &FsNode,
691        _current_task: &CurrentTask,
692        _mode: FileMode,
693        _owner: FsCred,
694    ) -> Result<FsNodeHandle, Errno> {
695        error!(EOPNOTSUPP)
696    }
697
698    /// Reads the symlink from this node.
699    fn readlink(
700        &self,
701        _locked: &mut Locked<FileOpsCore>,
702        _node: &FsNode,
703        _current_task: &CurrentTask,
704    ) -> Result<SymlinkTarget, Errno> {
705        error!(EINVAL)
706    }
707
708    /// Create a hard link with the given name to the given child.
709    fn link(
710        &self,
711        _locked: &mut Locked<FileOpsCore>,
712        _node: &FsNode,
713        _current_task: &CurrentTask,
714        _name: &FsStr,
715        _child: &FsNodeHandle,
716    ) -> Result<(), Errno> {
717        error!(EPERM)
718    }
719
720    /// Remove the child with the given name, if the child exists.
721    ///
722    /// The UnlinkKind parameter indicates whether the caller intends to unlink
723    /// a directory or a non-directory child.
724    fn unlink(
725        &self,
726        locked: &mut Locked<FileOpsCore>,
727        _node: &FsNode,
728        _current_task: &CurrentTask,
729        _name: &FsStr,
730        _child: &FsNodeHandle,
731    ) -> Result<(), Errno>;
732
733    /// Acquire the necessary append lock for the operations that depend on them.
734    /// Should be done before calling `allocate` or `truncate` to avoid lock ordering issues.
735    fn append_lock_read<'a>(
736        &'a self,
737        locked: &'a mut Locked<BeforeFsNodeAppend>,
738        node: &'a FsNode,
739        current_task: &CurrentTask,
740    ) -> Result<(RwQueueReadGuard<'a, FsNodeAppend>, &'a mut Locked<FsNodeAppend>), Errno> {
741        return node.append_lock.read_and(locked, current_task);
742    }
743
744    /// Change the length of the file.
745    fn truncate(
746        &self,
747        _locked: &mut Locked<FileOpsCore>,
748        _guard: &AppendLockGuard<'_>,
749        _node: &FsNode,
750        _current_task: &CurrentTask,
751        _length: u64,
752    ) -> Result<(), Errno> {
753        error!(EINVAL)
754    }
755
756    /// Manipulate allocated disk space for the file.
757    fn allocate(
758        &self,
759        _locked: &mut Locked<FileOpsCore>,
760        _guard: &AppendLockGuard<'_>,
761        _node: &FsNode,
762        _current_task: &CurrentTask,
763        _mode: FallocMode,
764        _offset: u64,
765        _length: u64,
766    ) -> Result<(), Errno> {
767        error!(EINVAL)
768    }
769
770    /// Update the supplied info with initial state (e.g. size) for the node.
771    ///
772    /// FsNode calls this method when created, to allow the FsNodeOps to
773    /// set appropriate initial values in the FsNodeInfo.
774    fn initial_info(&self, _info: &mut FsNodeInfo) {}
775
776    /// Update node.info as needed.
777    ///
778    /// FsNode calls this method before converting the FsNodeInfo struct into
779    /// the uapi::stat struct to give the file system a chance to update this data
780    /// before it is used by clients.
781    ///
782    /// File systems that keep the FsNodeInfo up-to-date do not need to
783    /// override this function.
784    ///
785    /// Return a read guard for the updated information.
786    fn fetch_and_refresh_info<'a>(
787        &self,
788        _locked: &mut Locked<FileOpsCore>,
789        _node: &FsNode,
790        _current_task: &CurrentTask,
791        info: &'a RwLock<FsNodeInfo>,
792    ) -> Result<RwLockReadGuard<'a, FsNodeInfo>, Errno> {
793        Ok(info.read())
794    }
795
796    /// Update node attributes persistently.
797    fn update_attributes(
798        &self,
799        _locked: &mut Locked<FileOpsCore>,
800        _current_task: &CurrentTask,
801        _info: &FsNodeInfo,
802        _has: zxio_node_attr_has_t,
803    ) -> Result<(), Errno> {
804        Ok(())
805    }
806
807    /// Get an extended attribute on the node.
808    ///
809    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
810    /// instead return the size of the attribute, and can return an ERANGE error if max_size is not
811    /// 0, and lesser than the required size.
812    fn get_xattr(
813        &self,
814        _locked: &mut Locked<FileOpsCore>,
815        _node: &FsNode,
816        _current_task: &CurrentTask,
817        _name: &FsStr,
818        _max_size: usize,
819    ) -> Result<ValueOrSize<FsString>, Errno> {
820        error!(ENOTSUP)
821    }
822
823    /// Set an extended attribute on the node.
824    fn set_xattr(
825        &self,
826        _locked: &mut Locked<FileOpsCore>,
827        _node: &FsNode,
828        _current_task: &CurrentTask,
829        _name: &FsStr,
830        _value: &FsStr,
831        _op: XattrOp,
832    ) -> Result<(), Errno> {
833        error!(ENOTSUP)
834    }
835
836    fn remove_xattr(
837        &self,
838        _locked: &mut Locked<FileOpsCore>,
839        _node: &FsNode,
840        _current_task: &CurrentTask,
841        _name: &FsStr,
842    ) -> Result<(), Errno> {
843        error!(ENOTSUP)
844    }
845
846    /// An implementation can systematically return a value. Otherwise, if `max_size` is 0, it can
847    /// instead return the size of the 0 separated string needed to represent the value, and can
848    /// return an ERANGE error if max_size is not 0, and lesser than the required size.
849    fn list_xattrs(
850        &self,
851        _locked: &mut Locked<FileOpsCore>,
852        _node: &FsNode,
853        _current_task: &CurrentTask,
854        _max_size: usize,
855    ) -> Result<ValueOrSize<Vec<FsString>>, Errno> {
856        error!(ENOTSUP)
857    }
858
859    /// Called when the FsNode is freed by the Kernel.
860    fn forget(
861        self: Box<Self>,
862        _locked: &mut Locked<FileOpsCore>,
863        _current_task: &CurrentTask,
864        _info: FsNodeInfo,
865    ) -> Result<(), Errno> {
866        Ok(())
867    }
868
869    ////////////////////
870    // FS-Verity operations
871
872    /// Marks that FS-Verity is being built. Writes fsverity descriptor and merkle tree, the latter
873    /// computed by the filesystem.
874    /// This should ensure there are no writable file handles. Returns EEXIST if the file was
875    /// already fsverity-enabled. Returns EBUSY if this ioctl was already running on this file.
876    fn enable_fsverity(&self, _descriptor: &fsverity_descriptor) -> Result<(), Errno> {
877        error!(ENOTSUP)
878    }
879
880    /// Read fsverity descriptor, if the node is fsverity-enabled. Else returns ENODATA.
881    fn get_fsverity_descriptor(&self, _log_blocksize: u8) -> Result<fsverity_descriptor, Errno> {
882        error!(ENOTSUP)
883    }
884
    /// Returns a descriptive name for this node, suitable to report to userspace in situations
    /// where the node's path is unavailable (e.g. because it is anonymous, and has no path).
    /// If no name is returned then a default name of the form "<class>:[<node_id>]" will be used.
    ///
    /// The default implementation returns `None`.
    fn internal_name(&self, _node: &FsNode) -> Option<FsString> {
        None
    }
891
    /// The key used to identify this node in the file system's node cache.
    ///
    /// For many file systems, this will be the same as the inode number. However, some file
    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
    ///
    /// The default implementation returns the node's inode number (`node.ino`).
    fn node_key(&self, node: &FsNode) -> ino_t {
        node.ino
    }
899}
900
901impl<T> From<T> for Box<dyn FsNodeOps>
902where
903    T: FsNodeOps,
904{
905    fn from(ops: T) -> Box<dyn FsNodeOps> {
906        Box::new(ops)
907    }
908}
909
/// Implements [`FsNodeOps`] methods in a way that makes sense for symlinks.
/// You must implement [`FsNodeOps::readlink`].
///
/// Expands to the directory-only stubs from [`fs_node_impl_not_dir`] plus a `create_file_ops`
/// that panics, because symlink nodes are never opened.
///
/// All types in the generated signatures are fully qualified so that the macro can be invoked
/// from modules that do not import them.
#[macro_export]
macro_rules! fs_node_impl_symlink {
    () => {
        $crate::vfs::fs_node_impl_not_dir!();

        fn create_file_ops(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _flags: starnix_uapi::open_flags::OpenFlags,
        ) -> Result<Box<dyn $crate::vfs::FileOps>, starnix_uapi::errors::Errno> {
            // Reaching this point at all is a bug; the assert narrows down whether the node was
            // even a symlink before the unconditional panic below.
            assert!(node.is_lnk());
            unreachable!("Symlink nodes cannot be opened.");
        }
    };
}
929
/// Implements the mutating directory methods of [`FsNodeOps`] for a read-only directory.
///
/// The generated `check_access` rejects any request that includes write access with `EROFS` and
/// otherwise defers to `FsNode::default_check_access_impl`. The generated `mkdir`, `mknod`,
/// `create_symlink`, `link` and `unlink` always fail with `EROFS`.
///
/// All types in the generated signatures are fully qualified so that the macro can be invoked
/// from modules that do not import them.
#[macro_export]
macro_rules! fs_node_impl_dir_readonly {
    () => {
        fn check_access(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            node: &$crate::vfs::FsNode,
            current_task: &$crate::task::CurrentTask,
            permission_flags: $crate::security::PermissionFlags,
            info: &starnix_sync::RwLock<$crate::vfs::FsNodeInfo>,
            reason: $crate::vfs::CheckAccessReason,
            audit_context: $crate::security::Auditable<'_>,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            // Writes are refused up front, before the regular permission checks run.
            let access = permission_flags.as_access();
            if access.contains(starnix_uapi::file_mode::Access::WRITE) {
                return starnix_uapi::error!(
                    EROFS,
                    format!("check_access failed: read-only directory")
                );
            }
            node.default_check_access_impl(
                current_task,
                permission_flags,
                reason,
                info.read(),
                audit_context,
            )
        }

        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mkdir failed: {:?}", name))
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_type::DeviceType,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("mknod failed: {:?}", name))
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("symlink failed: {:?}", name))
        }

        fn link(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("link failed: {:?}", name))
        }

        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            starnix_uapi::error!(EROFS, format!("unlink failed: {:?}", name))
        }
    };
}
1019
/// Trait that objects can implement if they need to handle extended attribute storage. Allows
/// delegating extended attribute operations in [`FsNodeOps`] to another object.
///
/// Compared to the corresponding [`FsNodeOps`] methods, these take neither the node, the current
/// task, nor a size hint, and they return plain values rather than `ValueOrSize`.
///
/// See [`fs_node_impl_xattr_delegate`] for usage details.
pub trait XattrStorage {
    /// Delegate for [`FsNodeOps::get_xattr`].
    fn get_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<FsString, Errno>;

    /// Delegate for [`FsNodeOps::set_xattr`].
    fn set_xattr(
        &self,
        locked: &mut Locked<FileOpsCore>,
        name: &FsStr,
        value: &FsStr,
        op: XattrOp,
    ) -> Result<(), Errno>;

    /// Delegate for [`FsNodeOps::remove_xattr`].
    fn remove_xattr(&self, locked: &mut Locked<FileOpsCore>, name: &FsStr) -> Result<(), Errno>;

    /// Delegate for [`FsNodeOps::list_xattrs`].
    fn list_xattrs(&self, locked: &mut Locked<FileOpsCore>) -> Result<Vec<FsString>, Errno>;
}
1043
/// Implements extended attribute ops for [`FsNodeOps`] by delegating to another object which
/// implements the [`XattrStorage`] trait or a similar interface. For example:
///
/// ```
/// struct Xattrs {}
///
/// impl XattrStorage for Xattrs {
///     // implement XattrStorage
/// }
///
/// struct Node {
///     xattrs: Xattrs
/// }
///
/// impl FsNodeOps for Node {
///     // Delegate extended attribute ops in FsNodeOps to self.xattrs
///     fs_node_impl_xattr_delegate!(self, self.xattrs);
///
///     // add other FsNodeOps impls here
/// }
/// ```
///
/// The generated `get_xattr` and `list_xattrs` ignore the caller's size hint and always convert
/// the delegate's full result with `.into()`.
///
/// All types in the generated signatures are fully qualified so that the macro can be invoked
/// from modules that do not import them.
#[macro_export]
macro_rules! fs_node_impl_xattr_delegate {
    ($self:ident, $delegate:expr) => {
        fn get_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            _size: usize,
        ) -> Result<$crate::vfs::ValueOrSize<$crate::vfs::FsString>, starnix_uapi::errors::Errno> {
            Ok($delegate.get_xattr(locked, name)?.into())
        }

        fn set_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
            value: &$crate::vfs::FsStr,
            op: $crate::vfs::XattrOp,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.set_xattr(locked, name, value, op)
        }

        fn remove_xattr(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            name: &$crate::vfs::FsStr,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            $delegate.remove_xattr(locked, name)
        }

        fn list_xattrs(
            &$self,
            locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _size: usize,
        ) -> Result<$crate::vfs::ValueOrSize<Vec<$crate::vfs::FsString>>, starnix_uapi::errors::Errno> {
            Ok($delegate.list_xattrs(locked)?.into())
        }
    };
}
1112
/// Stubs out [`FsNodeOps`] methods that only apply to directories.
///
/// Every generated method (`lookup`, `mknod`, `mkdir`, `create_symlink`, `unlink`) ignores its
/// arguments and fails with `ENOTDIR`.
#[macro_export]
macro_rules! fs_node_impl_not_dir {
    () => {
        fn lookup(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            Err(starnix_uapi::errno!(ENOTDIR))
        }

        fn mknod(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _dev: starnix_uapi::device_type::DeviceType,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            Err(starnix_uapi::errno!(ENOTDIR))
        }

        fn mkdir(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _mode: starnix_uapi::file_mode::FileMode,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            Err(starnix_uapi::errno!(ENOTDIR))
        }

        fn create_symlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _target: &$crate::vfs::FsStr,
            _owner: starnix_uapi::auth::FsCred,
        ) -> Result<$crate::vfs::FsNodeHandle, starnix_uapi::errors::Errno> {
            Err(starnix_uapi::errno!(ENOTDIR))
        }

        fn unlink(
            &self,
            _locked: &mut starnix_sync::Locked<starnix_sync::FileOpsCore>,
            _node: &$crate::vfs::FsNode,
            _current_task: &$crate::task::CurrentTask,
            _name: &$crate::vfs::FsStr,
            _child: &$crate::vfs::FsNodeHandle,
        ) -> Result<(), starnix_uapi::errors::Errno> {
            Err(starnix_uapi::errno!(ENOTDIR))
        }
    };
}
1176
/// Describes how a single node timestamp should be updated.
///
/// NOTE(review): the variants presumably mirror `UTIME_NOW`, `UTIME_OMIT` and an explicit
/// timestamp as accepted by `utimensat(2)` — confirm against the callers.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TimeUpdateType {
    /// Presumably: set the timestamp to the current time.
    Now,
    /// Presumably: leave the timestamp unchanged.
    Omit,
    /// Presumably: set the timestamp to the given UTC instant.
    Time(UtcInstant),
}
1183
1184// Public re-export of macros allows them to be used like regular rust items.
1185pub use {
1186    fs_node_impl_dir_readonly, fs_node_impl_not_dir, fs_node_impl_symlink,
1187    fs_node_impl_xattr_delegate,
1188};
1189
/// [`FsNodeOps`] for nodes that are never opened through `create_file_ops`.
///
/// NOTE(review): presumably used for device, FIFO and socket nodes, which `FsNode::open`
/// dispatches elsewhere before it would reach `create_file_ops` — confirm against the callers.
pub struct SpecialNode;

impl FsNodeOps for SpecialNode {
    // Directory-only operations fail with ENOTDIR.
    fs_node_impl_not_dir!();

    /// Always panics: reaching this method for a special node is treated as a bug.
    fn create_file_ops(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _node: &FsNode,
        _current_task: &CurrentTask,
        _flags: OpenFlags,
    ) -> Result<Box<dyn FileOps>, Errno> {
        unreachable!("Special nodes cannot be opened.");
    }
}
1205
1206impl FsNode {
1207    /// Create a node without inserting it into the FileSystem node cache.
1208    ///
1209    /// This is usually not what you want!
1210    /// Only use if you're also using get_or_create_node, like ext4.
1211    pub fn new_uncached(
1212        ino: ino_t,
1213        ops: impl Into<Box<dyn FsNodeOps>>,
1214        fs: &FileSystemHandle,
1215        info: FsNodeInfo,
1216    ) -> FsNodeHandle {
1217        let ops = ops.into();
1218        FsNodeHandle::new(Self::new_internal(ino, ops, Arc::downgrade(fs), info).into())
1219    }
1220
    /// Constructs the `FsNode` value itself.
    ///
    /// `ops.initial_info` is given a chance to adjust `info` before it is stored. All remaining
    /// fields start from their defaults, and fsverity starts as `FsVerityState::None`.
    ///
    /// In test and debug-assertion builds every lock of the new node is acquired once, in a fixed
    /// order, before the node is returned.
    /// NOTE(review): presumably this registers the expected lock ordering with the lock checker —
    /// confirm.
    fn new_internal(
        ino: ino_t,
        ops: Box<dyn FsNodeOps>,
        fs: Weak<FileSystem>,
        info: FsNodeInfo,
    ) -> Self {
        // Allow the FsNodeOps to populate initial info.
        let info = {
            let mut info = info;
            ops.initial_info(&mut info);
            info
        };

        // The linter will fail in non test mode as it will not see the lock check.
        #[allow(clippy::let_and_return)]
        {
            let result = Self {
                ino,
                ops,
                fs,
                info: RwLock::new(info),
                append_lock: Default::default(),
                rare_data: Default::default(),
                write_guard_state: Default::default(),
                fsverity: Mutex::new(FsVerityState::None),
                security_state: Default::default(),
            };
            // Only compiled for tests and debug builds: take each lock once, in order.
            #[cfg(any(test, debug_assertions))]
            {
                #[allow(
                    clippy::undocumented_unsafe_blocks,
                    reason = "Force documented unsafe blocks in Starnix"
                )]
                let locked = unsafe { Unlocked::new() };
                let _l1 = result.append_lock.read_for_lock_ordering(locked);
                let _l2 = result.info.read();
                let _l3 = result.write_guard_state.lock();
                let _l4 = result.fsverity.lock();
                // TODO(https://fxbug.dev/367585803): Add lock levels to SELinux implementation.
                let _l5 = result.security_state.lock();
            }
            result
        }
    }
1265
1266    pub fn fs(&self) -> FileSystemHandle {
1267        self.fs.upgrade().expect("FileSystem did not live long enough")
1268    }
1269
1270    pub fn ops(&self) -> &dyn FsNodeOps {
1271        self.ops.as_ref()
1272    }
1273
1274    /// Returns an error if this node is encrypted and locked. Does not require
1275    /// fetch_and_refresh_info because FS_IOC_SET_ENCRYPTION_POLICY updates info and once a node is
1276    /// encrypted, it remains encrypted forever.
1277    pub fn fail_if_locked(&self, _current_task: &CurrentTask) -> Result<(), Errno> {
1278        let node_info = self.info();
1279        if let Some(wrapping_key_id) = node_info.wrapping_key_id {
1280            let crypt_service = self.fs().crypt_service().ok_or_else(|| errno!(ENOKEY))?;
1281            if !crypt_service.contains_key(EncryptionKeyId::from(wrapping_key_id)) {
1282                return error!(ENOKEY);
1283            }
1284        }
1285        Ok(())
1286    }
1287
1288    /// Returns the `FsNode`'s `FsNodeOps` as a `&T`, or `None` if the downcast fails.
1289    pub fn downcast_ops<T>(&self) -> Option<&T>
1290    where
1291        T: 'static,
1292    {
1293        self.ops().as_any().downcast_ref::<T>()
1294    }
1295
1296    pub fn on_file_closed(&self, file: &FileObjectState) {
1297        if let Some(rare_data) = self.rare_data.get() {
1298            let mut flock_info = rare_data.flock_info.lock();
1299            // This function will drop the flock from `file` because the `WeakFileHandle` for
1300            // `file` will no longer upgrade to an `FileHandle`.
1301            flock_info.retain(|_| true);
1302        }
1303        self.record_lock_release(RecordLockOwner::FileObject(file.id));
1304    }
1305
1306    pub fn record_lock(
1307        &self,
1308        locked: &mut Locked<Unlocked>,
1309        current_task: &CurrentTask,
1310        file: &FileObject,
1311        cmd: RecordLockCommand,
1312        flock: uapi::flock,
1313    ) -> Result<Option<uapi::flock>, Errno> {
1314        self.ensure_rare_data().record_locks.lock(locked, current_task, file, cmd, flock)
1315    }
1316
1317    /// Release all record locks acquired by the given owner.
1318    pub fn record_lock_release(&self, owner: RecordLockOwner) {
1319        if let Some(rare_data) = self.rare_data.get() {
1320            rare_data.record_locks.release_locks(owner);
1321        }
1322    }
1323
    /// Returns the [`DirEntryOps`] produced by this node's [`FsNodeOps`].
    pub fn create_dir_entry_ops(&self) -> Box<dyn DirEntryOps> {
        self.ops().create_dir_entry_ops()
    }
1327
1328    pub fn create_file_ops<L>(
1329        &self,
1330        locked: &mut Locked<L>,
1331        current_task: &CurrentTask,
1332        flags: OpenFlags,
1333    ) -> Result<Box<dyn FileOps>, Errno>
1334    where
1335        L: LockEqualOrBefore<FileOpsCore>,
1336    {
1337        let locked = locked.cast_locked::<FileOpsCore>();
1338        self.ops().create_file_ops(locked, self, current_task, flags)
1339    }
1340
1341    pub fn open(
1342        &self,
1343        locked: &mut Locked<Unlocked>,
1344        current_task: &CurrentTask,
1345        namespace_node: &NamespaceNode,
1346        flags: OpenFlags,
1347        access_check: AccessCheck,
1348    ) -> Result<Box<dyn FileOps>, Errno> {
1349        // If O_PATH is set, there is no need to create a real FileOps because
1350        // most file operations are disabled.
1351        if flags.contains(OpenFlags::PATH) {
1352            return Ok(Box::new(OPathOps::new()));
1353        }
1354
1355        let access = access_check.resolve(flags);
1356        if access.is_nontrivial() {
1357            if flags.contains(OpenFlags::NOATIME) {
1358                self.check_o_noatime_allowed(current_task)?;
1359            }
1360
1361            // `flags` doesn't contain any information about the EXEC permission. Instead the syscalls
1362            // used to execute a file (`sys_execve` and `sys_execveat`) call `open()` with the EXEC
1363            // permission request in `access`.
1364            let mut permission_flags = PermissionFlags::from(access);
1365
1366            // The `APPEND` flag exists only in `flags`, to modify the behaviour of
1367            // `PermissionFlags::WRITE`
1368            if flags.contains(OpenFlags::APPEND) {
1369                permission_flags |= security::PermissionFlags::APPEND;
1370            }
1371
1372            // TODO: https://fxbug.dev/455782510 - Remove this once non-open() checks are fully
1373            // enforced.
1374            permission_flags |= security::PermissionFlags::FOR_OPEN;
1375
1376            self.check_access(
1377                locked,
1378                current_task,
1379                &namespace_node.mount,
1380                permission_flags,
1381                CheckAccessReason::InternalPermissionChecks,
1382                namespace_node,
1383            )?;
1384        }
1385
1386        let (mode, rdev) = {
1387            // Don't hold the info lock while calling into open_device or self.ops().
1388            // TODO: The mode and rdev are immutable and shouldn't require a lock to read.
1389            let info = self.info();
1390            (info.mode, info.rdev)
1391        };
1392
1393        match mode & FileMode::IFMT {
1394            FileMode::IFCHR => {
1395                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1396                    return error!(EACCES);
1397                }
1398                current_task.kernel().open_device(
1399                    locked,
1400                    current_task,
1401                    namespace_node,
1402                    flags,
1403                    rdev,
1404                    DeviceMode::Char,
1405                )
1406            }
1407            FileMode::IFBLK => {
1408                if namespace_node.mount.flags().contains(MountFlags::NODEV) {
1409                    return error!(EACCES);
1410                }
1411                current_task.kernel().open_device(
1412                    locked,
1413                    current_task,
1414                    namespace_node,
1415                    flags,
1416                    rdev,
1417                    DeviceMode::Block,
1418                )
1419            }
1420            FileMode::IFIFO => Pipe::open(locked, current_task, self.fifo(current_task), flags),
1421            // UNIX domain sockets can't be opened.
1422            FileMode::IFSOCK => error!(ENXIO),
1423            _ => self.create_file_ops(locked, current_task, flags),
1424        }
1425    }
1426
1427    pub fn lookup<L>(
1428        &self,
1429        locked: &mut Locked<L>,
1430        current_task: &CurrentTask,
1431        mount: &MountInfo,
1432        name: &FsStr,
1433    ) -> Result<FsNodeHandle, Errno>
1434    where
1435        L: LockEqualOrBefore<FileOpsCore>,
1436    {
1437        self.check_access(
1438            locked,
1439            current_task,
1440            mount,
1441            Access::EXEC,
1442            CheckAccessReason::InternalPermissionChecks,
1443            &[Auditable::Name(name), std::panic::Location::caller().into()],
1444        )?;
1445        let locked = locked.cast_locked::<FileOpsCore>();
1446        self.ops().lookup(locked, self, current_task, name)
1447    }
1448
1449    pub fn create_node<L>(
1450        &self,
1451        locked: &mut Locked<L>,
1452        current_task: &CurrentTask,
1453        mount: &MountInfo,
1454        name: &FsStr,
1455        mut mode: FileMode,
1456        dev: DeviceType,
1457        mut owner: FsCred,
1458    ) -> Result<FsNodeHandle, Errno>
1459    where
1460        L: LockEqualOrBefore<FileOpsCore>,
1461    {
1462        assert!(mode & FileMode::IFMT != FileMode::EMPTY, "mknod called without node type.");
1463        self.check_access(
1464            locked,
1465            current_task,
1466            mount,
1467            Access::WRITE,
1468            CheckAccessReason::InternalPermissionChecks,
1469            security::Auditable::Name(name),
1470        )?;
1471        if mode.is_reg() {
1472            security::check_fs_node_create_access(current_task, self, mode, name)?;
1473        } else if mode.is_dir() {
1474            // Even though the man page for mknod(2) says that mknod "cannot be used to create
1475            // directories" in starnix the mkdir syscall (`sys_mkdirat`) ends up calling
1476            //create_node.
1477            security::check_fs_node_mkdir_access(current_task, self, mode, name)?;
1478        } else if !matches!(
1479            mode.fmt(),
1480            FileMode::IFCHR | FileMode::IFBLK | FileMode::IFIFO | FileMode::IFSOCK
1481        ) {
1482            security::check_fs_node_mknod_access(current_task, self, mode, name, dev)?;
1483        }
1484
1485        self.update_metadata_for_child(current_task, &mut mode, &mut owner);
1486
1487        let new_node = if mode.is_dir() {
1488            let locked = locked.cast_locked::<FileOpsCore>();
1489            self.ops().mkdir(locked, self, current_task, name, mode, owner)?
1490        } else {
1491            // https://man7.org/linux/man-pages/man2/mknod.2.html says on error EPERM:
1492            //
1493            //   mode requested creation of something other than a regular
1494            //   file, FIFO (named pipe), or UNIX domain socket, and the
1495            //   caller is not privileged (Linux: does not have the
1496            //   CAP_MKNOD capability); also returned if the filesystem
1497            //   containing pathname does not support the type of node
1498            //   requested.
1499            if !matches!(mode.fmt(), FileMode::IFREG | FileMode::IFIFO | FileMode::IFSOCK) {
1500                security::check_task_capable(current_task, CAP_MKNOD)?;
1501            }
1502            let locked = locked.cast_locked::<FileOpsCore>();
1503            self.ops().mknod(locked, self, current_task, name, mode, dev, owner)?
1504        };
1505
1506        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;
1507
1508        Ok(new_node)
1509    }
1510
1511    pub fn create_symlink<L>(
1512        &self,
1513        locked: &mut Locked<L>,
1514        current_task: &CurrentTask,
1515        mount: &MountInfo,
1516        name: &FsStr,
1517        target: &FsStr,
1518        owner: FsCred,
1519    ) -> Result<FsNodeHandle, Errno>
1520    where
1521        L: LockEqualOrBefore<FileOpsCore>,
1522    {
1523        self.check_access(
1524            locked,
1525            current_task,
1526            mount,
1527            Access::WRITE,
1528            CheckAccessReason::InternalPermissionChecks,
1529            security::Auditable::Name(name),
1530        )?;
1531        security::check_fs_node_symlink_access(current_task, self, name, target)?;
1532
1533        let locked = locked.cast_locked::<FileOpsCore>();
1534        let new_node =
1535            self.ops().create_symlink(locked, self, current_task, name, target, owner)?;
1536
1537        self.init_new_node_security_on_create(locked, current_task, &new_node, name)?;
1538
1539        Ok(new_node)
1540    }
1541
1542    /// Requests that the LSM initialise a security label for the `new_node`, and optionally provide
1543    /// an extended attribute to write to the file to persist it.  If no LSM is enabled, no extended
1544    /// attribute returned, or if the filesystem does not support extended attributes, then the call
1545    /// returns success. All other failure modes return an `Errno` that should be early-returned.
1546    fn init_new_node_security_on_create<L>(
1547        &self,
1548        locked: &mut Locked<L>,
1549        current_task: &CurrentTask,
1550        new_node: &FsNode,
1551        name: &FsStr,
1552    ) -> Result<(), Errno>
1553    where
1554        L: LockEqualOrBefore<FileOpsCore>,
1555    {
1556        let locked = locked.cast_locked::<FileOpsCore>();
1557        security::fs_node_init_on_create(current_task, &new_node, self, name)?
1558            .map(|xattr| {
1559                match new_node.ops().set_xattr(
1560                    locked,
1561                    &new_node,
1562                    current_task,
1563                    xattr.name,
1564                    xattr.value.as_slice().into(),
1565                    XattrOp::Create,
1566                ) {
1567                    Err(e) => {
1568                        if e.code == ENOTSUP {
1569                            // This should only occur if a task has an "fscreate" context set, and
1570                            // creates a new file in a filesystem that does not support xattrs.
1571                            Ok(())
1572                        } else {
1573                            Err(e)
1574                        }
1575                    }
1576                    result => result,
1577                }
1578            })
1579            .unwrap_or_else(|| Ok(()))
1580    }
1581
1582    pub fn create_tmpfile<L>(
1583        &self,
1584        locked: &mut Locked<L>,
1585        current_task: &CurrentTask,
1586        mount: &MountInfo,
1587        mut mode: FileMode,
1588        mut owner: FsCred,
1589        link_behavior: FsNodeLinkBehavior,
1590    ) -> Result<FsNodeHandle, Errno>
1591    where
1592        L: LockEqualOrBefore<FileOpsCore>,
1593    {
1594        self.check_access(
1595            locked,
1596            current_task,
1597            mount,
1598            Access::WRITE,
1599            CheckAccessReason::InternalPermissionChecks,
1600            security::Auditable::Location(std::panic::Location::caller()),
1601        )?;
1602        self.update_metadata_for_child(current_task, &mut mode, &mut owner);
1603        let node = self.ops().create_tmpfile(self, current_task, mode, owner)?;
1604        self.init_new_node_security_on_create(locked, current_task, &node, "".into())?;
1605        if link_behavior == FsNodeLinkBehavior::Disallowed {
1606            node.ensure_rare_data().link_behavior.set(link_behavior).unwrap();
1607        }
1608        Ok(node)
1609    }
1610
1611    // This method does not attempt to update the atime of the node.
1612    // Use `NamespaceNode::readlink` which checks the mount flags and updates the atime accordingly.
1613    pub fn readlink<L>(
1614        &self,
1615        locked: &mut Locked<L>,
1616        current_task: &CurrentTask,
1617    ) -> Result<SymlinkTarget, Errno>
1618    where
1619        L: LockEqualOrBefore<FileOpsCore>,
1620    {
1621        // TODO: 378864856 - Is there a permission check here other than security checks?
1622        security::check_fs_node_read_link_access(current_task, self)?;
1623        self.ops().readlink(locked.cast_locked::<FileOpsCore>(), self, current_task)
1624    }
1625
1626    pub fn link<L>(
1627        &self,
1628        locked: &mut Locked<L>,
1629        current_task: &CurrentTask,
1630        mount: &MountInfo,
1631        name: &FsStr,
1632        child: &FsNodeHandle,
1633    ) -> Result<FsNodeHandle, Errno>
1634    where
1635        L: LockEqualOrBefore<FileOpsCore>,
1636    {
1637        self.check_access(
1638            locked,
1639            current_task,
1640            mount,
1641            Access::WRITE,
1642            CheckAccessReason::InternalPermissionChecks,
1643            security::Auditable::Location(std::panic::Location::caller()),
1644        )?;
1645
1646        if child.is_dir() {
1647            return error!(EPERM);
1648        }
1649
1650        if let Some(child_rare_data) = child.rare_data.get() {
1651            if matches!(child_rare_data.link_behavior.get(), Some(FsNodeLinkBehavior::Disallowed)) {
1652                return error!(ENOENT);
1653            }
1654        }
1655
1656        // Check that `current_task` has permission to create the hard link.
1657        //
1658        // See description of /proc/sys/fs/protected_hardlinks in
1659        // https://man7.org/linux/man-pages/man5/proc.5.html for details of the security
1660        // vulnerabilities.
1661        //
1662        let fsuid = current_task.with_current_creds(|creds| creds.fsuid);
1663        let (child_uid, mode) = {
1664            let info = child.info();
1665            (info.uid, info.mode)
1666        };
1667        // Check that the the filesystem UID of the calling process (`current_task`) is the same as
1668        // the UID of the existing file. The check can be bypassed if the calling process has
1669        // `CAP_FOWNER` capability.
1670        if child_uid != fsuid && !security::is_task_capable_noaudit(current_task, CAP_FOWNER) {
1671            // If current_task is not the user of the existing file, it needs to have read and write
1672            // access to the existing file.
1673            child
1674                .check_access(
1675                    locked,
1676                    current_task,
1677                    mount,
1678                    Access::READ | Access::WRITE,
1679                    CheckAccessReason::InternalPermissionChecks,
1680                    security::Auditable::Name(name),
1681                )
1682                .map_err(|e| {
1683                    // `check_access(..)` returns EACCES when the access rights doesn't match - change
1684                    // it to EPERM to match Linux standards.
1685                    if e == EACCES { errno!(EPERM) } else { e }
1686                })?;
1687            // There are also security issues that may arise when users link to setuid, setgid, or
1688            // special files.
1689            if mode.contains(FileMode::ISGID | FileMode::IXGRP) {
1690                return error!(EPERM);
1691            };
1692            if mode.contains(FileMode::ISUID) {
1693                return error!(EPERM);
1694            };
1695            if !mode.contains(FileMode::IFREG) {
1696                return error!(EPERM);
1697            };
1698        }
1699
1700        security::check_fs_node_link_access(current_task, self, child)?;
1701
1702        let locked = locked.cast_locked::<FileOpsCore>();
1703        self.ops().link(locked, self, current_task, name, child)?;
1704        Ok(child.clone())
1705    }
1706
1707    pub fn unlink<L>(
1708        &self,
1709        locked: &mut Locked<L>,
1710        current_task: &CurrentTask,
1711        mount: &MountInfo,
1712        name: &FsStr,
1713        child: &FsNodeHandle,
1714    ) -> Result<(), Errno>
1715    where
1716        L: LockEqualOrBefore<FileOpsCore>,
1717    {
1718        // The user must be able to search and write to the directory.
1719        self.check_access(
1720            locked,
1721            current_task,
1722            mount,
1723            Access::EXEC | Access::WRITE,
1724            CheckAccessReason::InternalPermissionChecks,
1725            security::Auditable::Name(name),
1726        )?;
1727        self.check_sticky_bit(current_task, child)?;
1728        if child.is_dir() {
1729            security::check_fs_node_rmdir_access(current_task, self, child, name)?;
1730        } else {
1731            security::check_fs_node_unlink_access(current_task, self, child, name)?;
1732        }
1733        let locked = locked.cast_locked::<FileOpsCore>();
1734        self.ops().unlink(locked, self, current_task, name, child)?;
1735        self.update_ctime_mtime();
1736        Ok(())
1737    }
1738
1739    pub fn truncate<L>(
1740        &self,
1741        locked: &mut Locked<L>,
1742        current_task: &CurrentTask,
1743        mount: &MountInfo,
1744        length: u64,
1745    ) -> Result<(), Errno>
1746    where
1747        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1748    {
1749        self.truncate_with_strategy(locked, RealAppendLockStrategy {}, current_task, mount, length)
1750    }
1751
1752    pub fn truncate_with_strategy<L, M>(
1753        &self,
1754        locked: &mut Locked<L>,
1755        strategy: impl AppendLockStrategy<M>,
1756        current_task: &CurrentTask,
1757        mount: &MountInfo,
1758        length: u64,
1759    ) -> Result<(), Errno>
1760    where
1761        M: LockEqualOrBefore<FileOpsCore>,
1762        L: LockEqualOrBefore<M>,
1763    {
1764        if self.is_dir() {
1765            return error!(EISDIR);
1766        }
1767
1768        {
1769            let locked = locked.cast_locked::<M>();
1770            self.check_access(
1771                locked,
1772                current_task,
1773                mount,
1774                Access::WRITE,
1775                CheckAccessReason::InternalPermissionChecks,
1776                security::Auditable::Location(std::panic::Location::caller()),
1777            )?;
1778        }
1779
1780        self.truncate_common(locked, strategy, current_task, length)
1781    }
1782
1783    /// Avoid calling this method directly. You probably want to call `FileObject::ftruncate()`
1784    /// which will also perform all file-descriptor based verifications.
1785    pub fn ftruncate<L>(
1786        &self,
1787        locked: &mut Locked<L>,
1788        current_task: &CurrentTask,
1789        length: u64,
1790    ) -> Result<(), Errno>
1791    where
1792        L: LockEqualOrBefore<BeforeFsNodeAppend>,
1793    {
1794        if self.is_dir() {
1795            // When truncating a file descriptor, if the descriptor references a directory,
1796            // return EINVAL. This is different from the truncate() syscall which returns EISDIR.
1797            //
1798            // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#ERRORS
1799            return error!(EINVAL);
1800        }
1801
1802        // For ftruncate, we do not need to check that the file node is writable.
1803        //
1804        // The file object that calls this method must verify that the file was opened
1805        // with write permissions.
1806        //
1807        // This matters because a file could be opened with O_CREAT + O_RDWR + 0444 mode.
1808        // The file descriptor returned from such an operation can be truncated, even
1809        // though the file was created with a read-only mode.
1810        //
1811        // See https://man7.org/linux/man-pages/man2/ftruncate.2.html#DESCRIPTION
1812        // which says:
1813        //
1814        // "With ftruncate(), the file must be open for writing; with truncate(),
1815        // the file must be writable."
1816
1817        self.truncate_common(locked, RealAppendLockStrategy {}, current_task, length)
1818    }
1819
1820    // Called by `truncate` and `ftruncate` above.
1821    fn truncate_common<L, M>(
1822        &self,
1823        locked: &mut Locked<L>,
1824        strategy: impl AppendLockStrategy<M>,
1825        current_task: &CurrentTask,
1826        length: u64,
1827    ) -> Result<(), Errno>
1828    where
1829        M: LockEqualOrBefore<FileOpsCore>,
1830        L: LockEqualOrBefore<M>,
1831    {
1832        if length > MAX_LFS_FILESIZE as u64 {
1833            return error!(EINVAL);
1834        }
1835        {
1836            let locked = locked.cast_locked::<M>().cast_locked::<FileOpsCore>();
1837            if length > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1838                send_standard_signal(locked, current_task, SignalInfo::default(SIGXFSZ));
1839                return error!(EFBIG);
1840            }
1841        }
1842        let locked = locked.cast_locked::<M>();
1843        self.clear_suid_and_sgid_bits(locked, current_task)?;
1844        // We have to take the append lock since otherwise it would be possible to truncate and for
1845        // an append to continue using the old size.
1846        let (guard, locked) = strategy.lock(locked, current_task, self)?;
1847        self.ops().truncate(locked, &guard, self, current_task, length)?;
1848        self.update_ctime_mtime();
1849        Ok(())
1850    }
1851
1852    /// Avoid calling this method directly. You probably want to call `FileObject::fallocate()`
1853    /// which will also perform additional verifications.
1854    pub fn fallocate<L>(
1855        &self,
1856        locked: &mut Locked<L>,
1857        current_task: &CurrentTask,
1858        mode: FallocMode,
1859        offset: u64,
1860        length: u64,
1861    ) -> Result<(), Errno>
1862    where
1863        L: LockBefore<BeforeFsNodeAppend>,
1864    {
1865        self.fallocate_with_strategy(
1866            locked,
1867            RealAppendLockStrategy {},
1868            current_task,
1869            mode,
1870            offset,
1871            length,
1872        )
1873    }
1874
1875    pub fn fallocate_with_strategy<L, M>(
1876        &self,
1877        locked: &mut Locked<L>,
1878        strategy: impl AppendLockStrategy<M>,
1879        current_task: &CurrentTask,
1880        mode: FallocMode,
1881        offset: u64,
1882        length: u64,
1883    ) -> Result<(), Errno>
1884    where
1885        M: LockEqualOrBefore<FileOpsCore>,
1886        L: LockEqualOrBefore<M>,
1887    {
1888        let allocate_size = checked_add_offset_and_length(offset as usize, length as usize)
1889            .map_err(|_| errno!(EFBIG))? as u64;
1890        {
1891            let locked = locked.cast_locked::<M>().cast_locked::<FileOpsCore>();
1892            if allocate_size > current_task.thread_group().get_rlimit(locked, Resource::FSIZE) {
1893                send_standard_signal(locked, current_task, SignalInfo::default(SIGXFSZ));
1894                return error!(EFBIG);
1895            }
1896        }
1897
1898        let locked = locked.cast_locked::<M>();
1899        self.clear_suid_and_sgid_bits(locked, current_task)?;
1900        let (guard, locked) = strategy.lock(locked, current_task, self)?;
1901        self.ops().allocate(locked, &guard, self, current_task, mode, offset, length)?;
1902        self.update_ctime_mtime();
1903        Ok(())
1904    }
1905
1906    fn update_metadata_for_child(
1907        &self,
1908        current_task: &CurrentTask,
1909        mode: &mut FileMode,
1910        owner: &mut FsCred,
1911    ) {
1912        // The setgid bit on a directory causes the gid to be inherited by new children and the
1913        // setgid bit to be inherited by new child directories. See SetgidDirTest in gvisor.
1914        {
1915            let self_info = self.info();
1916            if self_info.mode.contains(FileMode::ISGID) {
1917                owner.gid = self_info.gid;
1918                if mode.is_dir() {
1919                    *mode |= FileMode::ISGID;
1920                }
1921            }
1922        }
1923
1924        if !mode.is_dir() {
1925            // https://man7.org/linux/man-pages/man7/inode.7.html says:
1926            //
1927            //   For an executable file, the set-group-ID bit causes the
1928            //   effective group ID of a process that executes the file to change
1929            //   as described in execve(2).
1930            //
1931            // We need to check whether the current task has permission to create such a file.
1932            // See a similar check in `FsNode::chmod`.
1933            let (fsgid, is_in_group) = current_task
1934                .with_current_creds(|creds| (creds.fsgid, creds.is_in_group(owner.gid)));
1935            if owner.gid != fsgid
1936                && !is_in_group
1937                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
1938            {
1939                *mode &= !FileMode::ISGID;
1940            }
1941        }
1942    }
1943
1944    /// Checks if O_NOATIME is allowed,
1945    pub fn check_o_noatime_allowed(&self, current_task: &CurrentTask) -> Result<(), Errno> {
1946        let fsuid = current_task.with_current_creds(|creds| creds.fsuid);
1947
1948        // Per open(2),
1949        //
1950        //   O_NOATIME (since Linux 2.6.8)
1951        //      ...
1952        //
1953        //      This flag can be employed only if one of the following
1954        //      conditions is true:
1955        //
1956        //      *  The effective UID of the process matches the owner UID
1957        //         of the file.
1958        //
1959        //      *  The calling process has the CAP_FOWNER capability in
1960        //         its user namespace and the owner UID of the file has a
1961        //         mapping in the namespace.
1962        if fsuid != self.info().uid {
1963            security::check_task_capable(current_task, CAP_FOWNER)?;
1964        }
1965        Ok(())
1966    }
1967
1968    pub fn default_check_access_impl(
1969        &self,
1970        current_task: &CurrentTask,
1971        permission_flags: security::PermissionFlags,
1972        reason: CheckAccessReason,
1973        info: RwLockReadGuard<'_, FsNodeInfo>,
1974        audit_context: Auditable<'_>,
1975    ) -> Result<(), Errno> {
1976        let (node_uid, node_gid, mode) = (info.uid, info.gid, info.mode);
1977        std::mem::drop(info);
1978        if let CheckAccessReason::ChangeTimestamps { now } = reason {
1979            // To set the timestamps to the current time the caller must either have write access to
1980            // the file, be the file owner, or hold the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
1981            // To set the timestamps to other values the caller must either be the file owner or hold
1982            // the CAP_FOWNER capability.
1983            let fsuid = current_task.with_current_creds(|creds| creds.fsuid);
1984            if fsuid == node_uid {
1985                return Ok(());
1986            }
1987            if now {
1988                if security::is_task_capable_noaudit(current_task, CAP_FOWNER) {
1989                    return Ok(());
1990                }
1991            } else {
1992                security::check_task_capable(current_task, CAP_FOWNER)?;
1993                return Ok(());
1994            }
1995        }
1996        check_access(self, current_task, permission_flags, node_uid, node_gid, mode)?;
1997        security::fs_node_permission(current_task, self, permission_flags, audit_context)
1998    }
1999
2000    /// Check whether the node can be accessed in the current context with the specified access
2001    /// flags (read, write, or exec). Accounts for capabilities and whether the current user is the
2002    /// owner or is in the file's group.
2003    pub fn check_access<'a, L>(
2004        &self,
2005        locked: &mut Locked<L>,
2006        current_task: &CurrentTask,
2007        mount: &MountInfo,
2008        access: impl Into<security::PermissionFlags>,
2009        reason: CheckAccessReason,
2010        audit_context: impl Into<security::Auditable<'a>>,
2011    ) -> Result<(), Errno>
2012    where
2013        L: LockEqualOrBefore<FileOpsCore>,
2014    {
2015        let mut permission_flags = access.into();
2016        if permission_flags.contains(security::PermissionFlags::WRITE) {
2017            mount.check_readonly_filesystem()?;
2018        }
2019        if permission_flags.contains(security::PermissionFlags::EXEC) && !self.is_dir() {
2020            mount.check_noexec_filesystem()?;
2021        }
2022        if reason == CheckAccessReason::Access {
2023            permission_flags |= PermissionFlags::ACCESS;
2024        }
2025        self.ops().check_access(
2026            locked.cast_locked::<FileOpsCore>(),
2027            self,
2028            current_task,
2029            permission_flags,
2030            &self.info,
2031            reason,
2032            audit_context.into(),
2033        )
2034    }
2035
2036    /// Check whether the stick bit, `S_ISVTX`, forbids the `current_task` from removing the given
2037    /// `child`. If this node has `S_ISVTX`, then either the child must be owned by the `fsuid` of
2038    /// `current_task` or `current_task` must have `CAP_FOWNER`.
2039    pub fn check_sticky_bit(
2040        &self,
2041        current_task: &CurrentTask,
2042        child: &FsNodeHandle,
2043    ) -> Result<(), Errno> {
2044        let fsuid = current_task.with_current_creds(|creds| creds.fsuid);
2045        if self.info().mode.contains(FileMode::ISVTX) && child.info().uid != fsuid {
2046            security::check_task_capable(current_task, CAP_FOWNER)?;
2047        }
2048        Ok(())
2049    }
2050
2051    pub fn fifo(&self, current_task: &CurrentTask) -> &PipeHandle {
2052        assert!(self.is_fifo());
2053        self.ensure_rare_data().ensure_fifo(current_task)
2054    }
2055
2056    /// Returns the UNIX domain socket bound to this node, if any.
2057    pub fn bound_socket(&self) -> Option<&SocketHandle> {
2058        if let Some(rare_data) = self.rare_data.get() { rare_data.bound_socket.get() } else { None }
2059    }
2060
2061    /// Register the provided socket as the UNIX domain socket bound to this node.
2062    ///
2063    /// It is a fatal error to call this method again if it has already been called on this node.
2064    pub fn set_bound_socket(&self, socket: SocketHandle) {
2065        assert!(self.ensure_rare_data().bound_socket.set(socket).is_ok());
2066    }
2067
2068    pub fn update_attributes<L, F>(
2069        &self,
2070        locked: &mut Locked<L>,
2071        current_task: &CurrentTask,
2072        mutator: F,
2073    ) -> Result<(), Errno>
2074    where
2075        L: LockEqualOrBefore<FileOpsCore>,
2076        F: FnOnce(&mut FsNodeInfo) -> Result<(), Errno>,
2077    {
2078        let mut info = self.info.write();
2079        let mut new_info = info.clone();
2080        mutator(&mut new_info)?;
2081
2082        let new_access = new_info.mode.user_access()
2083            | new_info.mode.group_access()
2084            | new_info.mode.other_access();
2085
2086        if new_access.intersects(Access::EXEC) {
2087            let write_guard_state = self.write_guard_state.lock();
2088            if let Ok(seals) = write_guard_state.get_seals() {
2089                if seals.contains(SealFlags::NO_EXEC) {
2090                    return error!(EPERM);
2091                }
2092            }
2093        }
2094
2095        // `mutator`s should not update the attribute change time, which is managed by this API.
2096        assert_eq!(info.time_status_change, new_info.time_status_change);
2097        if *info == new_info {
2098            return Ok(());
2099        }
2100        new_info.time_status_change = utc::utc_now();
2101
2102        let mut has = zxio_node_attr_has_t { ..Default::default() };
2103        has.modification_time = info.time_modify != new_info.time_modify;
2104        has.access_time = info.time_access != new_info.time_access;
2105        has.mode = info.mode != new_info.mode;
2106        has.uid = info.uid != new_info.uid;
2107        has.gid = info.gid != new_info.gid;
2108        has.rdev = info.rdev != new_info.rdev;
2109        has.casefold = info.casefold != new_info.casefold;
2110        has.wrapping_key_id = info.wrapping_key_id != new_info.wrapping_key_id;
2111
2112        security::check_fs_node_setattr_access(current_task, &self, &has)?;
2113
2114        // Call `update_attributes(..)` to persist the changes for the following fields.
2115        if has.modification_time
2116            || has.access_time
2117            || has.mode
2118            || has.uid
2119            || has.gid
2120            || has.rdev
2121            || has.casefold
2122            || has.wrapping_key_id
2123        {
2124            let locked = locked.cast_locked::<FileOpsCore>();
2125            self.ops().update_attributes(locked, current_task, &new_info, has)?;
2126        }
2127
2128        *info = new_info;
2129        Ok(())
2130    }
2131
2132    /// Set the permissions on this FsNode to the given values.
2133    ///
2134    /// Does not change the IFMT of the node.
2135    pub fn chmod<L>(
2136        &self,
2137        locked: &mut Locked<L>,
2138        current_task: &CurrentTask,
2139        mount: &MountInfo,
2140        mut mode: FileMode,
2141    ) -> Result<(), Errno>
2142    where
2143        L: LockEqualOrBefore<FileOpsCore>,
2144    {
2145        mount.check_readonly_filesystem()?;
2146        self.update_attributes(locked, current_task, |info| {
2147            let (euid, egid, in_group) = current_task
2148                .with_current_creds(|creds| (creds.euid, creds.egid, creds.is_in_group(info.gid)));
2149            if info.uid != euid {
2150                security::check_task_capable(current_task, CAP_FOWNER)?;
2151            } else if info.gid != egid
2152                && !in_group
2153                && mode.intersects(FileMode::ISGID)
2154                && !security::is_task_capable_noaudit(current_task, CAP_FOWNER)
2155            {
2156                mode &= !FileMode::ISGID;
2157            }
2158            info.chmod(mode);
2159            Ok(())
2160        })
2161    }
2162
2163    /// Sets the owner and/or group on this FsNode.
2164    pub fn chown<L>(
2165        &self,
2166        locked: &mut Locked<L>,
2167        current_task: &CurrentTask,
2168        mount: &MountInfo,
2169        owner: Option<uid_t>,
2170        group: Option<gid_t>,
2171    ) -> Result<(), Errno>
2172    where
2173        L: LockEqualOrBefore<FileOpsCore>,
2174    {
2175        mount.check_readonly_filesystem()?;
2176        self.update_attributes(locked, current_task, |info| {
2177            if security::is_task_capable_noaudit(current_task, CAP_CHOWN) {
2178                info.chown(owner, group);
2179                return Ok(());
2180            }
2181
2182            // Nobody can change the owner.
2183            if let Some(uid) = owner {
2184                if info.uid != uid {
2185                    return error!(EPERM);
2186                }
2187            }
2188
2189            let (euid, is_in_group) = current_task
2190                .with_current_creds(|creds| (creds.euid, group.map(|gid| creds.is_in_group(gid))));
2191
2192            // The owner can change the group.
2193            if info.uid == euid {
2194                // To a group that it belongs.
2195                if let Some(is_in_group) = is_in_group {
2196                    if !is_in_group {
2197                        return error!(EPERM);
2198                    }
2199                }
2200                info.chown(None, group);
2201                return Ok(());
2202            }
2203
2204            // Any other user can call chown(file, -1, -1)
2205            if owner.is_some() || group.is_some() {
2206                return error!(EPERM);
2207            }
2208
2209            // But not on set-user-ID or set-group-ID files.
2210            // If we were to chown them, they would drop the set-ID bit.
2211            if info.mode.is_reg()
2212                && (info.mode.contains(FileMode::ISUID)
2213                    || info.mode.contains(FileMode::ISGID | FileMode::IXGRP))
2214            {
2215                return error!(EPERM);
2216            }
2217
2218            info.chown(None, None);
2219            Ok(())
2220        })
2221    }
2222
2223    /// Forcefully change the owner and group of this node.
2224    ///
2225    /// # Safety
2226    ///
2227    /// This function skips all the security checks and just updates the owner and group. Also, does
2228    /// not check if the filesystem is read-only and does not update the attribute change time.
2229    ///
2230    /// This function is used to set the owner and group of /proc/pid to the credentials of the
2231    /// current task. Please consider carefully whether you want to use this function for another
2232    /// purpose.
2233    pub unsafe fn force_chown(&self, creds: FsCred) {
2234        self.update_info(|info| {
2235            info.chown(Some(creds.uid), Some(creds.gid));
2236        });
2237    }
2238
2239    /// Whether this node is a regular file.
2240    pub fn is_reg(&self) -> bool {
2241        self.info().mode.is_reg()
2242    }
2243
2244    /// Whether this node is a directory.
2245    pub fn is_dir(&self) -> bool {
2246        self.info().mode.is_dir()
2247    }
2248
2249    /// Whether this node is a socket.
2250    pub fn is_sock(&self) -> bool {
2251        self.info().mode.is_sock()
2252    }
2253
2254    /// Whether this node is a FIFO.
2255    pub fn is_fifo(&self) -> bool {
2256        self.info().mode.is_fifo()
2257    }
2258
2259    /// Whether this node is a symbolic link.
2260    pub fn is_lnk(&self) -> bool {
2261        self.info().mode.is_lnk()
2262    }
2263
2264    pub fn dev(&self) -> DeviceType {
2265        self.fs().dev_id
2266    }
2267
2268    pub fn stat<L>(
2269        &self,
2270        locked: &mut Locked<L>,
2271        current_task: &CurrentTask,
2272    ) -> Result<uapi::stat, Errno>
2273    where
2274        L: LockEqualOrBefore<FileOpsCore>,
2275    {
2276        security::check_fs_node_getattr_access(current_task, self)?;
2277
2278        let info = self.fetch_and_refresh_info(locked, current_task)?;
2279
2280        let time_to_kernel_timespec_pair = |t| {
2281            let timespec { tv_sec, tv_nsec } = timespec_from_time(t);
2282            let time = tv_sec.try_into().map_err(|_| errno!(EINVAL))?;
2283            let time_nsec = tv_nsec.try_into().map_err(|_| errno!(EINVAL))?;
2284            Ok((time, time_nsec))
2285        };
2286
2287        let (st_atime, st_atime_nsec) = time_to_kernel_timespec_pair(info.time_access)?;
2288        let (st_mtime, st_mtime_nsec) = time_to_kernel_timespec_pair(info.time_modify)?;
2289        let (st_ctime, st_ctime_nsec) = time_to_kernel_timespec_pair(info.time_status_change)?;
2290
2291        Ok(uapi::stat {
2292            st_dev: self.dev().bits(),
2293            st_ino: self.ino,
2294            st_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2295            st_mode: info.mode.bits(),
2296            st_uid: info.uid,
2297            st_gid: info.gid,
2298            st_rdev: info.rdev.bits(),
2299            st_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2300            st_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2301            st_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2302            st_atime,
2303            st_atime_nsec,
2304            st_mtime,
2305            st_mtime_nsec,
2306            st_ctime,
2307            st_ctime_nsec,
2308            ..Default::default()
2309        })
2310    }
2311
2312    // TODO(https://fxbug.dev/454730248): This is probably the wrong way to implement O_APPEND.
2313    pub fn get_size<L>(
2314        &self,
2315        locked: &mut Locked<L>,
2316        current_task: &CurrentTask,
2317    ) -> Result<usize, Errno>
2318    where
2319        L: LockEqualOrBefore<FileOpsCore>,
2320    {
2321        let info = self.fetch_and_refresh_info(locked, current_task)?;
2322        Ok(info.size.try_into().map_err(|_| errno!(EINVAL))?)
2323    }
2324
2325    fn statx_timestamp_from_time(time: UtcInstant) -> statx_timestamp {
2326        let nanos = time.into_nanos();
2327        statx_timestamp {
2328            tv_sec: nanos / NANOS_PER_SECOND,
2329            tv_nsec: (nanos % NANOS_PER_SECOND) as u32,
2330            ..Default::default()
2331        }
2332    }
2333
2334    pub fn statx<L>(
2335        &self,
2336        locked: &mut Locked<L>,
2337        current_task: &CurrentTask,
2338        flags: StatxFlags,
2339        mask: u32,
2340    ) -> Result<statx, Errno>
2341    where
2342        L: LockEqualOrBefore<FileOpsCore>,
2343    {
2344        security::check_fs_node_getattr_access(current_task, self)?;
2345
2346        // Ignore mask for now and fill in all of the fields.
2347        let info = if flags.contains(StatxFlags::AT_STATX_DONT_SYNC) {
2348            self.info()
2349        } else {
2350            self.fetch_and_refresh_info(locked, current_task)?
2351        };
2352        if mask & STATX__RESERVED == STATX__RESERVED {
2353            return error!(EINVAL);
2354        }
2355
2356        track_stub!(TODO("https://fxbug.dev/302594110"), "statx attributes");
2357        let stx_mnt_id = 0;
2358        let mut stx_attributes = 0;
2359        let stx_attributes_mask = STATX_ATTR_VERITY as u64;
2360
2361        if matches!(*self.fsverity.lock(), FsVerityState::FsVerity) {
2362            stx_attributes |= STATX_ATTR_VERITY as u64;
2363        }
2364
2365        Ok(statx {
2366            stx_mask: STATX_NLINK
2367                | STATX_UID
2368                | STATX_GID
2369                | STATX_ATIME
2370                | STATX_MTIME
2371                | STATX_CTIME
2372                | STATX_INO
2373                | STATX_SIZE
2374                | STATX_BLOCKS
2375                | STATX_BASIC_STATS,
2376            stx_blksize: info.blksize.try_into().map_err(|_| errno!(EINVAL))?,
2377            stx_attributes,
2378            stx_nlink: info.link_count.try_into().map_err(|_| errno!(EINVAL))?,
2379            stx_uid: info.uid,
2380            stx_gid: info.gid,
2381            stx_mode: info.mode.bits().try_into().map_err(|_| errno!(EINVAL))?,
2382            stx_ino: self.ino,
2383            stx_size: info.size.try_into().map_err(|_| errno!(EINVAL))?,
2384            stx_blocks: info.blocks.try_into().map_err(|_| errno!(EINVAL))?,
2385            stx_attributes_mask,
2386            stx_ctime: Self::statx_timestamp_from_time(info.time_status_change),
2387            stx_mtime: Self::statx_timestamp_from_time(info.time_modify),
2388            stx_atime: Self::statx_timestamp_from_time(info.time_access),
2389
2390            stx_rdev_major: info.rdev.major(),
2391            stx_rdev_minor: info.rdev.minor(),
2392
2393            stx_dev_major: self.fs().dev_id.major(),
2394            stx_dev_minor: self.fs().dev_id.minor(),
2395            stx_mnt_id,
2396            ..Default::default()
2397        })
2398    }
2399
2400    /// Checks whether `current_task` has capabilities required for the specified `access` to the
2401    /// extended attribute `name`.
2402    fn check_xattr_access<L>(
2403        &self,
2404        locked: &mut Locked<L>,
2405        current_task: &CurrentTask,
2406        mount: &MountInfo,
2407        name: &FsStr,
2408        access: Access,
2409    ) -> Result<(), Errno>
2410    where
2411        L: LockEqualOrBefore<FileOpsCore>,
2412    {
2413        assert!(access == Access::READ || access == Access::WRITE);
2414
2415        let enodata_if_read =
2416            |e: Errno| if access == Access::READ && e.code == EPERM { errno!(ENODATA) } else { e };
2417
2418        // man xattr(7) describes the different access checks applied to each extended attribute
2419        // namespace.
2420        if name.starts_with(XATTR_USER_PREFIX.to_bytes()) {
2421            {
2422                let info = self.info();
2423                if !info.mode.is_reg() && !info.mode.is_dir() {
2424                    return Err(enodata_if_read(errno!(EPERM)));
2425                }
2426            }
2427
2428            // TODO: https://fxbug.dev/460734830 - Perform capability check(s) if file has sticky
2429            // bit set.
2430
2431            self.check_access(
2432                locked,
2433                current_task,
2434                mount,
2435                access,
2436                CheckAccessReason::InternalPermissionChecks,
2437                security::Auditable::Name(name),
2438            )?;
2439        } else if name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()) {
2440            // Trusted extended attributes require `CAP_SYS_ADMIN` to read or write.
2441            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2442        } else if name.starts_with(XATTR_SYSTEM_PREFIX.to_bytes()) {
2443            // System extended attributes have attribute-specific access policy.
2444            // TODO: https://fxbug.dev/460734830 -  Revise how system extended attributes are
2445            // access-controlled.
2446            security::check_task_capable(current_task, CAP_SYS_ADMIN).map_err(enodata_if_read)?;
2447        } else if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2448            if access == Access::WRITE {
2449                // Writes require `CAP_SYS_ADMIN`, unless the LSM owning `name` specifies to skip.
2450                if !security::fs_node_xattr_skipcap(current_task, name) {
2451                    security::check_task_capable(current_task, CAP_SYS_ADMIN)
2452                        .map_err(enodata_if_read)?;
2453                }
2454            }
2455        } else {
2456            panic!("Unknown extended attribute prefix: {}", name);
2457        }
2458        Ok(())
2459    }
2460
2461    pub fn get_xattr<L>(
2462        &self,
2463        locked: &mut Locked<L>,
2464        current_task: &CurrentTask,
2465        mount: &MountInfo,
2466        name: &FsStr,
2467        max_size: usize,
2468    ) -> Result<ValueOrSize<FsString>, Errno>
2469    where
2470        L: LockEqualOrBefore<FileOpsCore>,
2471    {
2472        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2473        self.check_xattr_access(locked, current_task, mount, name, Access::READ)?;
2474
2475        // LSM access checks must be performed after discretionary checks.
2476        security::check_fs_node_getxattr_access(current_task, self, name)?;
2477
2478        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2479            // If the attribute is in the security.* domain then allow the LSM to handle the
2480            // request, or to delegate to `FsNodeOps::get_xattr()`.
2481            security::fs_node_getsecurity(locked, current_task, self, name, max_size)
2482        } else {
2483            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2484            self.ops().get_xattr(
2485                locked.cast_locked::<FileOpsCore>(),
2486                self,
2487                current_task,
2488                name,
2489                max_size,
2490            )
2491        }
2492    }
2493
2494    pub fn set_xattr<L>(
2495        &self,
2496        locked: &mut Locked<L>,
2497        current_task: &CurrentTask,
2498        mount: &MountInfo,
2499        name: &FsStr,
2500        value: &FsStr,
2501        op: XattrOp,
2502    ) -> Result<(), Errno>
2503    where
2504        L: LockEqualOrBefore<FileOpsCore>,
2505    {
2506        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2507        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2508
2509        // LSM access checks must be performed after discretionary checks.
2510        security::check_fs_node_setxattr_access(current_task, self, name, value, op)?;
2511
2512        if name.starts_with(XATTR_SECURITY_PREFIX.to_bytes()) {
2513            // If the attribute is in the security.* domain then allow the LSM to handle the
2514            // request, or to delegate to `FsNodeOps::set_xattr()`.
2515            security::fs_node_setsecurity(locked, current_task, self, name, value, op)
2516        } else {
2517            // If the attribute is outside security.*, delegate the read to the `FsNodeOps`.
2518            self.ops().set_xattr(
2519                locked.cast_locked::<FileOpsCore>(),
2520                self,
2521                current_task,
2522                name,
2523                value,
2524                op,
2525            )
2526        }
2527    }
2528
2529    pub fn remove_xattr<L>(
2530        &self,
2531        locked: &mut Locked<L>,
2532        current_task: &CurrentTask,
2533        mount: &MountInfo,
2534        name: &FsStr,
2535    ) -> Result<(), Errno>
2536    where
2537        L: LockEqualOrBefore<FileOpsCore>,
2538    {
2539        // Perform discretionary capability & access checks appropriate to the xattr prefix.
2540        self.check_xattr_access(locked, current_task, mount, name, Access::WRITE)?;
2541
2542        // LSM access checks must be performed after discretionary checks.
2543        security::check_fs_node_removexattr_access(current_task, self, name)?;
2544        self.ops().remove_xattr(locked.cast_locked::<FileOpsCore>(), self, current_task, name)
2545    }
2546
2547    pub fn list_xattrs<L>(
2548        &self,
2549        locked: &mut Locked<L>,
2550        current_task: &CurrentTask,
2551        max_size: usize,
2552    ) -> Result<ValueOrSize<Vec<FsString>>, Errno>
2553    where
2554        L: LockEqualOrBefore<FileOpsCore>,
2555    {
2556        security::check_fs_node_listxattr_access(current_task, self)?;
2557        Ok(self
2558            .ops()
2559            .list_xattrs(locked.cast_locked::<FileOpsCore>(), self, current_task, max_size)?
2560            .map(|mut v| {
2561                // Extended attributes may be listed even if the caller would not be able to read
2562                // (or modify) the attribute's value.
2563                // trusted.* attributes are only accessible with CAP_SYS_ADMIN and are omitted by
2564                // `listxattr()` unless the caller holds CAP_SYS_ADMIN.
2565                if !security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN) {
2566                    v.retain(|name| !name.starts_with(XATTR_TRUSTED_PREFIX.to_bytes()));
2567                }
2568                v
2569            }))
2570    }
2571
    /// Returns a read guard over this node's current `FsNodeInfo`.
    ///
    /// This only takes the read side of the `info` lock; nothing is refreshed through the
    /// `FsNodeOps`. Use `fetch_and_refresh_info()` when the info may need refreshing first.
    pub fn info(&self) -> RwLockReadGuard<'_, FsNodeInfo> {
        self.info.read()
    }
2576
2577    /// Refreshes the `FsNodeInfo` if necessary and returns a read guard.
2578    pub fn fetch_and_refresh_info<L>(
2579        &self,
2580        locked: &mut Locked<L>,
2581        current_task: &CurrentTask,
2582    ) -> Result<RwLockReadGuard<'_, FsNodeInfo>, Errno>
2583    where
2584        L: LockEqualOrBefore<FileOpsCore>,
2585    {
2586        self.ops().fetch_and_refresh_info(
2587            locked.cast_locked::<FileOpsCore>(),
2588            self,
2589            current_task,
2590            &self.info,
2591        )
2592    }
2593
2594    pub fn update_info<F, T>(&self, mutator: F) -> T
2595    where
2596        F: FnOnce(&mut FsNodeInfo) -> T,
2597    {
2598        let mut info = self.info.write();
2599        mutator(&mut info)
2600    }
2601
2602    /// Clear the SUID and SGID bits unless the `current_task` has `CAP_FSETID`
2603    pub fn clear_suid_and_sgid_bits<L>(
2604        &self,
2605        locked: &mut Locked<L>,
2606        current_task: &CurrentTask,
2607    ) -> Result<(), Errno>
2608    where
2609        L: LockEqualOrBefore<FileOpsCore>,
2610    {
2611        if !security::is_task_capable_noaudit(current_task, CAP_FSETID) {
2612            self.update_attributes(locked, current_task, |info| {
2613                info.clear_suid_and_sgid_bits();
2614                Ok(())
2615            })?;
2616        }
2617        Ok(())
2618    }
2619
2620    /// Update the ctime and mtime of a file to now.
2621    pub fn update_ctime_mtime(&self) {
2622        if self.fs().manages_timestamps() {
2623            return;
2624        }
2625        self.update_info(|info| {
2626            let now = utc::utc_now();
2627            info.time_status_change = now;
2628            info.time_modify = now;
2629        });
2630    }
2631
2632    /// Update the ctime of a file to now.
2633    pub fn update_ctime(&self) {
2634        if self.fs().manages_timestamps() {
2635            return;
2636        }
2637        self.update_info(|info| {
2638            let now = utc::utc_now();
2639            info.time_status_change = now;
2640        });
2641    }
2642
2643    /// Update the atime and mtime if the `current_task` has write access, is the file owner, or
2644    /// holds either the CAP_DAC_OVERRIDE or CAP_FOWNER capability.
2645    pub fn update_atime_mtime<L>(
2646        &self,
2647        locked: &mut Locked<L>,
2648        current_task: &CurrentTask,
2649        mount: &MountInfo,
2650        atime: TimeUpdateType,
2651        mtime: TimeUpdateType,
2652    ) -> Result<(), Errno>
2653    where
2654        L: LockEqualOrBefore<FileOpsCore>,
2655    {
2656        // If the filesystem is read-only, this always fail.
2657        mount.check_readonly_filesystem()?;
2658
2659        let now = matches!((atime, mtime), (TimeUpdateType::Now, TimeUpdateType::Now));
2660        self.check_access(
2661            locked,
2662            current_task,
2663            mount,
2664            Access::WRITE,
2665            CheckAccessReason::ChangeTimestamps { now },
2666            security::Auditable::Location(std::panic::Location::caller()),
2667        )?;
2668
2669        if !matches!((atime, mtime), (TimeUpdateType::Omit, TimeUpdateType::Omit)) {
2670            // This function is called by `utimes(..)` which will update the access and
2671            // modification time. We need to call `update_attributes()` to update the mtime of
2672            // filesystems that manages file timestamps.
2673            self.update_attributes(locked, current_task, |info| {
2674                let now = utc::utc_now();
2675                let get_time = |time: TimeUpdateType| match time {
2676                    TimeUpdateType::Now => Some(now),
2677                    TimeUpdateType::Time(t) => Some(t),
2678                    TimeUpdateType::Omit => None,
2679                };
2680                if let Some(time) = get_time(atime) {
2681                    info.time_access = time;
2682                }
2683                if let Some(time) = get_time(mtime) {
2684                    info.time_modify = time;
2685                }
2686                Ok(())
2687            })?;
2688        }
2689        Ok(())
2690    }
2691
2692    /// Returns a string describing this `FsNode` in the format used by "/proc/../fd" for anonymous
2693    /// file descriptors. By default this is in the form:
2694    ///   <class>:[<node_id>]
2695    /// though `FsNodeOps` may customize this as required.
2696    pub fn internal_name(&self) -> FsString {
2697        if let Some(name) = self.ops().internal_name(self) {
2698            return name;
2699        };
2700        let class = if self.is_sock() {
2701            "socket"
2702        } else if self.is_fifo() {
2703            "pipe"
2704        } else {
2705            "file"
2706        };
2707        format!("{}:[{}]", class, self.ino).into()
2708    }
2709
    /// The key used to identify this node in the file system's node cache.
    ///
    /// For many file systems, this will be the same as the inode number. However, some file
    /// systems, such as FUSE, sometimes use different `node_key` and inode numbers.
    ///
    /// The value is obtained from this node's `FsNodeOps::node_key()`.
    pub fn node_key(&self) -> ino_t {
        self.ops().node_key(self)
    }
2717
    /// Returns this node's `FsNodeRareData`, creating a default-initialized instance the first
    /// time it is requested.
    fn ensure_rare_data(&self) -> &FsNodeRareData {
        self.rare_data.get_or_init(|| Box::new(FsNodeRareData::default()))
    }
2721
2722    /// Returns the set of watchers for this node.
2723    ///
2724    /// Only call this function if you require this node to actually store a list of watchers. If
2725    /// you just wish to notify any watchers that might exist, please use `notify` instead.
2726    pub fn ensure_watchers(&self) -> &inotify::InotifyWatchers {
2727        &self.ensure_rare_data().watchers
2728    }
2729
2730    /// Notify the watchers of the given event.
2731    pub fn notify(
2732        &self,
2733        event_mask: InotifyMask,
2734        cookie: u32,
2735        name: &FsStr,
2736        mode: FileMode,
2737        is_dead: bool,
2738    ) {
2739        if let Some(rare_data) = self.rare_data.get() {
2740            rare_data.watchers.notify(event_mask, cookie, name, mode, is_dead);
2741        }
2742    }
2743}
2744
2745impl std::fmt::Debug for FsNode {
2746    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2747        f.debug_struct("FsNode")
2748            .field("fs", &self.fs().name())
2749            .field("info", &*self.info())
2750            .field("ops_ty", &self.ops().type_name())
2751            .finish()
2752    }
2753}
2754
2755impl Releasable for FsNode {
2756    type Context<'a> = CurrentTaskAndLocked<'a>;
2757
2758    fn release<'a>(self, context: CurrentTaskAndLocked<'a>) {
2759        let (locked, current_task) = context;
2760        if let Some(fs) = self.fs.upgrade() {
2761            fs.remove_node(&self);
2762        }
2763        if let Err(err) = self.ops.forget(
2764            locked.cast_locked::<FileOpsCore>(),
2765            current_task,
2766            self.info.into_inner(),
2767        ) {
2768            log_error!("Error on FsNodeOps::forget: {err:?}");
2769        }
2770    }
2771}
2772
/// Performs the discretionary (mode-bit) access check for `current_task` against a node owned
/// by `node_uid`/`node_gid` with permission bits `mode`.
///
/// The access requested is `permission_flags.as_access()`. It is first compared against the
/// user, group or other bits of `mode`, selected by the task's `fsuid` and group membership.
/// Any bits that the mode does not grant may then be covered by `CAP_DAC_READ_SEARCH` and
/// `CAP_DAC_OVERRIDE`, consulted in that order and only when they could actually help.
///
/// Returns `Ok(())` once every requested bit is granted, and `EACCES` otherwise.
fn check_access(
    fs_node: &FsNode,
    current_task: &CurrentTask,
    permission_flags: security::PermissionFlags,
    node_uid: uid_t,
    node_gid: gid_t,
    mode: FileMode,
) -> Result<(), Errno> {
    // Determine which of the access bits apply to the `current_task`.
    let (fsuid, is_in_group) =
        current_task.with_current_creds(|creds| (creds.fsuid, creds.is_in_group(node_gid)));
    let granted = if fsuid == node_uid {
        mode.user_access()
    } else if is_in_group {
        mode.group_access()
    } else {
        mode.other_access()
    };

    let access = permission_flags.as_access();
    if granted.contains(access) {
        return Ok(());
    }

    // The mode bits alone are not sufficient. Track the access bits that are still missing;
    // the capability checks below remove bits from this set as they are satisfied.
    let mut requested = access & !granted;

    // If this check was triggered by `access()`, or a variant, then check for a `dontaudit`
    // statement for the `audit_access` permission for this caller & file.
    // The answer is computed on the first capability query and reused for later ones.
    let have_dont_audit = OnceBool::new();
    let has_capability = move |current_task, capability| {
        let dont_audit = have_dont_audit.get_or_init(|| {
            permission_flags.contains(PermissionFlags::ACCESS)
                && security::has_dontaudit_access(current_task, fs_node)
        });
        if dont_audit {
            security::is_task_capable_noaudit(current_task, capability)
        } else {
            security::check_task_capable(current_task, capability).is_ok()
        }
    };

    // CAP_DAC_READ_SEARCH allows bypass of read checks, and directory traverse (eXecute) checks.
    // The capability is only queried if at least one of those bits is actually missing.
    let dac_read_search_access =
        if mode.is_dir() { Access::READ | Access::EXEC } else { Access::READ };
    if dac_read_search_access.intersects(requested)
        && has_capability(current_task, CAP_DAC_READ_SEARCH)
    {
        requested.remove(dac_read_search_access);
    }
    if requested.is_empty() {
        return Ok(());
    }

    // CAP_DAC_OVERRIDE allows bypass of all checks (though see the comment for file-execute).
    let mut dac_override_access = Access::READ | Access::WRITE;
    dac_override_access |= if mode.is_dir() {
        Access::EXEC
    } else {
        // File execute access checks may not be bypassed unless at least one executable bit is set.
        (mode.user_access() | mode.group_access() | mode.other_access()) & Access::EXEC
    };
    if dac_override_access.intersects(requested) && has_capability(current_task, CAP_DAC_OVERRIDE) {
        requested.remove(dac_override_access);
    }
    if requested.is_empty() {
        return Ok(());
    }

    // Some requested access remains that neither the mode bits nor the capabilities cover.
    return error!(EACCES);
}
2845
2846#[cfg(test)]
2847mod tests {
2848    use super::*;
2849    use crate::device::mem::mem_device_init;
2850    use crate::testing::*;
2851    use crate::vfs::buffers::VecOutputBuffer;
2852    use starnix_uapi::auth::Credentials;
2853    use starnix_uapi::file_mode::mode;
2854
    // Checks that a character device node created for `DeviceType::ZERO` can be opened through
    // the filesystem, and that reading from it fills the buffer with zero bytes.
    #[::fuchsia::test]
    async fn open_device_file() {
        spawn_kernel_and_run(async |locked, current_task| {
            mem_device_init(locked, &*current_task).expect("mem_device_init");

            // Create a device file that points to the `zero` device (which is automatically
            // registered in the kernel).
            current_task
                .fs()
                .root()
                .create_node(
                    locked,
                    &current_task,
                    "zero".into(),
                    mode!(IFCHR, 0o666),
                    DeviceType::ZERO,
                )
                .expect("create_node");

            const CONTENT_LEN: usize = 10;
            let mut buffer = VecOutputBuffer::new(CONTENT_LEN);

            // Read from the zero device.
            let device_file = current_task
                .open_file(locked, "zero".into(), OpenFlags::RDONLY)
                .expect("open device file");
            device_file.read(locked, &current_task, &mut buffer).expect("read from zero");

            // Assert the contents: all `CONTENT_LEN` bytes must be zero.
            assert_eq!(&[0; CONTENT_LEN], buffer.data());
        })
        .await;
    }
2888
    // Checks that every field written into a node's `FsNodeInfo` is reported back by `stat()`.
    #[::fuchsia::test]
    async fn node_info_is_reflected_in_stat() {
        spawn_kernel_and_run(async |locked, current_task| {
            // Create a node.
            let node = &current_task
                .fs()
                .root()
                .create_node(
                    locked,
                    &current_task,
                    "zero".into(),
                    FileMode::IFCHR,
                    DeviceType::ZERO,
                )
                .expect("create_node")
                .entry
                .node;
            // Overwrite the info with distinct values, so that each `stat` field can be traced
            // back to the info field it came from.
            node.update_info(|info| {
                info.mode = FileMode::IFSOCK;
                info.size = 1;
                info.blocks = 2;
                info.blksize = 4;
                info.uid = 9;
                info.gid = 10;
                info.link_count = 11;
                info.time_status_change = UtcInstant::from_nanos(1);
                info.time_access = UtcInstant::from_nanos(2);
                info.time_modify = UtcInstant::from_nanos(3);
                info.rdev = DeviceType::new(13, 13);
            });
            let stat = node.stat(locked, &current_task).expect("stat");

            assert_eq!(stat.st_mode, FileMode::IFSOCK.bits());
            assert_eq!(stat.st_size, 1);
            assert_eq!(stat.st_blksize, 4);
            assert_eq!(stat.st_blocks, 2);
            assert_eq!(stat.st_uid, 9);
            assert_eq!(stat.st_gid, 10);
            assert_eq!(stat.st_nlink, 11);
            // The timestamps were set to 1, 2 and 3 nanoseconds, so the whole-second fields are
            // zero and the values show up in the nanosecond fields.
            assert_eq!(stat.st_ctime, 0);
            assert_eq!(stat.st_ctime_nsec, 1);
            assert_eq!(stat.st_atime, 0);
            assert_eq!(stat.st_atime_nsec, 2);
            assert_eq!(stat.st_mtime, 0);
            assert_eq!(stat.st_mtime_nsec, 3);
            assert_eq!(stat.st_rdev, DeviceType::new(13, 13).bits());
        })
        .await;
    }
2938
    // Checks how `FlockOperation::from_flags` interprets the `LOCK_*` flag combinations.
    #[::fuchsia::test]
    fn test_flock_operation() {
        // Neither an empty flag word nor one with every bit set is a valid operation.
        assert!(FlockOperation::from_flags(0).is_err());
        assert!(FlockOperation::from_flags(u32::MAX).is_err());

        // LOCK_SH alone: a shared (non-exclusive), blocking lock request.
        let operation1 = FlockOperation::from_flags(LOCK_SH).expect("from_flags");
        assert!(!operation1.is_unlock());
        assert!(!operation1.is_lock_exclusive());
        assert!(operation1.is_blocking());

        // LOCK_EX | LOCK_NB: an exclusive, non-blocking lock request.
        let operation2 = FlockOperation::from_flags(LOCK_EX | LOCK_NB).expect("from_flags");
        assert!(!operation2.is_unlock());
        assert!(operation2.is_lock_exclusive());
        assert!(!operation2.is_blocking());

        // LOCK_UN alone: an unlock request, which is reported as blocking.
        let operation3 = FlockOperation::from_flags(LOCK_UN).expect("from_flags");
        assert!(operation3.is_unlock());
        assert!(!operation3.is_lock_exclusive());
        assert!(operation3.is_blocking());
    }
2959
    // Exercises `FsNode::check_access` for a task whose credentials come from
    // `Credentials::with_ids(1, 2)` with supplementary groups 3 and 4, against a regular file
    // whose owner, group and permission bits are varied case by case.
    // NOTE(review): the expectations below are consistent with `with_ids(1, 2)` meaning uid 1 /
    // gid 2 and with these credentials carrying no DAC-bypass capabilities — confirm.
    #[::fuchsia::test]
    async fn test_check_access() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mut creds = Credentials::with_ids(1, 2);
            creds.groups = vec![3, 4];
            current_task.set_creds(creds);

            // Create a node.
            let node = &current_task
                .fs()
                .root()
                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
                .expect("create_node")
                .entry
                .node;
            // Helper: rewrite the node's owner, group and permission bits, then run the access
            // check for `access` as the current task.
            let check_access = |locked: &mut Locked<Unlocked>,
                                uid: uid_t,
                                gid: gid_t,
                                perm: u32,
                                access: Access| {
                node.update_info(|info| {
                    info.mode = mode!(IFREG, perm);
                    info.uid = uid;
                    info.gid = gid;
                });
                node.check_access(
                    locked,
                    &current_task,
                    &MountInfo::detached(),
                    access,
                    CheckAccessReason::InternalPermissionChecks,
                    security::Auditable::Location(std::panic::Location::caller()),
                )
            };

            // Owner 0 / group 0: the task matches neither, so only the "other" bits apply.
            // Owner-only and group-only permissions grant nothing...
            assert_eq!(check_access(locked, 0, 0, 0o700, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 0, 0, 0o700, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 0, 0, 0o700, Access::WRITE), error!(EACCES));

            assert_eq!(check_access(locked, 0, 0, 0o070, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 0, 0, 0o070, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 0, 0, 0o070, Access::WRITE), error!(EACCES));

            // ...while the "other" bits grant everything.
            assert_eq!(check_access(locked, 0, 0, 0o007, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 0, 0, 0o007, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 0, 0, 0o007, Access::WRITE), Ok(()));

            // Owner 1: the task is the owner, so the owner bits apply.
            assert_eq!(check_access(locked, 1, 0, 0o700, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 1, 0, 0o700, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 1, 0, 0o700, Access::WRITE), Ok(()));

            // Each single owner bit grants exactly its own kind of access: execute only...
            assert_eq!(check_access(locked, 1, 0, 0o100, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 1, 0, 0o100, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 1, 0, 0o100, Access::WRITE), error!(EACCES));

            // ...write only...
            assert_eq!(check_access(locked, 1, 0, 0o200, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 1, 0, 0o200, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 1, 0, 0o200, Access::WRITE), Ok(()));

            // ...and read only.
            assert_eq!(check_access(locked, 1, 0, 0o400, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 1, 0, 0o400, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 1, 0, 0o400, Access::WRITE), error!(EACCES));

            // Group 2 with a different owner: the group bits apply, so owner-only permissions
            // grant nothing...
            assert_eq!(check_access(locked, 0, 2, 0o700, Access::EXEC), error!(EACCES));
            assert_eq!(check_access(locked, 0, 2, 0o700, Access::READ), error!(EACCES));
            assert_eq!(check_access(locked, 0, 2, 0o700, Access::WRITE), error!(EACCES));

            // ...while the group bits grant everything.
            assert_eq!(check_access(locked, 0, 2, 0o070, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 0, 2, 0o070, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 0, 2, 0o070, Access::WRITE), Ok(()));

            // Group 3 is one of the task's supplementary groups, and counts as a match too.
            assert_eq!(check_access(locked, 0, 3, 0o070, Access::EXEC), Ok(()));
            assert_eq!(check_access(locked, 0, 3, 0o070, Access::READ), Ok(()));
            assert_eq!(check_access(locked, 0, 3, 0o070, Access::WRITE), Ok(()));
        })
        .await;
    }
3037
3038    #[::fuchsia::test]
3039    async fn set_security_xattr_fails_without_security_module_or_root() {
3040        spawn_kernel_and_run(async |locked, current_task| {
3041            let mut creds = Credentials::with_ids(1, 2);
3042            creds.groups = vec![3, 4];
3043            current_task.set_creds(creds);
3044
3045            // Create a node.
3046            let node = &current_task
3047                .fs()
3048                .root()
3049                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
3050                .expect("create_node")
3051                .entry
3052                .node;
3053
3054            // Give read-write-execute access.
3055            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3056
3057            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3058            // should fail.
3059            assert_eq!(
3060                node.set_xattr(
3061                    locked,
3062                    &current_task,
3063                    &MountInfo::detached(),
3064                    "security.name".into(),
3065                    "security_label".into(),
3066                    XattrOp::Create,
3067                ),
3068                error!(EPERM)
3069            );
3070        })
3071        .await;
3072    }
3073
3074    #[::fuchsia::test]
3075    async fn set_non_user_xattr_fails_without_security_module_or_root() {
3076        spawn_kernel_and_run(async |locked, current_task| {
3077            let mut creds = Credentials::with_ids(1, 2);
3078            creds.groups = vec![3, 4];
3079            current_task.set_creds(creds);
3080
3081            // Create a node.
3082            let node = &current_task
3083                .fs()
3084                .root()
3085                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
3086                .expect("create_node")
3087                .entry
3088                .node;
3089
3090            // Give read-write-execute access.
3091            node.update_info(|info| info.mode = mode!(IFREG, 0o777));
3092
3093            // Without a security module, and without CAP_SYS_ADMIN capabilities, setting the xattr
3094            // should fail.
3095            assert_eq!(
3096                node.set_xattr(
3097                    locked,
3098                    &current_task,
3099                    &MountInfo::detached(),
3100                    "trusted.name".into(),
3101                    "some data".into(),
3102                    XattrOp::Create,
3103                ),
3104                error!(EPERM)
3105            );
3106        })
3107        .await;
3108    }
3109
    // Checks that a `security.*` extended attribute can be written by root and then read back by
    // a non-root task, on a file whose mode contains no read or write permission bit.
    #[::fuchsia::test]
    async fn get_security_xattr_succeeds_without_read_access() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mut creds = Credentials::with_ids(1, 2);
            creds.groups = vec![3, 4];
            current_task.set_creds(creds);

            // Create a node.
            let node = &current_task
                .fs()
                .root()
                .create_node(locked, &current_task, "foo".into(), FileMode::IFREG, DeviceType::NONE)
                .expect("create_node")
                .entry
                .node;

            // Restrict the mode to 0o100 (owner-execute only), so that no read or write permission
            // bit is set for anyone, then switch the current task to root credentials.
            node.update_info(|info| info.mode = mode!(IFREG, 0o100));
            current_task.set_creds(Credentials::root());

            // Setting the label should succeed even without write access to the file.
            assert_eq!(
                node.set_xattr(
                    locked,
                    &current_task,
                    &MountInfo::detached(),
                    "security.name".into(),
                    "security_label".into(),
                    XattrOp::Create,
                ),
                Ok(())
            );

            // Remove root access from the current task.
            current_task.set_creds(Credentials::with_ids(1, 1));

            // Getting the label should succeed even without read access to the file.
            assert_eq!(
                node.get_xattr(
                    locked,
                    &current_task,
                    &MountInfo::detached(),
                    "security.name".into(),
                    4096
                ),
                Ok(ValueOrSize::Value("security_label".into()))
            );
        })
        .await;
    }
3160}