Skip to main content

starnix_core/vfs/
fd_table.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::security;
6use crate::task::{CurrentTask, CurrentTaskAndLocked, register_delayed_release};
7use crate::vfs::{FdNumber, FileHandle, FileReleaser};
8use bitflags::bitflags;
9use fuchsia_rcu::subtle::{RcuPtrRef, rcu_ptr_to_arc};
10use fuchsia_rcu::{RcuArc, RcuReadGuard, RcuReadScope, rcu_drop};
11use fuchsia_rcu_collections::rcu_array::RcuArray;
12use linux_uapi::{FD_CLOEXEC, FIOCLEX, FIONCLEX};
13use starnix_sync::{
14    FdTableWriterQueueLock, FileOpsCore, LockBefore, LockDepGuard, LockDepMutex, LockEqualOrBefore,
15    Locked, ThreadGroupLimits, Unlocked,
16};
17use starnix_syscalls::SyscallResult;
18use starnix_types::ownership::Releasable;
19use starnix_uapi::errors::Errno;
20use starnix_uapi::open_flags::OpenFlags;
21use starnix_uapi::resource_limits::Resource;
22use starnix_uapi::{errno, error};
23use static_assertions::const_assert;
24use std::sync::Arc;
25use std::sync::atomic::{AtomicI32, AtomicUsize, Ordering, fence};
26
bitflags! {
    /// Per-descriptor flags that an `FdTable` stores alongside each file handle.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct FdFlags: u32 {
        /// Whether the file descriptor should be closed when the process execs.
        const CLOEXEC = FD_CLOEXEC;
    }
}
34
35impl std::convert::From<FdFlags> for SyscallResult {
36    fn from(value: FdFlags) -> Self {
37        value.bits().into()
38    }
39}
40
/// An identifier for an `FdTable`.
///
/// Used by flock to drop file locks when a file descriptor is closed.
///
/// The wrapped `usize` is the address of the `FdTableInner` the identifier was created from.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct FdTableId(usize);
46
47impl FdTableId {
48    fn new(id: *const FdTableInner) -> Self {
49        Self(id as usize)
50    }
51
52    pub fn raw(&self) -> usize {
53        self.0
54    }
55}
56
/// We store the CLOEXEC bit and the address of the `FileObject` in a single `usize` so that we can
/// operate on an FdTable entry atomically. This mask is used to select the CLOEXEC bit; the
/// complement of the mask selects the address.
const FLAGS_MASK: usize = 0x1;
60
/// An encoded entry in an `FdTable`.
///
/// Encodes both the `FileHandle` and the CLOEXEC bit. Can either hold an entry or be empty.
///
/// The derived `Default` yields a zero `value`, i.e. an empty entry.
#[derive(Debug, Default)]
struct EncodedEntry {
    /// Rather than using a separate "flags" field, we encode the table entry into a single usize.
    ///
    /// If `value` is zero, the entry is empty.
    ///
    /// The lowest bit of `value` is the CLOEXEC bit.
    ///
    /// The remaining bits of `value` are a `FileHandle` converted to a raw pointer.
    value: AtomicUsize,
}
75
76// An assert to ensure that the lowest bit of the `FileHandle` is available to store the CLOEXEC
77// bit.
78const_assert!(std::mem::align_of::<*const FileReleaser>() >= 1 << FLAGS_MASK);
79
80impl EncodedEntry {
81    /// Encodes a `FileHandle` and `FdFlags` into a single `usize`.
82    ///
83    /// The returned value holds a reference to the `FileObject` and must be released to avoid a
84    /// memory leak.
85    fn encode(file: FileHandle, flags: FdFlags) -> usize {
86        let ptr = Arc::into_raw(file) as usize;
87        let flags = (flags.bits() as usize) & FLAGS_MASK;
88        ptr | flags
89    }
90
    /// Releases the `FileHandle` for a previously encoded value.
    ///
    /// A value whose pointer bits are null (such as zero, the empty encoding) is ignored.
    /// Otherwise the reference held by the encoding is reclaimed, a `FlushedFile` holding a clone
    /// of the handle is registered for delayed release, and the reclaimed reference is handed to
    /// `rcu_drop`.
    ///
    /// # Safety
    ///
    /// `value` must have been encoded by `Self::encode`.
    unsafe fn release(id: FdTableId, value: usize) {
        let ptr = Self::decode_ptr(value);
        if !ptr.is_null() {
            // SAFETY: The caller guarantees that `value` was produced by `Self::encode`, so `ptr`
            // came from `Arc::into_raw`.
            let file = unsafe { Arc::from_raw(ptr) };
            // Concurrent readers expect the `FileHandle` to be retained for the entire RCU grace
            // period. `FlushedFile` delayed release may be processed before the grace period
            // expires. We must defer a reference to RCU to ensure delayed release does not drop the
            // last reference and free the file before RCU readers are done with it.
            register_delayed_release(FlushedFile(file.clone(), id));
            rcu_drop(file)
        }
    }
109
110    /// Decodes the `FdFlags` from an encoded `usize`.
111    fn decode_flags(value: usize) -> FdFlags {
112        FdFlags::from_bits_truncate((value & FLAGS_MASK) as u32)
113    }
114
115    /// Decodes the `FileHandle` from an encoded `usize`.
116    fn decode_ptr(value: usize) -> *const FileReleaser {
117        (value & !FLAGS_MASK) as *const _
118    }
119
120    /// Creates a new `EncodedEntry` from a `FdTableEntry`.
121    fn new(entry: FdTableEntry) -> Self {
122        Self { value: AtomicUsize::new(Self::encode(entry.file, entry.flags)) }
123    }
124
125    /// Whether this entry contains a valid `FileHandle`.
126    fn is_some(&self) -> bool {
127        let value = self.value.load(Ordering::Acquire);
128        value != 0
129    }
130
131    /// Whether this entry is empty.
132    fn is_none(&self) -> bool {
133        !self.is_some()
134    }
135
136    /// Sets the `FdFlags` for this entry, preserving the `FileHandle`.
137    fn set_flags(&self, flags: FdFlags) {
138        loop {
139            let old_value = self.value.load(Ordering::Relaxed);
140            assert!(old_value != 0);
141            let new_value = old_value & !FLAGS_MASK | (flags.bits() as usize) & FLAGS_MASK;
142            if self
143                .value
144                .compare_exchange_weak(old_value, new_value, Ordering::AcqRel, Ordering::Relaxed)
145                .is_ok()
146            {
147                return;
148            }
149        }
150    }
151
    /// Sets the `FileHandle` for this entry, preserving the `FdFlags`.
    ///
    /// The previously stored handle is released via `Self::release` once the new value has been
    /// installed.
    ///
    /// # Panics
    ///
    /// Panics if the entry is empty.
    fn set_file(&self, id: FdTableId, file: FileHandle) {
        // The reference owned by `file` is transferred into the encoded value.
        let ptr = Arc::into_raw(file) as usize;
        loop {
            let old_value = self.value.load(Ordering::Relaxed);
            assert!(old_value != 0);
            // Carry the current flag bits over to the new encoding.
            let flags = old_value & FLAGS_MASK;
            let new_value = ptr | flags;
            // Retry if the value changed since the load or the weak exchange failed spuriously.
            if self
                .value
                .compare_exchange_weak(old_value, new_value, Ordering::AcqRel, Ordering::Relaxed)
                .is_ok()
            {
                // SAFETY: The value was previously encoded by `Self::encode`.
                unsafe { Self::release(id, old_value) };
                return;
            }
        }
    }
171
    /// Reads the entry, returning a guard that maintains a consistent view of it.
    ///
    /// The file pointer and the flags are decoded from a single atomic load. Returns `None` if
    /// the entry is empty. The guard is tied to the lifetime of `scope`.
    fn read<'a>(&self, scope: &'a RcuReadScope) -> Option<FdTableEntryGuard<'a>> {
        let value = self.value.load(Ordering::Acquire);
        if value == 0 {
            return None;
        }
        let ptr = Self::decode_ptr(value);
        let flags = Self::decode_flags(value);
        // SAFETY: The pointer is valid because it was encoded in `self.value`.
        let file = unsafe { RcuPtrRef::new(scope, ptr) };
        Some(FdTableEntryGuard { file, flags })
    }
184
185    /// Sets the `FileHandle` and `FdFlags` for this entry.
186    fn set_entry(&self, id: FdTableId, entry: FdTableEntry) -> bool {
187        // SAFETY: The value is encoded by `Self::encode`.
188        unsafe { self.set(id, Self::encode(entry.file, entry.flags)) }
189    }
190
191    /// Makes the entry empty.
192    fn clear(&self, id: FdTableId) -> bool {
193        // SAFETY: The value is zero.
194        unsafe { self.set(id, 0) }
195    }
196
197    /// Sets the value of this entry to the given value.
198    ///
199    /// Most clients should call `set_entry` or `clear` instead.
200    ///
201    /// # Safety
202    ///
203    /// The value must be encoded by `Self::encode` or be zero.
204    unsafe fn set(&self, id: FdTableId, value: usize) -> bool {
205        let old_value = self.value.swap(value, Ordering::AcqRel);
206        if old_value != 0 {
207            // SAFETY: The value was previously encoded by `Self::encode`.
208            unsafe { Self::release(id, old_value) };
209            true
210        } else {
211            false
212        }
213    }
214}
215
216impl Clone for EncodedEntry {
217    fn clone(&self) -> Self {
218        if let Some(guard) = self.read(&RcuReadScope::new()) {
219            Self::new(guard.to_entry())
220        } else {
221            Self::default()
222        }
223    }
224}
225
/// Dropping an entry gives up the reference held by its encoded value, if any.
///
/// Unlike `EncodedEntry::release`, this path drops the reference directly and does not register a
/// `FlushedFile`.
impl Drop for EncodedEntry {
    fn drop(&mut self) {
        let value = self.value.load(Ordering::Acquire);
        let ptr = Self::decode_ptr(value);
        // A null pointer means the entry is empty and there is nothing to drop.
        if !ptr.is_null() {
            // SAFETY: The pointer is valid because it was encoded in `self.value`.
            let _file = unsafe { Arc::from_raw(ptr) };
        }
    }
}
236
/// An entry in the `FdTable`.
///
/// This is the decoded, owned form of an `EncodedEntry`: it holds a strong `FileHandle`
/// together with the descriptor's flags.
#[derive(Debug, Clone)]
struct FdTableEntry {
    /// The file handle.
    file: FileHandle,

    /// The flags associated with the file handle.
    flags: FdFlags,
}
246
/// A guard for reading an `FdTableEntry`.
///
/// This provides memory-safe access to decoded `FdTableEntry` data, which is guarded by RCU.
/// The guard borrows the RCU read scope for `'a`; use `to_handle` or `to_entry` to obtain
/// values that do not depend on that lifetime.
struct FdTableEntryGuard<'a> {
    /// The pointer to the file handle.
    file: RcuPtrRef<'a, FileReleaser>,

    /// The flags associated with the file handle.
    flags: FdFlags,
}
257
258impl<'a> FdTableEntryGuard<'a> {
259    fn flags(&self) -> FdFlags {
260        self.flags
261    }
262
263    /// Acquire a strong reference to the file handle.
264    fn to_handle(&self) -> FileHandle {
265        // SAFETY: We can pass `self.file` to `rcu_ptr_to_arc` because it was obtained from
266        // `Arc::into_raw` via `EncodedEntry::encode` and `EncodedEntry::decode_ptr`.
267        unsafe { rcu_ptr_to_arc(self.file) }
268    }
269
270    /// Upgrade this guard to a full `FdTableEntry` independent of the guard lifetime.
271    fn to_entry(&self) -> FdTableEntry {
272        FdTableEntry { file: self.to_handle(), flags: self.flags }
273    }
274}
275
/// A `FileHandle` that has been closed and is waiting to be flushed.
///
/// The `FdTableId` identifies the table the handle was removed from; it is passed to
/// `flush` when the value is released.
struct FlushedFile(FileHandle, FdTableId);
278
279impl Releasable for FlushedFile {
280    type Context<'a> = CurrentTaskAndLocked<'a>;
281    fn release<'a>(self, context: Self::Context<'a>) {
282        let (locked, current_task) = context;
283        let FlushedFile(file, id) = self;
284        file.flush(locked, current_task, id);
285    }
286}
287
/// A read-only view of an `FdTable`.
///
/// When reading an `FdTable`, we use an `FdTableView` to have a coherent view of the table even
/// though the table can be modified by other threads concurrently.
///
/// The actual entries in the slice can still be modified by other threads. However, the view
/// provided by the `FdTableView` is protected by an RCU read lock.
struct FdTableView<'a> {
    /// The entries in the table, indexed by raw `FdNumber`.
    slice: &'a [EncodedEntry],
}
299
300impl<'a> FdTableView<'a> {
301    /// Returns the number of entries in the table.
302    fn len(&self) -> usize {
303        self.slice.len()
304    }
305
306    /// Whether the view contains a given `FdNumber`.
307    fn is_some(&self, fd: FdNumber) -> bool {
308        self.slice.get(fd.raw() as usize).map_or(false, |entry| entry.is_some())
309    }
310
311    /// Whether the view does not contain a given `FdNumber`.
312    fn is_none(&self, fd: FdNumber) -> bool {
313        !self.is_some(fd)
314    }
315
316    /// Returns the `FileHandle` for a given `FdNumber`, if any.
317    fn get_file(&self, scope: &RcuReadScope, fd: FdNumber) -> Option<FileHandle> {
318        self.slice
319            .get(fd.raw() as usize)
320            .and_then(|entry| entry.read(scope))
321            .map(|guard| guard.to_handle())
322    }
323
324    /// Returns the `FdTableEntry` for a given `FdNumber`, if any.
325    fn get_entry(&self, scope: &RcuReadScope, fd: FdNumber) -> Option<FdTableEntry> {
326        self.slice
327            .get(fd.raw() as usize)
328            .and_then(|entry| entry.read(scope))
329            .map(|guard| guard.to_entry())
330    }
331}
332
/// Write access to an `FdTableInner`.
///
/// Holds the table's `writer_queue` lock for as long as the guard is alive.
struct FdTableWriteGuard<'a> {
    /// The table being modified.
    store: &'a FdTableInner,
    /// The held `writer_queue` lock; kept only so that it is released when the guard is dropped.
    _write_guard: LockDepGuard<'a, ()>,
}
337
338impl<'a> FdTableWriteGuard<'a> {
339    /// The lowest available `FdNumber`.
340    fn next_fd(&self) -> FdNumber {
341        self.store.next_fd.get()
342    }
343
344    /// Recalculates the lowest available FD >= minfd based on the contents of the map.
345    fn calculate_lowest_available_fd(&self, view: &FdTableView<'_>, minfd: &FdNumber) -> FdNumber {
346        let mut fd: FdNumber = *minfd;
347        while view.is_some(fd) {
348            fd = FdNumber::from_raw(fd.raw() + 1);
349        }
350        fd
351    }
352
353    // Returns the (possibly memoized) lowest available FD >= minfd in this map.
354    fn get_lowest_available_fd(&self, scope: &RcuReadScope, minfd: FdNumber) -> FdNumber {
355        if minfd > self.store.next_fd.get() {
356            let view = self.store.read(scope);
357            return self.calculate_lowest_available_fd(&view, &minfd);
358        }
359        self.store.next_fd.get()
360    }
361
362    /// Returns the `FileHandle` for a given `FdNumber`, if any.
363    fn get_file(&self, scope: &RcuReadScope, fd: FdNumber) -> Option<FileHandle> {
364        self.store.read(scope).get_file(scope, fd)
365    }
366
    /// Inserts a new entry into the `FdTable`.
    ///
    /// Returns whether the `FdTable` previously contained an entry for the given `FdNumber`.
    ///
    /// # Errors
    ///
    /// Returns `EBADF` if `fd` is negative and `EMFILE` if `fd` is not below `rlimit`.
    fn insert_entry(
        &self,
        scope: &RcuReadScope,
        fd: FdNumber,
        rlimit: u64,
        entry: FdTableEntry,
    ) -> Result<bool, Errno> {
        let raw_fd = fd.raw();
        if raw_fd < 0 {
            return error!(EBADF);
        }
        if raw_fd as u64 >= rlimit {
            return error!(EMFILE);
        }
        let mut view = self.store.read(scope);
        // If this insertion consumes the memoized lowest free slot, advance the memo to the next
        // free slot above it.
        if raw_fd == self.store.next_fd.get().raw() {
            self.store
                .next_fd
                .set(self.calculate_lowest_available_fd(&view, &FdNumber::from_raw(raw_fd + 1)));
        }
        let raw_fd = raw_fd as usize;
        if view.len() <= raw_fd {
            // SAFETY: The write guard excludes concurrent writers.
            unsafe { self.store.entries.ensure_at_least(raw_fd + 1) };
            // Re-read the view: the previous one was too short to index `raw_fd`.
            view = self.store.read(scope);
        }
        let id = self.store.id();
        Ok(view.slice[raw_fd].set_entry(id, entry))
    }
399
400    /// Removes an entry from the `FdTable`.
401    ///
402    /// Returns whether the `FdTable` previously contained an entry for the given `FdNumber`.
403    fn remove_entry(&self, scope: &RcuReadScope, fd: &FdNumber) -> bool {
404        let raw_fd = fd.raw() as usize;
405        let view = self.store.read(scope);
406        if raw_fd >= view.len() {
407            return false;
408        }
409        let id = self.store.id();
410        let removed = view.slice[raw_fd].clear(id);
411        if removed && raw_fd < self.store.next_fd.get().raw() as usize {
412            self.store.next_fd.set(*fd);
413        }
414        removed
415    }
416
417    /// Sets the flags for a given `FdNumber`.
418    ///
419    /// Returns `Errno` if the `FdTable` does not contain an entry for the given `FdNumber`.
420    fn set_fd_flags(
421        &self,
422        scope: &RcuReadScope,
423        fd: FdNumber,
424        flags: FdFlags,
425    ) -> Result<(), Errno> {
426        let view = self.store.read(scope);
427        if view.is_none(fd) {
428            return error!(EBADF);
429        }
430        let raw_fd = fd.raw() as usize;
431        view.slice[raw_fd].set_flags(flags);
432        Ok(())
433    }
434
    /// Retains only the entries for which the given predicate returns `true`.
    ///
    /// The predicate is called with the `FdNumber` and a mutable reference to the `FdFlags` for
    /// each entry in the `FdTable`. If the predicate returns `false`, the entry is removed from
    /// the `FdTable`. Otherwise, the `FdFlags` are updated to the value modified by the predicate.
    ///
    /// Empty slots are skipped without calling the predicate. After the pass, the memoized lowest
    /// available `FdNumber` is recomputed from zero.
    fn retain<F>(&self, scope: &RcuReadScope, mut predicate: F)
    where
        F: FnMut(FdNumber, &mut FdFlags) -> bool,
    {
        let id = self.store.id();
        let view = self.store.read(scope);
        for (index, encoded_entry) in view.slice.iter().enumerate() {
            let fd = FdNumber::from_raw(index as i32);
            if let Some(guard) = encoded_entry.read(scope) {
                // Hand the predicate a copy of the flags so changes can be detected afterwards.
                let mut modified_flags = guard.flags();
                if !predicate(fd, &mut modified_flags) {
                    encoded_entry.clear(id);
                } else if modified_flags != guard.flags() {
                    // Only write back when the predicate actually changed the flags.
                    encoded_entry.set_flags(modified_flags);
                }
            }
        }
        self.store.next_fd.set(self.calculate_lowest_available_fd(&view, &FdNumber::from_raw(0)));
    }
459
460    /// Retain none of the entries in the table.
461    fn clear(&self) {
462        self.retain(&RcuReadScope::new(), |_, _| false);
463    }
464
465    /// Replaces the `FileHandle` for each entry in the `FdTable` with the result of the given
466    /// predicate.
467    ///
468    /// The predicate is called with the `FileHandle` for each entry in the `FdTable`. If the
469    /// predicate returns `Some(file)`, the entry is updated with the new `FileHandle`. Otherwise,
470    /// the entry is left unchanged.
471    fn remap<F>(&self, scope: &RcuReadScope, predicate: F)
472    where
473        F: Fn(&FileHandle) -> Option<FileHandle>,
474    {
475        let id = self.store.id();
476        let view = self.store.read(scope);
477        for encoded_entry in view.slice.iter() {
478            if let Some(guard) = encoded_entry.read(scope) {
479                let file = guard.to_handle();
480                if let Some(replacement_file) = predicate(&file) {
481                    encoded_entry.set_file(id, replacement_file);
482                }
483            }
484        }
485    }
486}
487
/// An `FdNumber` that can be atomically updated.
///
/// Used for the `next_fd` field of `FdTableInner`, which is only modified by the `FdTable` when
/// holding the `writer_queue` lock.
///
/// The derived `Default` starts at a raw value of zero.
#[derive(Debug, Default)]
struct AtomicFdNumber {
    /// The raw value of the `FdNumber`.
    value: AtomicI32,
}
497
498impl AtomicFdNumber {
499    /// Returns the current value of the `FdNumber`.
500    ///
501    /// Uses `Ordering::Relaxed`.
502    fn get(&self) -> FdNumber {
503        FdNumber::from_raw(self.value.load(Ordering::Relaxed))
504    }
505
506    /// Sets the value of the `FdNumber`.
507    ///
508    /// Uses `Ordering::Relaxed`.
509    fn set(&self, value: FdNumber) {
510        self.value.store(value.raw(), Ordering::Relaxed);
511    }
512}
513
514impl Clone for AtomicFdNumber {
515    fn clone(&self) -> Self {
516        Self { value: AtomicI32::new(self.value.load(Ordering::Relaxed)) }
517    }
518}
519
/// The state of an `FdTable` that is shared between tasks.
///
/// The `writer_queue` is used to serialize concurrent writers to the `FdTable`, and to prevent
/// writers from being blocked by readers.
#[derive(Debug)]
struct FdTableInner {
    /// The number of shared references to this table.
    share_count: AtomicUsize,

    /// The entries of the `FdTable`.
    entries: RcuArray<EncodedEntry>,

    /// The next available `FdNumber`.
    next_fd: AtomicFdNumber,

    /// A mutex used to serialize concurrent writers to the `FdTable`, and to prevent writers from
    /// being blocked by readers.
    writer_queue: LockDepMutex<(), FdTableWriterQueueLock>,
}
539
540impl Default for FdTableInner {
541    fn default() -> Self {
542        FdTableInner {
543            share_count: AtomicUsize::new(1),
544            entries: Default::default(),
545            next_fd: AtomicFdNumber::default(),
546            writer_queue: LockDepMutex::new(()),
547        }
548    }
549}
550
551impl Clone for FdTableInner {
552    fn clone(&self) -> Self {
553        let _guard = self.writer_queue.lock();
554        Self {
555            share_count: AtomicUsize::new(1),
556            entries: self.entries.clone(),
557            next_fd: self.next_fd.clone(),
558            writer_queue: LockDepMutex::new(()),
559        }
560    }
561}
562
563impl Drop for FdTableInner {
564    fn drop(&mut self) {
565        let scope = RcuReadScope::new();
566        let view = self.read(&scope);
567        for entry in view.slice.iter() {
568            assert!(entry.is_none());
569        }
570    }
571}
572
573impl FdTableInner {
574    /// Returns the `FdTableId` of the `FdTableInner`.
575    fn id(&self) -> FdTableId {
576        FdTableId::new(self as *const Self)
577    }
578
579    /// Gets the number of `FdTable` instances sharing this `FdTableInner`.
580    fn share_count(&self) -> usize {
581        self.share_count.load(Ordering::Relaxed)
582    }
583
584    /// Increases the share count for this `FdTableInner`.
585    fn share(&self) {
586        self.share_count.fetch_add(1, Ordering::Relaxed);
587    }
588
589    /// Decreases the share count for this `FdTableInner`. The table is cleared when the count
590    /// reaches zero.
591    fn unshare(self: Arc<Self>) {
592        // Explicitly clear the table when the last sharer of this table drops its reference.
593        //
594        // We cannot rely on the table being implicitly cleared by `Drop`. RCU drops are deferred to
595        // guarantee memory safety. This introduces nondeterminism to the teardown process, but file
596        // cleanup must be done deterministically.
597        if self.share_count.fetch_sub(1, Ordering::Release) == 1 {
598            fence(Ordering::Acquire);
599            // Clearing releases `FileHandle`s, which transitively calls `FileOps::flush()`. This
600            // effectively makes clear into a blocking operation which can re-enter userspace.
601            // Blocking in this way is unsafe while an RCU read lock is held. `FdTable` is managed
602            // by RCU, so a read lock will generally be held in contexts where `unshare()` is
603            // called. Use a delayed release to clear the table at a known point outside of an RCU
604            // read scope, through a reference held outside RCU.
605            register_delayed_release(ClearFdTable(self));
606        }
607    }
608
609    /// Returns a `FdTableView` that provides read-only access to the state of the `FdTableInner`.
610    fn read<'a>(&self, scope: &'a RcuReadScope) -> FdTableView<'a> {
611        let slice = self.entries.as_slice(scope);
612        FdTableView { slice }
613    }
614
615    /// Returns a `FdTableWriteGuard` that provides exclusive access to the state of the
616    /// `FdTableInner`.
617    fn write(&self) -> FdTableWriteGuard<'_> {
618        FdTableWriteGuard { store: self, _write_guard: self.writer_queue.lock() }
619    }
620}
621
/// An `FdTableInner` that is waiting to be cleared.
///
/// Created by `FdTableInner::unshare` when the last sharer goes away; the table is cleared when
/// this value is released.
struct ClearFdTable(Arc<FdTableInner>);
624
625impl Releasable for ClearFdTable {
626    type Context<'a> = CurrentTaskAndLocked<'a>;
627    fn release<'a>(self, _context: Self::Context<'a>) {
628        self.0.write().clear();
629    }
630}
631
/// An RCU smart pointer wrapper for `FdTableInner` that automatically tracks active sharers via the
/// underlying `share_count` and triggers deterministic cleanup when the last sharer drops its
/// reference.
///
/// Cloning the wrapper increments the share count; dropping it (or replacing the table through
/// `update`) decrements it.
#[derive(Debug)]
struct FdTableInnerArc {
    /// The currently installed table.
    inner: RcuArc<FdTableInner>,
}
639
640impl Default for FdTableInnerArc {
641    fn default() -> Self {
642        Self::new(FdTableInner::default())
643    }
644}
645
646impl Clone for FdTableInnerArc {
647    fn clone(&self) -> Self {
648        let inner = self.inner.to_arc();
649        inner.share();
650        Self { inner: RcuArc::new(inner) }
651    }
652}
653
654impl Drop for FdTableInnerArc {
655    fn drop(&mut self) {
656        self.inner.to_arc().unshare();
657    }
658}
659
660impl FdTableInnerArc {
661    pub fn new(inner: FdTableInner) -> Self {
662        assert_eq!(inner.share_count(), 1, "FdTableInner must only be shared via clone()");
663        Self { inner: RcuArc::new(Arc::new(inner)) }
664    }
665
666    pub fn read(&self) -> RcuReadGuard<FdTableInner> {
667        self.inner.read()
668    }
669
670    pub fn update(&self, scope: &RcuReadScope, new_inner: FdTableInner) {
671        let old_inner = self.inner.update_swap(scope, Arc::new(new_inner));
672        old_inner.unshare();
673    }
674}
675
/// An `FdTable` is a table of file descriptors.
///
/// The derived `Clone` clones the inner `FdTableInnerArc`, so a cloned `FdTable` shares its
/// entries with the original. Use `fork` for an independent snapshot.
#[derive(Debug, Clone, Default)]
pub struct FdTable {
    /// The state of the `FdTable` that is shared between tasks.
    inner: FdTableInnerArc,
}
682
/// The target `FdNumber` for a duplicated file descriptor.
pub enum TargetFdNumber {
    /// The duplicated `FdNumber` will be the smallest available `FdNumber`.
    Default,

    /// The duplicated `FdNumber` should be this specific `FdNumber`.
    Specific(FdNumber),

    /// The duplicated `FdNumber` should be the smallest available `FdNumber` that is greater than
    /// or equal to this `FdNumber`.
    Minimum(FdNumber),
}
694
695impl FdTable {
696    /// Returns the `FdTableId` of the `FdTable`.
697    pub fn id(&self) -> FdTableId {
698        self.inner.read().id()
699    }
700
701    /// Returns new unshared `FdTable` that is a snapshot of the state of the `FdTable`.
702    pub fn fork(&self) -> FdTable {
703        let forked = self.inner.read().clone();
704        FdTable { inner: FdTableInnerArc::new(forked) }
705    }
706
707    /// Ensures that this `FdTable` is not shared by any other `FdTable` instances.
708    pub fn unshare(&self) {
709        let unshared = self.inner.read().clone();
710        self.inner.update(&RcuReadScope::new(), unshared);
711    }
712
713    /// Releases the `FdTable`, closing any files opened exclusively by this table.
714    pub fn release(&self) {
715        self.inner.update(&RcuReadScope::new(), Default::default());
716    }
717
718    /// Trims close-on-exec file descriptors from the table.
719    pub fn exec(&self, locked: &mut Locked<Unlocked>, current_task: &CurrentTask) {
720        self.retain(locked, current_task, |_fd, flags| !flags.contains(FdFlags::CLOEXEC));
721    }
722
723    /// Inserts a file descriptor into the table.
724    pub fn insert<L>(
725        &self,
726        locked: &mut Locked<L>,
727        current_task: &CurrentTask,
728        fd: FdNumber,
729        file: FileHandle,
730    ) -> Result<(), Errno>
731    where
732        L: LockBefore<ThreadGroupLimits>,
733    {
734        let flags = FdFlags::empty();
735        let rlimit = current_task.thread_group().get_rlimit(locked, Resource::NOFILE);
736        let inner = self.inner.read();
737        let guard = inner.write();
738        guard.insert_entry(inner.scope(), fd, rlimit, FdTableEntry { file, flags })?;
739        Ok(())
740    }
741
742    /// Adds a file descriptor to the table.
743    ///
744    /// The file descriptor will be assigned the next available number.
745    ///
746    /// Returns the assigned file descriptor number.
747    ///
748    /// This function is the most common way to add a file descriptor to the table.
749    pub fn add<L>(
750        &self,
751        locked: &mut Locked<L>,
752        current_task: &CurrentTask,
753        file: FileHandle,
754        flags: FdFlags,
755    ) -> Result<FdNumber, Errno>
756    where
757        L: LockEqualOrBefore<FileOpsCore>,
758    {
759        let locked = locked.cast_locked::<FileOpsCore>();
760        let rlimit = current_task.thread_group().get_rlimit(locked, Resource::NOFILE);
761        let inner = self.inner.read();
762        let guard = inner.write();
763        let fd = guard.next_fd();
764        guard.insert_entry(inner.scope(), fd, rlimit, FdTableEntry { file, flags })?;
765        Ok(fd)
766    }
767
    /// Duplicates a file descriptor.
    ///
    /// The file stored at `oldfd` is inserted at a descriptor chosen by `target` with the given
    /// `flags`:
    ///
    /// - `TargetFdNumber::Specific(fd)`: any existing entry at `fd` is removed and `fd` is used.
    /// - `TargetFdNumber::Minimum(fd)`: the lowest available `FdNumber` at or above `fd` is used.
    /// - `TargetFdNumber::Default`: the lowest available `FdNumber` is used.
    ///
    /// Returns the new `FdNumber`.
    ///
    /// # Errors
    ///
    /// Returns `EBADF` if `oldfd` is not in the table, or if a `Specific` target is not below the
    /// `NOFILE` limit. Errors from inserting the new entry are propagated.
    pub fn duplicate<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        oldfd: FdNumber,
        target: TargetFdNumber,
        flags: FdFlags,
    ) -> Result<FdNumber, Errno>
    where
        L: LockBefore<ThreadGroupLimits>,
    {
        let rlimit = current_task.thread_group().get_rlimit(locked, Resource::NOFILE);
        let inner = self.inner.read();
        let guard = inner.write();
        // Take a reference to the source file first, before any existing target entry is removed.
        let file = guard.get_file(inner.scope(), oldfd).ok_or_else(|| errno!(EBADF))?;

        let fd = match target {
            TargetFdNumber::Specific(fd) => {
                // We need to check the rlimit before we remove the entry from state
                // because we cannot error out after removing the entry.
                if fd.raw() as u64 >= rlimit {
                    // ltp_dup201 shows that we're supposed to return EBADF in this
                    // situation, instead of EMFILE, which is what we normally return
                    // when we're past the rlimit.
                    return error!(EBADF);
                }
                guard.remove_entry(inner.scope(), &fd);
                fd
            }
            TargetFdNumber::Minimum(fd) => guard.get_lowest_available_fd(inner.scope(), fd),
            TargetFdNumber::Default => {
                guard.get_lowest_available_fd(inner.scope(), FdNumber::from_raw(0))
            }
        };
        let existing_entry =
            guard.insert_entry(inner.scope(), fd, rlimit, FdTableEntry { file, flags })?;
        // The target slot was either just cleared or chosen because it was free.
        assert!(!existing_entry);
        Ok(fd)
    }
811
812    /// Returns the file handle associated with the given file descriptor.
813    ///
814    /// Returns the file handle even if the file was opened with `O_PATH`.
815    ///
816    /// This operation is uncommon. Most clients should use `get` instead, which fails if the file
817    /// was opened with `O_PATH`.
818    pub fn get_allowing_opath(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
819        self.get_allowing_opath_with_flags(fd).map(|(file, _flags)| file)
820    }
821
822    /// Returns the file handle and flags associated with the given file descriptor.
823    ///
824    /// Returns the file handle even if the file was opened with `O_PATH`.
825    ///
826    /// This operation is uncommon. Most clients should use `get` instead, which fails if the file
827    /// was opened with `O_PATH`.
828    pub fn get_allowing_opath_with_flags(
829        &self,
830        fd: FdNumber,
831    ) -> Result<(FileHandle, FdFlags), Errno> {
832        let inner = self.inner.read();
833        let view = inner.read(inner.scope());
834        view.get_entry(inner.scope(), fd)
835            .map(|entry| (entry.file, entry.flags))
836            .ok_or_else(|| errno!(EBADF))
837    }
838
839    /// Returns the file handle associated with the given file descriptor.
840    ///
841    /// This operation fails if the file was opened with `O_PATH`.
842    pub fn get(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
843        let file = self.get_allowing_opath(fd)?;
844        if file.flags().contains(OpenFlags::PATH) {
845            return error!(EBADF);
846        }
847        Ok(file)
848    }
849
850    /// Closes the file descriptor associated with the given file descriptor.
851    ///
852    /// This operation fails if the file descriptor is not valid.
853    pub fn close(&self, fd: FdNumber) -> Result<(), Errno> {
854        let inner = self.inner.read();
855        let guard = inner.write();
856        if guard.remove_entry(inner.scope(), &fd) { Ok(()) } else { error!(EBADF) }
857    }
858
859    /// Returns the flags associated with the given file descriptor.
860    ///
861    /// Returns the flags even if the file was opened with `O_PATH`.
862    pub fn get_fd_flags_allowing_opath(&self, fd: FdNumber) -> Result<FdFlags, Errno> {
863        self.get_allowing_opath_with_flags(fd).map(|(_file, flags)| flags)
864    }
865
866    /// Updates the flags of the specified FD with the `request`ed change.
867    ///
868    /// This operation fails if the file descriptor was opened with `O_PATH` or is not valid.
869    pub fn ioctl_fd_flags(
870        &self,
871        current_task: &CurrentTask,
872        fd: FdNumber,
873        request: u32,
874    ) -> Result<(), Errno> {
875        let inner = self.inner.read();
876        let guard = inner.write();
877        let file = guard.get_file(inner.scope(), fd).ok_or_else(|| errno!(EBADF))?;
878        if file.flags().contains(OpenFlags::PATH) {
879            return error!(EBADF);
880        }
881        let flags = match request {
882            FIOCLEX => FdFlags::CLOEXEC,
883            FIONCLEX => FdFlags::empty(),
884            _ => {
885                return error!(EINVAL);
886            }
887        };
888        security::check_file_ioctl_access(current_task, &file, request)?;
889        guard.set_fd_flags(inner.scope(), fd, flags)
890    }
891
892    /// Sets the flags associated with the given file descriptor.
893    ///
894    /// This operation fails if the file descriptor is not valid.
895    pub fn set_fd_flags_allowing_opath(&self, fd: FdNumber, flags: FdFlags) -> Result<(), Errno> {
896        let inner = self.inner.read();
897        let guard = inner.write();
898        guard.set_fd_flags(inner.scope(), fd, flags)
899    }
900
901    /// Retains only the FDs matching the given `predicate`.
902    ///
903    /// The predicate is called with the `FdNumber` and a mutable reference to the `FdFlags` for
904    /// each entry in the `FdTable`. If the predicate returns `false`, the entry is removed from
905    /// the `FdTable`. Otherwise, the `FdFlags` are updated to the value modified by the predicate.
906    pub fn retain<L, F>(&self, _locked: &mut Locked<L>, _current_task: &CurrentTask, predicate: F)
907    where
908        L: LockEqualOrBefore<FileOpsCore>,
909        F: Fn(FdNumber, &mut FdFlags) -> bool,
910    {
911        let inner = self.inner.read();
912        let guard = inner.write();
913        guard.retain(inner.scope(), predicate);
914    }
915
916    /// Returns a vector of all current file descriptors in the table.
917    pub fn get_all_fds(&self) -> Vec<FdNumber> {
918        let inner = self.inner.read();
919        let view = inner.read(inner.scope());
920        view.slice
921            .iter()
922            .enumerate()
923            .filter_map(|(index, encoded_entry)| {
924                if encoded_entry.is_none() { None } else { Some(FdNumber::from_raw(index as i32)) }
925            })
926            .collect()
927    }
928
929    /// Executes `predicate(file) => maybe_replacement` on every non-empty table entry.
930    ///
931    /// Replaces `file` with `replacement_file` in the table when
932    /// `maybe_replacement == Some(replacement_file)`.
933    pub fn remap<L, F: Fn(&FileHandle) -> Option<FileHandle>>(
934        &self,
935        _locked: &mut Locked<L>,
936        _current_task: &CurrentTask,
937        predicate: F,
938    ) where
939        L: LockEqualOrBefore<FileOpsCore>,
940    {
941        let inner = self.inner.read();
942        let guard = inner.write();
943        guard.remap(inner.scope(), predicate);
944    }
945}
946
#[cfg(test)]
mod test {
    use super::*;
    use crate::fs::fuchsia::SyslogFile;
    use crate::testing::*;

    /// Test helper: installs `file` into `files` with empty `FdFlags` and returns the new FD.
    fn add(
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        files: &FdTable,
        file: FileHandle,
    ) -> Result<FdNumber, Errno> {
        files.add(locked, current_task, file, FdFlags::empty())
    }

    /// Newly added files get FDs 0 and 1, and both resolve back to the same file object.
    #[::fuchsia::test]
    async fn test_fd_table_install() {
        spawn_kernel_and_run(async |locked, current_task| {
            let files = FdTable::default();
            let file = SyslogFile::new_file(locked, &current_task);

            let first = add(locked, &current_task, &files, file.clone()).unwrap();
            assert_eq!(first.raw(), 0);
            let second = add(locked, &current_task, &files, file.clone()).unwrap();
            assert_eq!(second.raw(), 1);

            // Each descriptor must hand back the very same file object.
            for fd in [first, second] {
                assert!(Arc::ptr_eq(&files.get(fd).unwrap(), &file));
            }

            // The descriptor right after the last one added was never installed.
            let unused = FdNumber::from_raw(second.raw() + 1);
            assert_eq!(files.get(unused).map(|_| ()), error!(EBADF));

            files.release();
        })
        .await;
    }

    /// A forked table shares file objects with the original but has independent FD flags.
    #[::fuchsia::test]
    async fn test_fd_table_fork() {
        spawn_kernel_and_run(async |locked, current_task| {
            let files = FdTable::default();
            let file = SyslogFile::new_file(locked, &current_task);

            let fd0 = add(locked, &current_task, &files, file.clone()).unwrap();
            let fd1 = add(locked, &current_task, &files, file).unwrap();
            let fd2 = FdNumber::from_raw(2);

            let forked = files.fork();

            // Entries present before the fork point at the same file in both tables.
            for fd in [fd0, fd1] {
                assert_eq!(
                    Arc::as_ptr(&files.get(fd).unwrap()),
                    Arc::as_ptr(&forked.get(fd).unwrap())
                );
            }

            // An FD that was never added is missing from both tables.
            assert!(files.get(fd2).is_err());
            assert!(forked.get(fd2).is_err());

            // Changing flags in the original must not leak into the fork.
            files.set_fd_flags_allowing_opath(fd0, FdFlags::CLOEXEC).unwrap();
            let original_flags = files.get_fd_flags_allowing_opath(fd0).unwrap();
            let forked_flags = forked.get_fd_flags_allowing_opath(fd0).unwrap();
            assert_eq!(FdFlags::CLOEXEC, original_flags);
            assert_ne!(FdFlags::CLOEXEC, forked_flags);

            forked.release();
            files.release();
        })
        .await;
    }

    /// `exec` drops entries flagged `CLOEXEC` and keeps the others.
    #[::fuchsia::test]
    async fn test_fd_table_exec() {
        spawn_kernel_and_run(async |locked, current_task| {
            let files = FdTable::default();
            let file = SyslogFile::new_file(locked, &current_task);

            let cloexec_fd = add(locked, &current_task, &files, file.clone()).unwrap();
            let plain_fd = add(locked, &current_task, &files, file).unwrap();

            files.set_fd_flags_allowing_opath(cloexec_fd, FdFlags::CLOEXEC).unwrap();

            // Before exec both descriptors are reachable.
            assert!(files.get(cloexec_fd).is_ok());
            assert!(files.get(plain_fd).is_ok());

            files.exec(locked, &current_task);

            // After exec only the descriptor without CLOEXEC survives.
            assert!(files.get(cloexec_fd).is_err());
            assert!(files.get(plain_fd).is_ok());

            files.release();
        })
        .await;
    }

    /// A closed descriptor number is reused by the next insertion.
    #[::fuchsia::test]
    async fn test_fd_table_pack_values() {
        spawn_kernel_and_run(async |locked, current_task| {
            let files = FdTable::default();
            let file = SyslogFile::new_file(locked, &current_task);

            // Populate the first two slots.
            let fd0 = add(locked, &current_task, &files, file.clone()).unwrap();
            let fd1 = add(locked, &current_task, &files, file.clone()).unwrap();
            assert_eq!(fd0.raw(), 0);
            assert_eq!(fd1.raw(), 1);

            // The first close succeeds; closing the same FD again fails.
            assert!(files.close(fd0).is_ok());
            assert!(files.close(fd0).is_err());
            // The entry can no longer be looked up.
            assert!(files.get(fd0).is_err());

            // The freed slot is the lowest available one, so it is handed out next.
            let reused = add(locked, &current_task, &files, file).unwrap();
            assert_eq!(reused.raw(), 0);

            files.release();
        })
        .await;
    }

    /// Releasing one handle of a shared table leaves the other handle's entries intact.
    #[::fuchsia::test]
    async fn test_fd_table_shared_release() {
        spawn_kernel_and_run(async |locked, current_task| {
            let files = FdTable::default();
            let file = SyslogFile::new_file(locked, &current_task);

            let fd = add(locked, &current_task, &files, file).unwrap();
            assert_eq!(files.get_all_fds(), vec![fd]);

            // Cloning `FdTable` shares the underlying table rather than copying it.
            let shared_files = files.clone();
            assert_eq!(shared_files.get_all_fds(), vec![fd]);

            // Releasing the original handle must not clear the table, because `shared_files`
            // still holds a shared reference to it.
            files.release();
            assert!(files.get_all_fds().is_empty());
            assert_eq!(shared_files.get_all_fds(), vec![fd]);

            // Releasing the last handle clears the table.
            shared_files.release();
            assert!(shared_files.get_all_fds().is_empty());
        })
        .await;
    }
}