starnix_core/vfs/io_uring.rs

// Copyright 2024 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#![allow(non_upper_case_globals)]
// Expects are used for programming errors.
#![allow(clippy::unwrap_in_result)]

use crate::mm::memory::MemoryObject;
use crate::mm::{
    DesiredAddress, IOVecPtr, MappingName, MappingOptions, MemoryAccessor, MemoryAccessorExt,
    PAGE_SIZE, ProtectionFlags, read_to_object_as_bytes,
};
use crate::task::CurrentTask;
use crate::vfs::socket::syscalls::{
    MsgHdrPtr, MsgHdrRef, WithAlternateBuffer, recvmsg_impl, sys_recvfrom, sys_sendmsg, sys_sendto,
};
use crate::vfs::syscalls::{
    sys_pread64, sys_preadv2, sys_pwrite64, sys_pwritev2, sys_read, sys_write,
};
use crate::vfs::{
    Anon, FdNumber, FileHandle, FileObject, FileOps, NamespaceNode, fileops_impl_dataless,
    fileops_impl_nonseekable, fileops_impl_noop_sync,
};
use bitflags::bitflags;
use starnix_logging::{set_zx_name, track_stub};
use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, OrderedMutex, TerminalLock, Unlocked};
use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
use starnix_types::user_buffer::{UserBuffer, UserBuffers};
use starnix_uapi::errors::Errno;
use starnix_uapi::file_mode::Access;
use starnix_uapi::open_flags::OpenFlags;
use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
use starnix_uapi::user_value::UserValue;
use starnix_uapi::{
    IORING_FEAT_SINGLE_MMAP, IORING_OFF_CQ_RING, IORING_OFF_SQ_RING, IORING_OFF_SQES, errno, error,
    io_cqring_offsets, io_sqring_offsets, io_uring_cqe, io_uring_op, io_uring_op_IORING_OP_ACCEPT,
    io_uring_op_IORING_OP_ASYNC_CANCEL, io_uring_op_IORING_OP_CLOSE, io_uring_op_IORING_OP_CONNECT,
    io_uring_op_IORING_OP_EPOLL_CTL, io_uring_op_IORING_OP_FADVISE,
    io_uring_op_IORING_OP_FALLOCATE, io_uring_op_IORING_OP_FILES_UPDATE,
    io_uring_op_IORING_OP_FSYNC, io_uring_op_IORING_OP_LINK_TIMEOUT, io_uring_op_IORING_OP_MADVISE,
    io_uring_op_IORING_OP_NOP, io_uring_op_IORING_OP_OPENAT, io_uring_op_IORING_OP_OPENAT2,
    io_uring_op_IORING_OP_POLL_ADD, io_uring_op_IORING_OP_POLL_REMOVE, io_uring_op_IORING_OP_READ,
    io_uring_op_IORING_OP_READ_FIXED, io_uring_op_IORING_OP_READV, io_uring_op_IORING_OP_RECV,
    io_uring_op_IORING_OP_RECVMSG, io_uring_op_IORING_OP_SEND, io_uring_op_IORING_OP_SENDMSG,
    io_uring_op_IORING_OP_STATX, io_uring_op_IORING_OP_SYNC_FILE_RANGE,
    io_uring_op_IORING_OP_TIMEOUT, io_uring_op_IORING_OP_TIMEOUT_REMOVE,
    io_uring_op_IORING_OP_WRITE, io_uring_op_IORING_OP_WRITE_FIXED, io_uring_op_IORING_OP_WRITEV,
    io_uring_params, io_uring_sqe, io_uring_sqe_flags_bit_IOSQE_ASYNC_BIT,
    io_uring_sqe_flags_bit_IOSQE_BUFFER_SELECT_BIT,
    io_uring_sqe_flags_bit_IOSQE_CQE_SKIP_SUCCESS_BIT, io_uring_sqe_flags_bit_IOSQE_FIXED_FILE_BIT,
    io_uring_sqe_flags_bit_IOSQE_IO_DRAIN_BIT, io_uring_sqe_flags_bit_IOSQE_IO_HARDLINK_BIT,
    io_uring_sqe_flags_bit_IOSQE_IO_LINK_BIT, off_t, socklen_t, uapi,
};
use std::sync::Arc;
use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout};

// See https://github.com/google/gvisor/blob/master/pkg/abi/linux/iouring.go#L47
pub const IORING_MAX_ENTRIES: u32 = 1 << 15; // 32768
const IORING_MAX_CQ_ENTRIES: u32 = 2 * IORING_MAX_ENTRIES;

bitflags! {
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct IoRingSetupFlags: u32 {
        const IoPoll = starnix_uapi::IORING_SETUP_IOPOLL;
        const SqPoll = starnix_uapi::IORING_SETUP_SQPOLL;
        const SqAff = starnix_uapi::IORING_SETUP_SQ_AFF;
        const CqSize = starnix_uapi::IORING_SETUP_CQSIZE;
        const Clamp = starnix_uapi::IORING_SETUP_CLAMP;
        const AttachWq = starnix_uapi::IORING_SETUP_ATTACH_WQ;
        const RDisabled = starnix_uapi::IORING_SETUP_R_DISABLED;
        const SubmitAll = starnix_uapi::IORING_SETUP_SUBMIT_ALL;
        const CoopTaskRun = starnix_uapi::IORING_SETUP_COOP_TASKRUN;
        const TaskRunFlag = starnix_uapi::IORING_SETUP_TASKRUN_FLAG;
        const SqE128 = starnix_uapi::IORING_SETUP_SQE128;
        const CqE32 = starnix_uapi::IORING_SETUP_CQE32;
        const SingleIssuer = starnix_uapi::IORING_SETUP_SINGLE_ISSUER;
        const DeferTaskRun = starnix_uapi::IORING_SETUP_DEFER_TASKRUN;
        const NoMmap = starnix_uapi::IORING_SETUP_NO_MMAP;
        const RegisteredFdOnly = starnix_uapi::IORING_SETUP_REGISTERED_FD_ONLY;
        const NoSqArray = starnix_uapi::IORING_SETUP_NO_SQARRAY;

        /// The flags that we support. Specifying a flag outside of this set will generate an
        /// error.
        const SupportedFlags = starnix_uapi::IORING_SETUP_CQSIZE |
                               starnix_uapi::IORING_SETUP_COOP_TASKRUN |
                               starnix_uapi::IORING_SETUP_TASKRUN_FLAG |
                               starnix_uapi::IORING_SETUP_SINGLE_ISSUER |
                               starnix_uapi::IORING_SETUP_DEFER_TASKRUN;

        /// The flags that we ignore. Specifying a flag in this set will not generate an
        /// error but will have no effect.
        // TODO(https://fxbug.dev/297431387): Implement these flags.
        const IgnoredFlags = starnix_uapi::IORING_SETUP_COOP_TASKRUN |
                             starnix_uapi::IORING_SETUP_TASKRUN_FLAG |
                             starnix_uapi::IORING_SETUP_SINGLE_ISSUER |
                             starnix_uapi::IORING_SETUP_DEFER_TASKRUN;
    }

    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    struct SqEntryFlags: u8 {
        const FIXED_FILE = 1 << io_uring_sqe_flags_bit_IOSQE_FIXED_FILE_BIT;
        const IO_DRAIN = 1 << io_uring_sqe_flags_bit_IOSQE_IO_DRAIN_BIT;
        const IO_LINK = 1 << io_uring_sqe_flags_bit_IOSQE_IO_LINK_BIT;
        const IO_HARDLINK = 1 << io_uring_sqe_flags_bit_IOSQE_IO_HARDLINK_BIT;
        const ASYNC = 1 << io_uring_sqe_flags_bit_IOSQE_ASYNC_BIT;
        const BUFFER_SELECT = 1 << io_uring_sqe_flags_bit_IOSQE_BUFFER_SELECT_BIT;
        const CQE_SKIP_SUCCESS = 1 << io_uring_sqe_flags_bit_IOSQE_CQE_SKIP_SUCCESS_BIT;
    }
}

impl IoRingSetupFlags {
    fn build_and_validate_from(value: u32) -> Result<Self, Errno> {
        let Some(flags) = IoRingSetupFlags::from_bits(value) else {
            track_stub!(
                TODO("https://fxbug.dev/297431387"),
                "io_uring_setup undefined flag(s)",
                value
            );
            return error!(EINVAL);
        };

        let unsupported_flags = flags.difference(IoRingSetupFlags::SupportedFlags);
        if !unsupported_flags.is_empty() {
            track_stub!(
                TODO("https://fxbug.dev/297431387"),
                "io_uring_setup unsupported flags",
                unsupported_flags.bits()
            );
            return error!(EINVAL);
        }
        let ignored_flags = flags.intersection(IoRingSetupFlags::IgnoredFlags);
        if !ignored_flags.is_empty() {
            track_stub!(
                TODO("https://fxbug.dev/297431387"),
                "io_uring_setup ignored flags",
                ignored_flags.bits()
            );
        }

        // IORING_SETUP_DEFER_TASKRUN requires IORING_SETUP_SINGLE_ISSUER.
        if flags.contains(IoRingSetupFlags::DeferTaskRun)
            && !flags.contains(IoRingSetupFlags::SingleIssuer)
        {
            return error!(EINVAL);
        }

        Ok(flags)
    }
}

type RingIndex = u32;

type UserRingBufferHeader = uapi::io_uring_buf_ring__bindgen_ty_1__bindgen_ty_1;
type UserRingBufferEntry = uapi::io_uring_buf;

static_assertions::const_assert_eq!(
    std::mem::size_of::<u16>(),
    uapi::size_of_field!(UserRingBufferHeader, tail)
);
static_assertions::const_assert_eq!(
    std::mem::size_of::<UserRingBufferHeader>(),
    std::mem::size_of::<UserRingBufferEntry>()
);

/// The control header at the start of the shared buffer.
///
/// This structure is not declared in the Linux UAPI. Instead, userspace learns about its structure
/// from the SQ and CQ offsets returned by `io_uring_setup()`.
///
/// We determined this structure by running `io_uring_setup()` and observing the placement of each
/// field. The total size of the structure is 64 bytes, which we determined from the reported
/// offset of the cqes field. It's likely that many of the bytes at the end of this structure are
/// just padding for alignment.
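///
/// As an illustration of the wrapping index arithmetic (a worked example, not part of the UAPI):
/// with `sq_ring_entries == 8` (so `sq_ring_mask == 7`), an `sq_head` of 0xFFFF_FFFE and an
/// `sq_tail` of 0x0000_0001 describe a queue holding `sq_tail.wrapping_sub(sq_head) == 3`
/// entries, starting at ring slot `sq_head & sq_ring_mask == 6`.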
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, IntoBytes, FromBytes, KnownLayout, Immutable)]
struct ControlHeader {
    /// The index of the first element in the submission queue.
    ///
    /// These values use the full range of u32, wrapping around on overflow. To find the entry in
    /// the ring buffer, you need to take this index modulo `sq_ring_entries` or, equivalently,
    /// mask this value with `sq_ring_mask`.
    sq_head: u32,

    /// The index of the first element beyond the end of the submission queue.
    ///
    /// The number of items in the queue is defined to be `sq_tail` - `sq_head`, which means the
    /// queue is empty if the head and tail are equal.
    sq_tail: u32,

    /// The index of the first element in the completion queue.
    ///
    /// These values use the full range of u32, wrapping around on overflow. To find the entry in
    /// the ring buffer, you need to take this index modulo `cq_ring_entries` or, equivalently,
    /// mask this value with `cq_ring_mask`.
    cq_head: u32,

    /// The index of the first element beyond the end of the completion queue.
    ///
    /// The number of items in the queue is defined to be `cq_tail` - `cq_head`, which means the
    /// queue is empty if the head and tail are equal.
    cq_tail: u32,

    /// The mask to apply to map `sq_head` and `sq_tail` into the ring buffer.
    sq_ring_mask: u32,

    /// The mask to apply to map `cq_head` and `cq_tail` into the ring buffer.
    cq_ring_mask: u32,

    /// The number of entries in the submission queue.
    sq_ring_entries: u32,

    /// The number of entries in the completion queue.
    cq_ring_entries: u32,

    /// The number of submission queue entries that were dropped for being malformed.
    sq_dropped: u32,

    sq_flags: u32,
    cq_flags: u32,

    /// The number of completion queue entries that were not placed in the completion queue because
    /// there were no slots available in the ring buffer.
    cq_overflow: u32,

    _padding: [u8; 16],
}

const RING_ALIGNMENT: usize = 64;

// From params.cq_off.cqes reported by sys_io_uring_setup.
static_assertions::const_assert_eq!(std::mem::size_of::<ControlHeader>(), RING_ALIGNMENT);

/// An entry in the submission queue.
///
/// We cannot use the bindgen type generated for `io_uring_sqe` directly because that type contains
/// unions. Instead, we redefine the type here and assert that the layout matches the one defined
/// by bindgen.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, IntoBytes, FromBytes, KnownLayout, Immutable)]
struct SqEntry {
    opcode: u8,
    flags: u8,
    ioprio: u16,
    raw_fd: i32,
    field0: u64,
    field1: u64,
    len: u32,
    op_flags: u32,
    user_data: u64,
    buf_index_or_group: u16,
    personality: u16,
    field2: u32,
    field3: [u64; 2usize],
}

uapi::check_arch_independent_same_layout! {
    SqEntry = io_uring_sqe {
        opcode => opcode,
        flags => flags,
        ioprio => ioprio,
        raw_fd => fd,
        field0 => __bindgen_anon_1,
        field1 => __bindgen_anon_2,
        len => len,
        op_flags => __bindgen_anon_3,
        user_data => user_data,
        buf_index_or_group => __bindgen_anon_4,
        personality => personality,
        field2 => __bindgen_anon_5,
        field3 => __bindgen_anon_6,
    }
}

uapi::check_arch_independent_layout! {
    io_uring_recvmsg_out{
        namelen,
        controllen,
        payloadlen,
        flags,
    }
}

impl SqEntry {
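    // In the Linux `io_uring_sqe` layout, `field0` covers the `off`/`addr2` union and `field1`
    // covers the `addr`/`splice_off_in` union, which is why `offset()` reads `field0` while
    // `address()` and `iovec_addr()` read `field1`.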
    fn complete(&self, result: Result<SyscallResult, Errno>, flags: u32) -> CqEntry {
        let res = match result {
            Ok(return_value) => return_value.value() as i32,
            Err(errno) => errno.return_value() as i32,
        };
        CqEntry { user_data: self.user_data, res, flags }
    }

    fn fd(&self) -> FdNumber {
        FdNumber::from_raw(self.raw_fd)
    }

    fn iovec_addr<Arch: ArchSpecific>(&self, arch: &Arch) -> IOVecPtr {
        IOVecPtr::new(arch, self.field1)
    }

    fn iovec_count(&self) -> UserValue<i32> {
        (self.len as i32).into()
    }

    fn address(&self) -> UserAddress {
        self.field1.into()
    }

    fn length(&self) -> usize {
        self.len as usize
    }

    fn offset(&self) -> off_t {
        self.field0 as off_t
    }

    fn buf_index(&self) -> usize {
        self.buf_index_or_group as usize
    }

    fn group(&self) -> u16 {
        self.buf_index_or_group
    }
}

/// An entry in the completion queue.
///
/// We cannot use the bindgen type generated for `io_uring_cqe` directly because that type contains
/// a variable length array. Instead, we redefine the type here and assert that the layout matches
/// the one defined by bindgen.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, IntoBytes, FromBytes, KnownLayout, Immutable)]
struct CqEntry {
    pub user_data: u64,
    pub res: i32,
    pub flags: u32,
}

static_assertions::assert_eq_size!(CqEntry, io_uring_cqe);
static_assertions::const_assert_eq!(
    std::mem::offset_of!(CqEntry, user_data),
    std::mem::offset_of!(io_uring_cqe, user_data)
);
static_assertions::const_assert_eq!(
    std::mem::offset_of!(CqEntry, res),
    std::mem::offset_of!(io_uring_cqe, res)
);
static_assertions::const_assert_eq!(
    std::mem::offset_of!(CqEntry, flags),
    std::mem::offset_of!(io_uring_cqe, flags)
);

const CQES_OFFSET: usize = std::mem::size_of::<ControlHeader>();

#[inline]
fn align_ring_field(offset: usize) -> usize {
    offset.next_multiple_of(RING_ALIGNMENT)
}
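
/// The entry counts that determine the layout of the memory shared with userspace.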
struct IoUringMetadata {
    /// The number of entries in the submission queue.
    sq_entries: u32,

    /// The number of entries in the completion queue.
    cq_entries: u32,
}

impl IoUringMetadata {
    /// The offset of the completion queue entry with the given index.
    ///
    /// The offset is from the start of the `ring_buffer` VMO.
    fn cq_entry_offset(&self, index: u32) -> u64 {
        let index = index % self.cq_entries;
        (CQES_OFFSET + index as usize * std::mem::size_of::<io_uring_cqe>()) as u64
    }

    /// The offset of the first completion queue entry in the `ring_buffer` VMO.
    fn cqes_offset(&self) -> usize {
        CQES_OFFSET
    }

    /// The offset of the submission queue indirection array in the `ring_buffer` VMO.
    fn array_offset(&self) -> usize {
        CQES_OFFSET
            + align_ring_field(self.cq_entries as usize * std::mem::size_of::<io_uring_cqe>())
    }

    /// The offset of the submission queue indirection array entry with the given index in the
    /// `ring_buffer` VMO.
    fn array_entry_offset(&self, index: u32) -> u64 {
        let index = index % self.sq_entries;
        (self.array_offset() + index as usize * std::mem::size_of::<RingIndex>()) as u64
    }

    /// The number of bytes in the `ring_buffer` VMO.
    fn ring_buffer_size(&self) -> usize {
        self.array_offset() + self.sq_entries as usize * std::mem::size_of::<RingIndex>()
    }

    /// The offset of the submission queue entry with the given index in the `sq_entries` VMO.
    ///
    /// This index is the actual index of the submission queue entry, after indirecting through the
    /// indirection array.
    fn sq_entry_offset(&self, index: u32) -> u64 {
        let index = index % self.sq_entries;
        (index as usize * std::mem::size_of::<io_uring_sqe>()) as u64
    }

    /// The number of bytes in the `sq_entries` VMO.
    fn sq_entries_size(&self) -> usize {
        self.sq_entries as usize * std::mem::size_of::<io_uring_sqe>()
    }
}
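
// As a concrete illustration of the layout above (a worked example assuming the usual 64-byte
// `io_uring_sqe` and 16-byte `io_uring_cqe`): with `sq_entries == 8` and `cq_entries == 16`,
// the `ring_buffer` VMO holds the 64-byte `ControlHeader`, then 16 * 16 == 256 bytes of CQEs
// (already 64-byte aligned), so `array_offset() == 320` and
// `ring_buffer_size() == 320 + 8 * 4 == 352`. The separate `sq_entries` VMO is
// 8 * 64 == 512 bytes.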

#[repr(u32)]
enum Op {
    Accept = io_uring_op_IORING_OP_ACCEPT,
    AsyncCancel = io_uring_op_IORING_OP_ASYNC_CANCEL,
    Close = io_uring_op_IORING_OP_CLOSE,
    Connect = io_uring_op_IORING_OP_CONNECT,
    EpollCtl = io_uring_op_IORING_OP_EPOLL_CTL,
    FAdvise = io_uring_op_IORING_OP_FADVISE,
    FAllocate = io_uring_op_IORING_OP_FALLOCATE,
    FilesUpdate = io_uring_op_IORING_OP_FILES_UPDATE,
    FSync = io_uring_op_IORING_OP_FSYNC,
    LinkTimeout = io_uring_op_IORING_OP_LINK_TIMEOUT,
    MAdvise = io_uring_op_IORING_OP_MADVISE,
    NOP = io_uring_op_IORING_OP_NOP,
    OpenAt = io_uring_op_IORING_OP_OPENAT,
    OpenAt2 = io_uring_op_IORING_OP_OPENAT2,
    PollAdd = io_uring_op_IORING_OP_POLL_ADD,
    PollRemove = io_uring_op_IORING_OP_POLL_REMOVE,
    Read = io_uring_op_IORING_OP_READ,
    ReadV = io_uring_op_IORING_OP_READV,
    ReadFixed = io_uring_op_IORING_OP_READ_FIXED,
    Recv = io_uring_op_IORING_OP_RECV,
    RecvMsg = io_uring_op_IORING_OP_RECVMSG,
    Send = io_uring_op_IORING_OP_SEND,
    SendMsg = io_uring_op_IORING_OP_SENDMSG,
    StatX = io_uring_op_IORING_OP_STATX,
    SyncFileRange = io_uring_op_IORING_OP_SYNC_FILE_RANGE,
    Timeout = io_uring_op_IORING_OP_TIMEOUT,
    TimeoutRemove = io_uring_op_IORING_OP_TIMEOUT_REMOVE,
    Write = io_uring_op_IORING_OP_WRITE,
    WriteV = io_uring_op_IORING_OP_WRITEV,
    WriteFixed = io_uring_op_IORING_OP_WRITE_FIXED,
}

impl Op {
    fn from_code(opcode: io_uring_op) -> Result<Op, Errno> {
        match opcode {
            io_uring_op_IORING_OP_ACCEPT => Ok(Self::Accept),
            io_uring_op_IORING_OP_ASYNC_CANCEL => Ok(Self::AsyncCancel),
            io_uring_op_IORING_OP_CLOSE => Ok(Self::Close),
            io_uring_op_IORING_OP_CONNECT => Ok(Self::Connect),
            io_uring_op_IORING_OP_EPOLL_CTL => Ok(Self::EpollCtl),
            io_uring_op_IORING_OP_FADVISE => Ok(Self::FAdvise),
            io_uring_op_IORING_OP_FALLOCATE => Ok(Self::FAllocate),
            io_uring_op_IORING_OP_FILES_UPDATE => Ok(Self::FilesUpdate),
            io_uring_op_IORING_OP_FSYNC => Ok(Self::FSync),
            io_uring_op_IORING_OP_LINK_TIMEOUT => Ok(Self::LinkTimeout),
            io_uring_op_IORING_OP_MADVISE => Ok(Self::MAdvise),
            io_uring_op_IORING_OP_NOP => Ok(Self::NOP),
            io_uring_op_IORING_OP_OPENAT => Ok(Self::OpenAt),
            io_uring_op_IORING_OP_OPENAT2 => Ok(Self::OpenAt2),
            io_uring_op_IORING_OP_POLL_ADD => Ok(Self::PollAdd),
            io_uring_op_IORING_OP_POLL_REMOVE => Ok(Self::PollRemove),
            io_uring_op_IORING_OP_READ => Ok(Self::Read),
            io_uring_op_IORING_OP_READV => Ok(Self::ReadV),
            io_uring_op_IORING_OP_READ_FIXED => Ok(Self::ReadFixed),
            io_uring_op_IORING_OP_RECV => Ok(Self::Recv),
            io_uring_op_IORING_OP_RECVMSG => Ok(Self::RecvMsg),
            io_uring_op_IORING_OP_SEND => Ok(Self::Send),
            io_uring_op_IORING_OP_SENDMSG => Ok(Self::SendMsg),
            io_uring_op_IORING_OP_STATX => Ok(Self::StatX),
            io_uring_op_IORING_OP_SYNC_FILE_RANGE => Ok(Self::SyncFileRange),
            io_uring_op_IORING_OP_TIMEOUT => Ok(Self::Timeout),
            io_uring_op_IORING_OP_TIMEOUT_REMOVE => Ok(Self::TimeoutRemove),
            io_uring_op_IORING_OP_WRITE => Ok(Self::Write),
            io_uring_op_IORING_OP_WRITEV => Ok(Self::WriteV),
            io_uring_op_IORING_OP_WRITE_FIXED => Ok(Self::WriteFixed),
            _ => error!(EINVAL),
        }
    }
}

// Currently, we read and write the memory shared with userspace via the VMOs. In the future, we
// will likely want to map the memory for these VMOs into the kernel address space so that we can
// access their contents more efficiently and so that we can perform the appropriate atomic
// operations.

// TODO(https://fxbug.dev/297431387): Map `ring_buffer` and `sq_entries` into kernel memory so that
// this operation becomes memcpy.
fn read_object<T: FromBytes>(memory_object: &MemoryObject, offset: u64) -> Result<T, Errno> {
    // SAFETY: read_uninit returns an error if not all the bytes were read.
    unsafe {
        read_to_object_as_bytes(|buf| {
            memory_object.read_uninit(buf, offset).map_err(|_| errno!(EFAULT))?;
            Ok(())
        })
    }
}

// TODO(https://fxbug.dev/297431387): Map `ring_buffer` and `sq_entries` into kernel memory so that
// this operation becomes memcpy.
fn write_object<T: IntoBytes + Immutable>(
    memory_object: &MemoryObject,
    offset: u64,
    value: &T,
) -> Result<(), Errno> {
    memory_object.write(value.as_bytes(), offset).map_err(|_| errno!(EFAULT))
}

/// The memory the IoUring shares with userspace.
struct IoUringQueue {
    /// Metadata about the layout of this memory.
    metadata: IoUringMetadata,

    /// The primary ring buffer.
    ///
    /// The ring buffer's memory layout is as follows:
    ///
    ///   ControlHeader
    ///   N completion queue entries
    ///   An array of u32 values used to indirect indices to the submission queue entries
    ///
    /// The ControlHeader is a fixed size, which means the completion queue entries always start
    /// at the same offset in this VMO.
    ring_buffer: Arc<MemoryObject>,

    /// A separate VMO for the submission queue entries.
    ///
    /// These entries are not necessarily populated in order. Instead, userspace uses the array of
    /// submission queue indices in the `ring_buffer` in order. That array gives the indices of
    /// the actual submission queue entries.
    ///
    /// IoUring uses this index indirection scheme because submission queue entries do not always
    /// complete in the same order they were submitted.
    sq_entries: Arc<MemoryObject>,
}

impl IoUringQueue {
    fn new(metadata: IoUringMetadata) -> Result<Self, Errno> {
        let ring_buffer =
            zx::Vmo::create(metadata.ring_buffer_size() as u64).map_err(|_| errno!(ENOMEM))?;
        set_zx_name(&ring_buffer, b"io_uring:ring");
        let sq_entries =
            zx::Vmo::create(metadata.sq_entries_size() as u64).map_err(|_| errno!(ENOMEM))?;
        set_zx_name(&sq_entries, b"io_uring:sqes");

        Ok(Self {
            metadata,
            ring_buffer: Arc::new(ring_buffer.into()),
            sq_entries: Arc::new(sq_entries.into()),
        })
    }

    fn write_header(&self, header: ControlHeader) -> Result<(), Errno> {
        write_object(&self.ring_buffer, 0, &header).map_err(|_| errno!(ENOMEM))
    }

    fn read_sq_head(&self) -> Result<u32, Errno> {
        read_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, sq_head) as u64)
    }

    fn write_sq_head(&self, value: u32) -> Result<(), Errno> {
        write_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, sq_head) as u64, &value)
    }

    fn read_sq_tail(&self) -> Result<u32, Errno> {
        // TODO(https://fxbug.dev/297431387): Reading the tail field should be atomic with ordering
        // acquire once we map these buffers into kernel memory.
        read_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, sq_tail) as u64)
    }

    fn read_cq_head(&self) -> Result<u32, Errno> {
        // TODO(https://fxbug.dev/297431387): Reading the head field should be atomic with ordering
        // acquire once we map these buffers into kernel memory.
        read_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, cq_head) as u64)
    }

    fn read_cq_tail(&self) -> Result<u32, Errno> {
        read_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, cq_tail) as u64)
    }

    fn write_cq_tail(&self, value: u32) -> Result<(), Errno> {
        // TODO(https://fxbug.dev/297431387): Writing the tail field should be atomic with ordering
        // release once we map these buffers into kernel memory.
        write_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, cq_tail) as u64, &value)
    }

    fn read_array_entry(&self, index: u32) -> Result<u32, Errno> {
        read_object(&self.ring_buffer, self.metadata.array_entry_offset(index))
    }

    fn read_sq_entry(&self, index: u32) -> Result<SqEntry, Errno> {
        let sqe_index = self.read_array_entry(index)?;
        read_object(&self.sq_entries, self.metadata.sq_entry_offset(sqe_index))
    }

    fn write_cq_entry(&self, index: u32, entry: &CqEntry) -> Result<(), Errno> {
        write_object(&self.ring_buffer, self.metadata.cq_entry_offset(index), entry)
    }

    fn increment_overflow(&self) -> Result<(), Errno> {
        // TODO(https://fxbug.dev/297431387): Incrementing the overflow count should be an atomic
        // operation.
        let offset = std::mem::offset_of!(ControlHeader, cq_overflow) as u64;
        let mut overflow: u32 = read_object(&self.ring_buffer, offset)?;
        overflow = overflow.saturating_add(1);
        write_object(&self.ring_buffer, offset, &overflow)
    }

    /// Pop an entry off the submission queue and update the head to let userspace queue more
    /// entries.
    ///
    /// Returns `None` if the submission queue is empty.
    fn pop_sq_entry(&self) -> Result<Option<SqEntry>, Errno> {
        let tail = self.read_sq_tail()?;
        let head = self.read_sq_head()?;
        if head != tail {
            let sq_entry = self.read_sq_entry(head)?;
            self.write_sq_head(head.wrapping_add(1))?;
            Ok(Some(sq_entry))
        } else {
            Ok(None)
        }
    }

    /// Push an entry onto the completion queue and update the tail to let userspace know a new
    /// entry is available.
    ///
    /// If there is no room in the completion queue, this function will increment the overflow
    /// counter.
    fn push_cq_entry(&self, entry: &CqEntry) -> Result<(), Errno> {
        let head = self.read_cq_head()?;
        let tail = self.read_cq_tail()?;
        // Check that the offset for the tail location doesn't collide with the head of the queue.
        // This can happen because the entries are stored in a ring buffer.
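        // (Equivalently: the queue already holds `cq_entries` items, so the ring is full.)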
        if head != tail
            && self.metadata.cq_entry_offset(tail) == self.metadata.cq_entry_offset(head)
        {
            self.increment_overflow()?;
        } else {
            self.write_cq_entry(tail, entry)?;
            self.write_cq_tail(tail.wrapping_add(1))?;
        }
        Ok(())
    }
}

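/// The `FileOps` implementation backing an io_uring file descriptor.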
pub struct IoUringFileObject {
    queue: IoUringQueue,
    state: OrderedMutex<IoUringFileMutableState, TerminalLock>,
    _flags: IoRingSetupFlags,
}

#[derive(Default, Debug)]
struct IoUringFileMutableState {
    registered_buffers: UserBuffers,
    registered_iobuffers: Vec<IoUringProviderRingBuffer>,
}

impl IoUringFileObject {
    pub fn new_file<L>(
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        entries: u32,
        params: &mut io_uring_params,
    ) -> Result<FileHandle, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let flags = IoRingSetupFlags::build_and_validate_from(params.flags)?;

        let sq_entries = entries.next_power_of_two();
        let cq_entries = if flags.contains(IoRingSetupFlags::CqSize) {
            UserValue::from_raw(params.cq_entries)
                .validate(sq_entries..IORING_MAX_CQ_ENTRIES)
                .ok_or_else(|| errno!(EINVAL))?
                .next_power_of_two()
        } else {
            // This operation cannot overflow because sq_entries is capped at IORING_MAX_ENTRIES,
            // which is only 15 bits.
            sq_entries * 2
        };

        let queue = IoUringQueue::new(IoUringMetadata { sq_entries, cq_entries })?;

        queue.write_header(ControlHeader {
            sq_ring_mask: sq_entries - 1,
            cq_ring_mask: cq_entries - 1,
            sq_ring_entries: sq_entries,
            cq_ring_entries: cq_entries,
            ..Default::default()
        })?;

        params.sq_entries = sq_entries;
        params.cq_entries = cq_entries;
        params.features = IORING_FEAT_SINGLE_MMAP;
        params.sq_off = io_sqring_offsets {
            head: std::mem::offset_of!(ControlHeader, sq_head) as u32,
            tail: std::mem::offset_of!(ControlHeader, sq_tail) as u32,
            ring_mask: std::mem::offset_of!(ControlHeader, sq_ring_mask) as u32,
            ring_entries: std::mem::offset_of!(ControlHeader, sq_ring_entries) as u32,
            flags: std::mem::offset_of!(ControlHeader, sq_flags) as u32,
            dropped: std::mem::offset_of!(ControlHeader, sq_dropped) as u32,
            array: queue.metadata.array_offset() as u32,
            ..Default::default()
        };
        params.cq_off = io_cqring_offsets {
            head: std::mem::offset_of!(ControlHeader, cq_head) as u32,
            tail: std::mem::offset_of!(ControlHeader, cq_tail) as u32,
            ring_mask: std::mem::offset_of!(ControlHeader, cq_ring_mask) as u32,
            ring_entries: std::mem::offset_of!(ControlHeader, cq_ring_entries) as u32,
            overflow: std::mem::offset_of!(ControlHeader, cq_overflow) as u32,
            cqes: queue.metadata.cqes_offset() as u32,
            flags: std::mem::offset_of!(ControlHeader, cq_flags) as u32,
            ..Default::default()
        };

        let object =
            Box::new(IoUringFileObject { queue, state: Default::default(), _flags: flags });
        Anon::new_file(locked, current_task, object, OpenFlags::RDWR, "[io_uring]")
    }


    pub fn register_buffers(&self, locked: &mut Locked<Unlocked>, buffers: UserBuffers) {
        // The docs for io_uring_register imply that the kernel should actually map this memory
        // into its own address space when these buffers are registered. That's probably observable
        // if the client changes the mappings for these addresses between the time they are
        // registered and the time they are used. For now, we just store the addresses.
        self.state.lock(locked).registered_buffers = buffers;
    }

    pub fn unregister_buffers(&self, locked: &mut Locked<Unlocked>) {
        self.state.lock(locked).registered_buffers.clear();
    }

    pub fn register_ring_buffers(
        &self,
        locked: &mut Locked<Unlocked>,
        buffer_definition: uapi::io_uring_buf_reg,
    ) -> Result<(), Errno> {
        track_stub!(
            TODO("https://fxbug.dev/297431387"),
            "IoUringFileObject::register_ring_buffers"
        );
        if !buffer_definition.ring_addr.is_multiple_of(*PAGE_SIZE) {
            return error!(EINVAL);
        }
        if !buffer_definition.ring_entries.is_power_of_two() {
            return error!(EINVAL);
        }
        if buffer_definition.ring_entries > IORING_MAX_ENTRIES {
            return error!(EINVAL);
        }
        self.state
            .lock(locked)
            .registered_iobuffers
            .push(IoUringProviderRingBuffer::new(buffer_definition)?);
        Ok(())
    }

    pub fn unregister_ring_buffers(
        &self,
        locked: &mut Locked<Unlocked>,
        buffer_definition: uapi::io_uring_buf_reg,
    ) -> Result<(), Errno> {
        if self
            .state
            .lock(locked)
            .registered_iobuffers
            .extract_if(.., |buffer| buffer.config.bgid == buffer_definition.bgid)
            .next()
            .is_none()
        {
            return error!(EINVAL);
        }
        Ok(())
    }

    pub fn ring_buffer_status(
        &self,
        locked: &mut Locked<Unlocked>,
        buffer_status: &mut uapi::io_uring_buf_status,
    ) -> Result<(), Errno> {
        let state = self.state.lock(locked);
        let Some(buffer) = state
            .registered_iobuffers
            .iter()
            .find(|buffer| buffer.config.bgid as u32 == buffer_status.buf_group)
        else {
            return error!(EINVAL);
        };
        buffer_status.head = buffer.head as u32;
        Ok(())
    }

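    /// Process up to `to_submit` entries from the submission queue, pushing a completion for
    /// each onto the completion queue.
    ///
    /// Every operation is currently executed synchronously, which is why `min_complete` and
    /// `flags` are ignored: all submitted work has completed by the time this returns.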
    pub fn enter(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        to_submit: u32,
        _min_complete: u32,
        _flags: u32,
    ) -> Result<u32, Errno> {
        let mut submitted = 0;
        while let Some(sq_entry) = self.queue.pop_sq_entry()? {
            submitted += 1;
            // We currently act as if every SqEntry has IOSQE_IO_DRAIN.
            let mut complete_flags: u32 = 0;
            let result = self.execute(locked, current_task, &sq_entry, &mut complete_flags);
            let cq_entry = sq_entry.complete(result, complete_flags);
            self.queue.push_cq_entry(&cq_entry)?;
            if submitted >= to_submit {
                break;
            }
        }
        Ok(submitted)
    }

    fn has_registered_buffers(&self, locked: &mut Locked<Unlocked>) -> bool {
        !self.state.lock(locked).registered_buffers.is_empty()
    }

    fn check_buffer(&self, locked: &mut Locked<Unlocked>, entry: &SqEntry) -> Result<(), Errno> {
        let index = entry.buf_index();
        let state = self.state.lock(locked);
        let buffers = &state.registered_buffers;
        if buffers.is_empty() {
            return error!(EFAULT);
        }
        let buffer = buffers.get(index).ok_or_else(|| errno!(EINVAL))?;
        if !buffer.contains(entry.address(), entry.length()) { error!(EFAULT) } else { Ok(()) }
    }

    fn execute(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        entry: &SqEntry,
        complete_flags: &mut u32,
    ) -> Result<SyscallResult, Errno> {
        assert_eq!(*complete_flags, 0);

        let flags = SqEntryFlags::from_bits(entry.flags).ok_or_else(|| errno!(EINVAL))?;
        match Op::from_code(entry.opcode as io_uring_op)? {
            Op::NOP => Ok(SUCCESS),
            Op::ReadV => {
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if entry.ioprio != 0 || entry.buf_index() != 0 {
                    return error!(EINVAL);
                }
                sys_preadv2(
                    locked,
                    current_task,
                    entry.fd(),
                    entry.iovec_addr(current_task),
                    entry.iovec_count(),
                    entry.offset(),
                    SyscallArg::default(),
                    entry.op_flags,
                )
                .map(Into::into)
            }
            Op::WriteV => {
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if entry.ioprio != 0 || entry.buf_index() != 0 {
                    return error!(EINVAL);
                }
                sys_pwritev2(
                    locked,
                    current_task,
                    entry.fd(),
                    entry.iovec_addr(current_task),
                    entry.iovec_count(),
                    entry.offset(),
                    SyscallArg::default(),
                    entry.op_flags,
                )
                .map(Into::into)
            }
            Op::ReadFixed => {
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if entry.ioprio != 0 {
                    return error!(EINVAL);
                }
                // TODO(https://fxbug.dev/297431387): We're supposed to make a kernel mapping
                // when the buffers are registered and we should be performing this operation using
                // those kernel mappings rather than using the userspace mappings.
                self.check_buffer(locked, entry)?;
                do_read(locked, current_task, entry)
            }
            Op::WriteFixed => {
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if entry.ioprio != 0 {
                    return error!(EINVAL);
                }
                // TODO(https://fxbug.dev/297431387): We're supposed to make a kernel mapping
                // when the buffers are registered and we should be performing this operation using
                // those kernel mappings rather than using the userspace mappings.
                self.check_buffer(locked, entry)?;
                do_write(locked, current_task, entry)
            }
            Op::Read => {
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if self.has_registered_buffers(locked) {
                    return error!(EINVAL);
                }
                do_read(locked, current_task, entry)
            }
            Op::Write => {
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if self.has_registered_buffers(locked) {
                    return error!(EINVAL);
                }
                do_write(locked, current_task, entry)
            }
            Op::SendMsg => {
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if entry.ioprio != 0 {
                    return error!(EINVAL);
                }
                sys_sendmsg(
                    locked,
                    current_task,
                    entry.fd(),
                    MsgHdrPtr::new(current_task, entry.address()),
                    entry.op_flags,
                )
                .map(Into::into)
            }
            Op::RecvMsg => {
                // A struct to hold the information about the provided buffer.
                // This is needed because the buffer is claimed before the call to `recvmsg_impl`
                // but the result is adjusted afterwards.
                struct RecvMsgBufferInfo {
                    buffer: UserBuffer,
                    header: uapi::io_uring_recvmsg_out,
                    buffer_adjustment: usize,
                }
                let mut flags = flags;
                let mut ioprio = entry.ioprio as u32;
                let msg_hdr_ptr = MsgHdrPtr::new(current_task, entry.address());
                let (mut msg_hdr_ref, recv_msg_buffer_info): (
                    MsgHdrRef,
                    Option<RecvMsgBufferInfo>,
                ) = if flags.contains(SqEntryFlags::BUFFER_SELECT) {
                    flags -= SqEntryFlags::BUFFER_SELECT;
                    // If BUFFER_SELECT is set, the application is providing a buffer for the
                    // recvmsg operation.
                    let buffer = self.claim_next_buffer(
                        locked,
                        current_task,
                        entry.group(),
                        complete_flags,
                    )?;
                    let mut msg_hdr = current_task.read_multi_arch_object(msg_hdr_ptr)?;
                    // The buffer is laid out as follows:
                    // - io_uring_recvmsg_out
                    // - sockaddr (name)
                    // - msghdr.msg_control
                    // - payload
                    let headerlen: u32 = std::mem::size_of::<uapi::io_uring_recvmsg_out>() as u32;
                    let namelen: u32 = msg_hdr.name_len.try_into().map_err(|_| errno!(EINVAL))?;
                    let controllen: u32 =
                        msg_hdr.control_len.try_into().map_err(|_| errno!(EINVAL))?;
                    let buffer_adjustment: u32 = headerlen
                        .checked_add(namelen)
                        .and_then(|v| v.checked_add(controllen))
                        .ok_or_else(|| errno!(EINVAL))?;
                    let payloadlen: u32 = (buffer.length as u32)
                        .checked_sub(buffer_adjustment)
                        .ok_or_else(|| errno!(EINVAL))?;
                    let io_uring_hdr = uapi::io_uring_recvmsg_out {
                        namelen,
                        controllen,
                        payloadlen,
                        flags: msg_hdr.flags,
                    };

                    let name_addr = (buffer.address + headerlen as usize)?;
                    let control_addr = (name_addr + namelen as usize)?;
                    let payload_addr = (control_addr + controllen as usize)?;
                    msg_hdr.name = name_addr;
                    msg_hdr.control = control_addr;

                    // Zero out the prefix of the buffer that will contain the header, name and
                    // control bytes.
                    current_task.zero(buffer.address, buffer_adjustment as usize)?;

                    let msg_hdr = WithAlternateBuffer::WithAux(
                        msg_hdr,
                        UserBuffer { address: payload_addr, length: payloadlen as usize },
                    );
                    (
                        msg_hdr.into(),
                        Some(RecvMsgBufferInfo {
                            buffer,
                            header: io_uring_hdr,
                            buffer_adjustment: buffer_adjustment as usize,
                        }),
                    )
                } else {
                    (msg_hdr_ptr.into(), None)
                };
                if ioprio & uapi::IORING_RECV_MULTISHOT > 0 {
                    // Ignoring IORING_RECV_MULTISHOT: because the IORING_CQE_F_MORE flag will
                    // never be set, the client will always have to call the syscall again.
                    ioprio &= !uapi::IORING_RECV_MULTISHOT;
                }
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if ioprio != 0 {
                    return error!(EINVAL);
                }
                let mut count = recvmsg_impl(
                    locked,
                    current_task,
                    entry.fd(),
                    &mut msg_hdr_ref,
                    entry.op_flags,
                )?;
                if let Some(recv_msg_buffer_info) = recv_msg_buffer_info {
                    // The result from `recvmsg_impl` is the number of bytes written to the
                    // payload. The result of the io_uring operation is the number of bytes
                    // written to the provided buffer.
                    // 1. Write the io_uring buffer header.
                    current_task.write_object(
                        recv_msg_buffer_info.buffer.address.into(),
                        &recv_msg_buffer_info.header,
                    )?;
                    // 2. Adjust the written count.
                    count += recv_msg_buffer_info.buffer_adjustment;
                }
                Ok(count.into())
            }
            Op::Send => {
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if entry.ioprio != 0 {
                    return error!(EINVAL);
                }
                sys_sendto(
                    locked,
                    current_task,
                    entry.fd(),
                    entry.address(),
                    entry.length(),
                    entry.op_flags,
                    UserAddress::default(),
                    socklen_t::default(),
                )
                .map(Into::into)
            }
            Op::Recv => {
                if !flags.is_empty() {
                    return error!(EINVAL);
                }
                if entry.ioprio != 0 {
                    return error!(EINVAL);
                }
                sys_recvfrom(
                    locked,
                    current_task,
                    entry.fd(),
                    entry.address(),
                    entry.length(),
                    entry.op_flags,
                    UserAddress::default(),
                    UserRef::default(),
                )
                .map(Into::into)
            }
            Op::FSync
            | Op::PollAdd
            | Op::PollRemove
            | Op::SyncFileRange
            | Op::Timeout
            | Op::TimeoutRemove
            | Op::Accept
            | Op::AsyncCancel
            | Op::LinkTimeout
            | Op::Connect
            | Op::FAllocate
            | Op::OpenAt
            | Op::Close
            | Op::FilesUpdate
            | Op::StatX
            | Op::FAdvise
            | Op::MAdvise
            | Op::OpenAt2
            | Op::EpollCtl => error!(EOPNOTSUPP),
        }
    }

    fn claim_next_buffer(
        &self,
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        bgid: u16,
        complete_flags: &mut u32,
    ) -> Result<UserBuffer, Errno> {
        let mut state = self.state.lock(locked);
        let Some(buffer) =
            state.registered_iobuffers.iter_mut().find(|buffer| buffer.config.bgid == bgid)
        else {
            return error!(EINVAL);
        };
        buffer.claim_next(current_task, complete_flags)
    }
}

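/// A ring of userspace-provided buffers described by `io_uring_buf_reg`.
///
/// The kernel claims entries from this ring when a submission sets `BUFFER_SELECT`, reporting
/// the chosen buffer id back to userspace via `IORING_CQE_F_BUFFER` in the completion flags.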
#[derive(Debug)]
struct IoUringProviderRingBuffer {
    config: uapi::io_uring_buf_reg,
    tail_ptr: UserRef<u16>,
    entries_ptr: UserRef<UserRingBufferEntry>,
    head: u16,
}

impl IoUringProviderRingBuffer {
    fn new(config: uapi::io_uring_buf_reg) -> Result<Self, Errno> {
        let ring_addr = UserAddress::from(config.ring_addr);
        let tail_ptr =
            UserRef::<u16>::from((ring_addr + std::mem::offset_of!(UserRingBufferHeader, tail))?);
        let entries_ptr = UserRef::<UserRingBufferEntry>::from(ring_addr);
        Ok(Self { config, tail_ptr, entries_ptr, head: 0 })
    }

    fn claim_next(
        &mut self,
        current_task: &CurrentTask,
        complete_flags: &mut u32,
    ) -> Result<UserBuffer, Errno> {
        // TODO(https://fxbug.dev/297431387): Reading the tail field should be atomic with ordering
        // acquire.
        let tail = current_task.read_object(self.tail_ptr)?;
        if self.head == tail {
            return error!(ENOBUFS);
        }
        let buffer_info = current_task.read_object(
            self.entries_ptr.at((self.head as usize) % (self.config.ring_entries as usize))?,
        )?;
        // The head index uses the full range of u16 and wraps on overflow, matching the tail
        // index written by userspace.
        self.head = self.head.wrapping_add(1);
        *complete_flags |=
            uapi::IORING_CQE_F_BUFFER | ((buffer_info.bid as u32) << uapi::IORING_CQE_BUFFER_SHIFT);
        Ok(UserBuffer { address: buffer_info.addr.into(), length: buffer_info.len as usize })
    }
}

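// For the Read/Write ops (and their *_FIXED variants), an offset of -1 means "use and update
// the file's current position", so they map to read(2)/write(2); any other offset maps to
// pread64(2)/pwrite64(2).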
fn do_read(
    locked: &mut Locked<Unlocked>,
    current_task: &CurrentTask,
    entry: &SqEntry,
) -> Result<SyscallResult, Errno> {
    let offset = entry.offset();
    if offset == -1 {
        sys_read(locked, current_task, entry.fd(), entry.address(), entry.length()).map(Into::into)
    } else {
        sys_pread64(locked, current_task, entry.fd(), entry.address(), entry.length(), offset)
            .map(Into::into)
    }
}

fn do_write(
    locked: &mut Locked<Unlocked>,
    current_task: &CurrentTask,
    entry: &SqEntry,
) -> Result<SyscallResult, Errno> {
    let offset = entry.offset();
    if offset == -1 {
        sys_write(locked, current_task, entry.fd(), entry.address(), entry.length()).map(Into::into)
    } else {
        sys_pwrite64(locked, current_task, entry.fd(), entry.address(), entry.length(), offset)
            .map(Into::into)
    }
}

impl FileOps for IoUringFileObject {
    fileops_impl_nonseekable!();
    fileops_impl_noop_sync!();
    fileops_impl_dataless!();

    fn mmap(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        addr: DesiredAddress,
        memory_offset: u64,
        length: usize,
        prot_flags: ProtectionFlags,
        options: MappingOptions,
        filename: NamespaceNode,
    ) -> Result<UserAddress, Errno> {
        if !options.contains(MappingOptions::SHARED) {
            return error!(EINVAL);
        }
        let magic_offset: u32 = memory_offset.try_into().map_err(|_| errno!(EINVAL))?;
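        // Because we advertise IORING_FEAT_SINGLE_MMAP, the SQ ring and CQ ring offsets both
        // map the same `ring_buffer` VMO.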
        let memory = match magic_offset {
            IORING_OFF_SQ_RING | IORING_OFF_CQ_RING => self.queue.ring_buffer.clone(),
            IORING_OFF_SQES => self.queue.sq_entries.clone(),
            _ => return error!(EINVAL),
        };
        current_task.mm()?.map_memory(
            addr,
            memory,
            0,
            length,
            prot_flags,
            Access::rwx(),
            options,
            MappingName::File(filename.into_mapping(None)?),
        )
    }
}