1#![allow(non_upper_case_globals)]
6#![allow(clippy::unwrap_in_result)]
8
9use crate::mm::memory::MemoryObject;
10use crate::mm::{
11 DesiredAddress, IOVecPtr, MappingName, MappingOptions, MemoryAccessor, MemoryAccessorExt,
12 PAGE_SIZE, ProtectionFlags, read_to_object_as_bytes,
13};
14use crate::task::CurrentTask;
15use crate::vfs::socket::syscalls::{
16 MsgHdrPtr, MsgHdrRef, WithAlternateBuffer, recvmsg_impl, sys_recvfrom, sys_sendmsg, sys_sendto,
17};
18use crate::vfs::syscalls::{
19 sys_pread64, sys_preadv2, sys_pwrite64, sys_pwritev2, sys_read, sys_write,
20};
21use crate::vfs::{
22 Anon, FdNumber, FileHandle, FileObject, FileOps, NamespaceNode, fileops_impl_dataless,
23 fileops_impl_nonseekable, fileops_impl_noop_sync,
24};
25use bitflags::bitflags;
26use starnix_logging::{set_zx_name, track_stub};
27use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, OrderedMutex, TerminalLock, Unlocked};
28use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
29use starnix_types::user_buffer::{UserBuffer, UserBuffers};
30use starnix_uapi::errors::Errno;
31use starnix_uapi::file_mode::Access;
32use starnix_uapi::open_flags::OpenFlags;
33use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
34use starnix_uapi::user_value::UserValue;
35use starnix_uapi::{
36 IORING_FEAT_SINGLE_MMAP, IORING_OFF_CQ_RING, IORING_OFF_SQ_RING, IORING_OFF_SQES, errno, error,
37 io_cqring_offsets, io_sqring_offsets, io_uring_cqe, io_uring_op, io_uring_op_IORING_OP_ACCEPT,
38 io_uring_op_IORING_OP_ASYNC_CANCEL, io_uring_op_IORING_OP_CLOSE, io_uring_op_IORING_OP_CONNECT,
39 io_uring_op_IORING_OP_EPOLL_CTL, io_uring_op_IORING_OP_FADVISE,
40 io_uring_op_IORING_OP_FALLOCATE, io_uring_op_IORING_OP_FILES_UPDATE,
41 io_uring_op_IORING_OP_FSYNC, io_uring_op_IORING_OP_LINK_TIMEOUT, io_uring_op_IORING_OP_MADVISE,
42 io_uring_op_IORING_OP_NOP, io_uring_op_IORING_OP_OPENAT, io_uring_op_IORING_OP_OPENAT2,
43 io_uring_op_IORING_OP_POLL_ADD, io_uring_op_IORING_OP_POLL_REMOVE, io_uring_op_IORING_OP_READ,
44 io_uring_op_IORING_OP_READ_FIXED, io_uring_op_IORING_OP_READV, io_uring_op_IORING_OP_RECV,
45 io_uring_op_IORING_OP_RECVMSG, io_uring_op_IORING_OP_SEND, io_uring_op_IORING_OP_SENDMSG,
46 io_uring_op_IORING_OP_STATX, io_uring_op_IORING_OP_SYNC_FILE_RANGE,
47 io_uring_op_IORING_OP_TIMEOUT, io_uring_op_IORING_OP_TIMEOUT_REMOVE,
48 io_uring_op_IORING_OP_WRITE, io_uring_op_IORING_OP_WRITE_FIXED, io_uring_op_IORING_OP_WRITEV,
49 io_uring_params, io_uring_sqe, io_uring_sqe_flags_bit_IOSQE_ASYNC_BIT,
50 io_uring_sqe_flags_bit_IOSQE_BUFFER_SELECT_BIT,
51 io_uring_sqe_flags_bit_IOSQE_CQE_SKIP_SUCCESS_BIT, io_uring_sqe_flags_bit_IOSQE_FIXED_FILE_BIT,
52 io_uring_sqe_flags_bit_IOSQE_IO_DRAIN_BIT, io_uring_sqe_flags_bit_IOSQE_IO_HARDLINK_BIT,
53 io_uring_sqe_flags_bit_IOSQE_IO_LINK_BIT, off_t, socklen_t, uapi,
54};
55use std::sync::Arc;
56use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout};
57
/// Maximum number of submission-queue entries a ring may be created with.
pub const IORING_MAX_ENTRIES: u32 = 1 << 15;

/// Maximum number of completion-queue entries: twice the SQ maximum.
const IORING_MAX_CQ_ENTRIES: u32 = 2 * IORING_MAX_ENTRIES;
61
bitflags! {
    /// Flags accepted by `io_uring_setup(2)` (`IORING_SETUP_*`).
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct IoRingSetupFlags: u32 {
        const IoPoll = starnix_uapi::IORING_SETUP_IOPOLL;
        const SqPoll = starnix_uapi::IORING_SETUP_SQPOLL;
        const SqAff = starnix_uapi::IORING_SETUP_SQ_AFF;
        const CqSize = starnix_uapi::IORING_SETUP_CQSIZE;
        const Clamp = starnix_uapi::IORING_SETUP_CLAMP;
        const AttachWq = starnix_uapi::IORING_SETUP_ATTACH_WQ;
        const RDisabled = starnix_uapi::IORING_SETUP_R_DISABLED;
        const SubmitAll = starnix_uapi::IORING_SETUP_SUBMIT_ALL;
        const CoopTaskRun = starnix_uapi::IORING_SETUP_COOP_TASKRUN;
        const TaskRunFlag = starnix_uapi::IORING_SETUP_TASKRUN_FLAG;
        const SqE128 = starnix_uapi::IORING_SETUP_SQE128;
        const CqE32 = starnix_uapi::IORING_SETUP_CQE32;
        const SingleIssuer = starnix_uapi::IORING_SETUP_SINGLE_ISSUER;
        const DeferTaskRun = starnix_uapi::IORING_SETUP_DEFER_TASKRUN;
        const NoMmap = starnix_uapi::IORING_SETUP_NO_MMAP;
        const RegisteredFdOnly = starnix_uapi::IORING_SETUP_REGISTERED_FD_ONLY;
        const NoSqArray = starnix_uapi::IORING_SETUP_NO_SQARRAY;

        /// The set of flags this implementation accepts; anything outside
        /// this set is rejected with EINVAL (see `build_and_validate_from`).
        const SupportedFlags = starnix_uapi::IORING_SETUP_CQSIZE |
            starnix_uapi::IORING_SETUP_COOP_TASKRUN |
            starnix_uapi::IORING_SETUP_TASKRUN_FLAG |
            starnix_uapi::IORING_SETUP_SINGLE_ISSUER |
            starnix_uapi::IORING_SETUP_DEFER_TASKRUN;

        /// Flags that are accepted but currently have no effect; they are
        /// logged via `track_stub!` when seen.
        const IgnoredFlags = starnix_uapi::IORING_SETUP_COOP_TASKRUN |
            starnix_uapi::IORING_SETUP_TASKRUN_FLAG |
            starnix_uapi::IORING_SETUP_SINGLE_ISSUER |
            starnix_uapi::IORING_SETUP_DEFER_TASKRUN;
    }

    /// Per-SQE flags (`IOSQE_*` bits in `io_uring_sqe::flags`).
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    struct SqEntryFlags: u8 {
        const FIXED_FILE = 1 << io_uring_sqe_flags_bit_IOSQE_FIXED_FILE_BIT;
        const IO_DRAIN = 1 << io_uring_sqe_flags_bit_IOSQE_IO_DRAIN_BIT;
        const IO_LINK = 1 << io_uring_sqe_flags_bit_IOSQE_IO_LINK_BIT;
        const IO_HARDLINK = 1 << io_uring_sqe_flags_bit_IOSQE_IO_HARDLINK_BIT;
        const ASYNC = 1 << io_uring_sqe_flags_bit_IOSQE_ASYNC_BIT;
        const BUFFER_SELECT = 1 << io_uring_sqe_flags_bit_IOSQE_BUFFER_SELECT_BIT;
        const CQE_SKIP_SUCCESS = 1 << io_uring_sqe_flags_bit_IOSQE_CQE_SKIP_SUCCESS_BIT;
    }
}
111
112impl IoRingSetupFlags {
113 fn build_and_validate_from(value: u32) -> Result<Self, Errno> {
114 let Some(flags) = IoRingSetupFlags::from_bits(value) else {
115 track_stub!(
116 TODO("https://fxbug.dev/297431387"),
117 "io_uring_setup undefined flag(s)",
118 value
119 );
120 return error!(EINVAL);
121 };
122
123 let unsupported_flags = flags.difference(IoRingSetupFlags::SupportedFlags);
124 if !unsupported_flags.is_empty() {
125 track_stub!(
126 TODO("https://fxbug.dev/297431387"),
127 "io_uring_setup unsupported flags",
128 unsupported_flags.bits()
129 );
130 return error!(EINVAL);
131 }
132 let ignored_flags = flags.intersection(IoRingSetupFlags::IgnoredFlags);
133 if !ignored_flags.is_empty() {
134 track_stub!(
135 TODO("https://fxbug.dev/297431387"),
136 "io_uring_setup ignored flags",
137 ignored_flags.bits()
138 );
139 }
140
141 if flags.contains(IoRingSetupFlags::DeferTaskRun)
143 && !flags.contains(IoRingSetupFlags::SingleIssuer)
144 {
145 return error!(EINVAL);
146 }
147
148 return Ok(flags);
149 }
150}
151
// Type of the free-running SQ/CQ ring indices.
type RingIndex = u32;

// Userspace provided-buffer ring types: the header occupies slot 0 of the
// entry array (the two aliases below share a layout).
type UserRingBufferHeader = uapi::io_uring_buf_ring__bindgen_ty_1__bindgen_ty_1;
type UserRingBufferEntry = uapi::io_uring_buf;

// The tail index must be a u16, and the header must be exactly one entry
// wide, because the header aliases the first ring entry.
static_assertions::const_assert_eq!(
    std::mem::size_of::<u16>(),
    uapi::size_of_field!(UserRingBufferHeader, tail)
);
static_assertions::const_assert_eq!(
    std::mem::size_of::<UserRingBufferHeader>(),
    std::mem::size_of::<UserRingBufferEntry>()
);
165
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, IntoBytes, FromBytes, KnownLayout, Immutable)]
/// Control header shared with userspace at the start of the ring buffer.
/// The `io_sqring_offsets`/`io_cqring_offsets` reported at setup point into
/// these fields. All indices are free-running counters that are masked into
/// the rings.
struct ControlHeader {
    // SQ consumer index; advanced by this file in `pop_sq_entry`.
    sq_head: u32,

    // SQ producer index; only read here (userspace advances it).
    sq_tail: u32,

    // CQ consumer index; only read here (userspace advances it).
    cq_head: u32,

    // CQ producer index; advanced by this file in `push_cq_entry`.
    cq_tail: u32,

    // Masks userspace applies to wrap indices (entries - 1).
    sq_ring_mask: u32,

    cq_ring_mask: u32,

    // Ring sizes; always powers of two (see `new_file`).
    sq_ring_entries: u32,

    cq_ring_entries: u32,

    // Dropped-SQE counter. NOTE(review): never incremented in this file —
    // confirm whether invalid SQ indices should bump it.
    sq_dropped: u32,

    sq_flags: u32,
    cq_flags: u32,

    // Count of completions lost because the CQ ring was full
    // (see `IoUringQueue::increment_overflow`).
    cq_overflow: u32,

    // Pads the header to RING_ALIGNMENT bytes (asserted after the
    // definition of RING_ALIGNMENT).
    _padding: [u8; 16],
}
228
// Alignment used for sub-arrays inside the ring buffer; also the exact size
// of `ControlHeader`, as asserted here.
const RING_ALIGNMENT: usize = 64;

static_assertions::const_assert_eq!(std::mem::size_of::<ControlHeader>(), RING_ALIGNMENT);
233
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, IntoBytes, FromBytes, KnownLayout, Immutable)]
/// Kernel-side view of `io_uring_sqe` with the anonymous unions flattened
/// to plain integers. Layout equivalence with the uapi struct is proven by
/// the `check_arch_independent_same_layout!` invocation that follows.
struct SqEntry {
    opcode: u8,           // IORING_OP_* code.
    flags: u8,            // IOSQE_* bits (see SqEntryFlags).
    ioprio: u16,
    raw_fd: i32,          // Target fd.
    field0: u64,          // Union: used here as the file offset.
    field1: u64,          // Union: used here as a user address / iovec ptr.
    len: u32,             // Byte length or iovec count, per opcode.
    op_flags: u32,        // Per-opcode flags union.
    user_data: u64,       // Opaque value echoed back in the CQE.
    buf_index_or_group: u16, // Fixed-buffer index or buffer-group id.
    personality: u16,
    field2: u32,
    field3: [u64; 2usize],
}
256
// Compile-time proof that `SqEntry` matches `io_uring_sqe` field-for-field
// on every supported architecture.
uapi::check_arch_independent_same_layout! {
    SqEntry = io_uring_sqe {
        opcode => opcode,
        flags => flags,
        ioprio => ioprio,
        raw_fd => fd,
        field0 => __bindgen_anon_1,
        field1 => __bindgen_anon_2,
        len => len,
        op_flags => __bindgen_anon_3,
        user_data => user_data,
        buf_index_or_group => __bindgen_anon_4,
        personality => personality,
        field2 => __bindgen_anon_5,
        field3 => __bindgen_anon_6,
    }
}
274
// `io_uring_recvmsg_out` is written into user buffers (see the RecvMsg
// handling in `execute`), so its layout must not vary by architecture.
uapi::check_arch_independent_layout! {
    io_uring_recvmsg_out{
        namelen,
        controllen,
        payloadlen,
        flags,
    }
}
283
284impl SqEntry {
285 fn complete(&self, result: Result<SyscallResult, Errno>, flags: u32) -> CqEntry {
286 let res = match result {
287 Ok(return_value) => return_value.value() as i32,
288 Err(errno) => errno.return_value() as i32,
289 };
290 CqEntry { user_data: self.user_data, res, flags }
291 }
292
293 fn fd(&self) -> FdNumber {
294 FdNumber::from_raw(self.raw_fd)
295 }
296
297 fn iovec_addr<Arch: ArchSpecific>(&self, arch: &Arch) -> IOVecPtr {
298 IOVecPtr::new(arch, self.field1)
299 }
300
301 fn iovec_count(&self) -> UserValue<i32> {
302 (self.len as i32).into()
303 }
304
305 fn address(&self) -> UserAddress {
306 self.field1.into()
307 }
308
309 fn length(&self) -> usize {
310 self.len as usize
311 }
312
313 fn offset(&self) -> off_t {
314 self.field0 as off_t
315 }
316
317 fn buf_index(&self) -> usize {
318 self.buf_index_or_group as usize
319 }
320
321 fn group(&self) -> u16 {
322 self.buf_index_or_group
323 }
324}
325
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, IntoBytes, FromBytes, KnownLayout, Immutable)]
/// Kernel-side view of `io_uring_cqe`; layout equivalence is asserted just
/// after this definition.
struct CqEntry {
    pub user_data: u64, // Copied from the originating SQE.
    pub res: i32,       // Success value or errno encoding (see SqEntry::complete).
    pub flags: u32,     // IORING_CQE_F_* bits.
}
338
// Compile-time proof that `CqEntry` is layout-identical to the uapi
// `io_uring_cqe` (same size and field offsets).
static_assertions::assert_eq_size!(CqEntry, io_uring_cqe);
static_assertions::const_assert_eq!(
    std::mem::offset_of!(CqEntry, user_data),
    std::mem::offset_of!(io_uring_cqe, user_data)
);
static_assertions::const_assert_eq!(
    std::mem::offset_of!(CqEntry, res),
    std::mem::offset_of!(io_uring_cqe, res)
);
static_assertions::const_assert_eq!(
    std::mem::offset_of!(CqEntry, flags),
    std::mem::offset_of!(io_uring_cqe, flags)
);
352
// The CQE array starts immediately after the control header.
const CQES_OFFSET: usize = std::mem::size_of::<ControlHeader>();

/// Rounds `offset` up to the next multiple of `RING_ALIGNMENT`.
#[inline]
fn align_ring_field(offset: usize) -> usize {
    offset.next_multiple_of(RING_ALIGNMENT)
}
/// Ring geometry; both counts are powers of two (see `new_file`).
struct IoUringMetadata {
    // Number of submission-queue entries.
    sq_entries: u32,

    // Number of completion-queue entries.
    cq_entries: u32,
}
366
367impl IoUringMetadata {
368 fn cq_entry_offset(&self, index: u32) -> u64 {
372 let index = index % self.cq_entries;
373 (CQES_OFFSET + index as usize * std::mem::size_of::<io_uring_cqe>()) as u64
374 }
375
376 fn cqes_offset(&self) -> usize {
378 CQES_OFFSET
379 }
380
381 fn array_offset(&self) -> usize {
383 CQES_OFFSET
384 + align_ring_field(self.cq_entries as usize * std::mem::size_of::<io_uring_cqe>())
385 }
386
387 fn array_entry_offset(&self, index: u32) -> u64 {
390 let index = index % self.sq_entries;
391 (self.array_offset() + index as usize * std::mem::size_of::<RingIndex>()) as u64
392 }
393
394 fn ring_buffer_size(&self) -> usize {
396 self.array_offset() + self.sq_entries as usize * std::mem::size_of::<RingIndex>()
397 }
398
399 fn sq_entry_offset(&self, index: u32) -> u64 {
404 let index = index % self.sq_entries;
405 (index as usize * std::mem::size_of::<io_uring_sqe>()) as u64
406 }
407
408 fn sq_entries_size(&self) -> usize {
410 self.sq_entries as usize * std::mem::size_of::<io_uring_sqe>()
411 }
412}
413
#[repr(u32)]
/// The io_uring opcodes this file recognizes; each discriminant equals the
/// corresponding uapi `IORING_OP_*` constant.
enum Op {
    Accept = io_uring_op_IORING_OP_ACCEPT,
    AsyncCancel = io_uring_op_IORING_OP_ASYNC_CANCEL,
    Close = io_uring_op_IORING_OP_CLOSE,
    Connect = io_uring_op_IORING_OP_CONNECT,
    EpollCtl = io_uring_op_IORING_OP_EPOLL_CTL,
    FAdvise = io_uring_op_IORING_OP_FADVISE,
    FAllocate = io_uring_op_IORING_OP_FALLOCATE,
    FilesUpdate = io_uring_op_IORING_OP_FILES_UPDATE,
    FSync = io_uring_op_IORING_OP_FSYNC,
    LinkTimeout = io_uring_op_IORING_OP_LINK_TIMEOUT,
    MAdvise = io_uring_op_IORING_OP_MADVISE,
    NOP = io_uring_op_IORING_OP_NOP,
    OpenAt = io_uring_op_IORING_OP_OPENAT,
    OpenAt2 = io_uring_op_IORING_OP_OPENAT2,
    PollAdd = io_uring_op_IORING_OP_POLL_ADD,
    PollRemove = io_uring_op_IORING_OP_POLL_REMOVE,
    Read = io_uring_op_IORING_OP_READ,
    ReadV = io_uring_op_IORING_OP_READV,
    ReadFixed = io_uring_op_IORING_OP_READ_FIXED,
    Recv = io_uring_op_IORING_OP_RECV,
    RecvMsg = io_uring_op_IORING_OP_RECVMSG,
    Send = io_uring_op_IORING_OP_SEND,
    SendMsg = io_uring_op_IORING_OP_SENDMSG,
    StatX = io_uring_op_IORING_OP_STATX,
    SyncFileRange = io_uring_op_IORING_OP_SYNC_FILE_RANGE,
    Timeout = io_uring_op_IORING_OP_TIMEOUT,
    TimeoutRemove = io_uring_op_IORING_OP_TIMEOUT_REMOVE,
    Write = io_uring_op_IORING_OP_WRITE,
    WriteV = io_uring_op_IORING_OP_WRITEV,
    WriteFixed = io_uring_op_IORING_OP_WRITE_FIXED,
}
447
impl Op {
    /// Maps a raw opcode from an SQE to `Op`, returning `EINVAL` for any
    /// code not listed in the enum.
    fn from_code(opcode: io_uring_op) -> Result<Op, Errno> {
        match opcode {
            io_uring_op_IORING_OP_ACCEPT => Ok(Self::Accept),
            io_uring_op_IORING_OP_ASYNC_CANCEL => Ok(Self::AsyncCancel),
            io_uring_op_IORING_OP_CLOSE => Ok(Self::Close),
            io_uring_op_IORING_OP_CONNECT => Ok(Self::Connect),
            io_uring_op_IORING_OP_EPOLL_CTL => Ok(Self::EpollCtl),
            io_uring_op_IORING_OP_FADVISE => Ok(Self::FAdvise),
            io_uring_op_IORING_OP_FALLOCATE => Ok(Self::FAllocate),
            io_uring_op_IORING_OP_FILES_UPDATE => Ok(Self::FilesUpdate),
            io_uring_op_IORING_OP_FSYNC => Ok(Self::FSync),
            io_uring_op_IORING_OP_LINK_TIMEOUT => Ok(Self::LinkTimeout),
            io_uring_op_IORING_OP_MADVISE => Ok(Self::MAdvise),
            io_uring_op_IORING_OP_NOP => Ok(Self::NOP),
            io_uring_op_IORING_OP_OPENAT => Ok(Self::OpenAt),
            io_uring_op_IORING_OP_OPENAT2 => Ok(Self::OpenAt2),
            io_uring_op_IORING_OP_POLL_ADD => Ok(Self::PollAdd),
            io_uring_op_IORING_OP_POLL_REMOVE => Ok(Self::PollRemove),
            io_uring_op_IORING_OP_READ => Ok(Self::Read),
            io_uring_op_IORING_OP_READV => Ok(Self::ReadV),
            io_uring_op_IORING_OP_READ_FIXED => Ok(Self::ReadFixed),
            io_uring_op_IORING_OP_RECV => Ok(Self::Recv),
            io_uring_op_IORING_OP_RECVMSG => Ok(Self::RecvMsg),
            io_uring_op_IORING_OP_SEND => Ok(Self::Send),
            io_uring_op_IORING_OP_SENDMSG => Ok(Self::SendMsg),
            io_uring_op_IORING_OP_STATX => Ok(Self::StatX),
            io_uring_op_IORING_OP_SYNC_FILE_RANGE => Ok(Self::SyncFileRange),
            io_uring_op_IORING_OP_TIMEOUT => Ok(Self::Timeout),
            io_uring_op_IORING_OP_TIMEOUT_REMOVE => Ok(Self::TimeoutRemove),
            io_uring_op_IORING_OP_WRITE => Ok(Self::Write),
            io_uring_op_IORING_OP_WRITEV => Ok(Self::WriteV),
            io_uring_op_IORING_OP_WRITE_FIXED => Ok(Self::WriteFixed),
            _ => error!(EINVAL),
        }
    }
}
485
/// Reads a `T` from `memory_object` at `offset`, mapping any VMO read
/// failure to `EFAULT`.
fn read_object<T: FromBytes>(memory_object: &MemoryObject, offset: u64) -> Result<T, Errno> {
    // SAFETY: the closure either fully initializes the buffer via
    // `read_uninit` or returns an error, and `T: FromBytes` accepts any byte
    // pattern. NOTE(review): the exact contract of `read_to_object_as_bytes`
    // is defined in crate::mm — confirm the closure is required to
    // initialize the entire buffer on success.
    unsafe {
        read_to_object_as_bytes(|buf| {
            memory_object.read_uninit(buf, offset).map_err(|_| errno!(EFAULT))?;
            Ok(())
        })
    }
}
502
/// Writes `value`'s bytes to `memory_object` at `offset`, mapping any VMO
/// write failure to `EFAULT`.
fn write_object<T: IntoBytes + Immutable>(
    memory_object: &MemoryObject,
    offset: u64,
    value: &T,
) -> Result<(), Errno> {
    memory_object.write(value.as_bytes(), offset).map_err(|_| errno!(EFAULT))
}
512
/// The shared-memory backing of one io_uring instance.
struct IoUringQueue {
    // Geometry used to compute offsets into the VMOs below.
    metadata: IoUringMetadata,

    // Control header + CQE array + SQ index array; userspace maps this at
    // IORING_OFF_SQ_RING / IORING_OFF_CQ_RING (single-mmap layout).
    ring_buffer: Arc<MemoryObject>,

    // SQE array; userspace maps this at IORING_OFF_SQES.
    sq_entries: Arc<MemoryObject>,
}
540
impl IoUringQueue {
    /// Allocates the two backing VMOs (ring buffer and SQE array) sized from
    /// `metadata`, naming them for diagnostics.
    fn new(metadata: IoUringMetadata) -> Result<Self, Errno> {
        let ring_buffer =
            zx::Vmo::create(metadata.ring_buffer_size() as u64).map_err(|_| errno!(ENOMEM))?;
        set_zx_name(&ring_buffer, b"io_uring:ring");
        let sq_entries =
            zx::Vmo::create(metadata.sq_entries_size() as u64).map_err(|_| errno!(ENOMEM))?;
        set_zx_name(&sq_entries, b"io_uring:sqes");

        Ok(Self {
            metadata,
            ring_buffer: Arc::new(ring_buffer.into()),
            sq_entries: Arc::new(sq_entries.into()),
        })
    }

    /// Writes the initial control header at offset 0 of the ring buffer.
    /// NOTE(review): a failed write is reported as ENOMEM here, unlike the
    /// EFAULT used by the other accessors — confirm intended.
    fn write_header(&self, header: ControlHeader) -> Result<(), Errno> {
        write_object(&self.ring_buffer, 0, &header).map_err(|_| errno!(ENOMEM))
    }

    // The accessors below read/write individual control-header fields
    // through the VMO. All indices are free-running counters; the offset
    // helpers in `IoUringMetadata` wrap them into the rings.
    //
    // NOTE(review): these are plain VMO reads/writes, not atomic operations —
    // confirm this satisfies the memory-ordering expectations of userspace
    // ring consumers/producers.

    fn read_sq_head(&self) -> Result<u32, Errno> {
        read_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, sq_head) as u64)
    }

    fn write_sq_head(&self, value: u32) -> Result<(), Errno> {
        write_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, sq_head) as u64, &value)
    }

    fn read_sq_tail(&self) -> Result<u32, Errno> {
        read_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, sq_tail) as u64)
    }

    fn read_cq_head(&self) -> Result<u32, Errno> {
        read_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, cq_head) as u64)
    }

    fn read_cq_tail(&self) -> Result<u32, Errno> {
        read_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, cq_tail) as u64)
    }

    fn write_cq_tail(&self, value: u32) -> Result<(), Errno> {
        write_object(&self.ring_buffer, std::mem::offset_of!(ControlHeader, cq_tail) as u64, &value)
    }

    /// Reads the SQE index stored in the SQ index array at ring position
    /// `index`.
    fn read_array_entry(&self, index: u32) -> Result<u32, Errno> {
        read_object(&self.ring_buffer, self.metadata.array_entry_offset(index))
    }

    /// Reads the SQE selected by the SQ index array at ring position
    /// `index` (one level of indirection, as in the uapi).
    fn read_sq_entry(&self, index: u32) -> Result<SqEntry, Errno> {
        let sqe_index = self.read_array_entry(index)?;
        read_object(&self.sq_entries, self.metadata.sq_entry_offset(sqe_index))
    }

    fn write_cq_entry(&self, index: u32, entry: &CqEntry) -> Result<(), Errno> {
        write_object(&self.ring_buffer, self.metadata.cq_entry_offset(index), entry)
    }

    /// Bumps the shared `cq_overflow` counter (saturating), recording a
    /// completion that was dropped because the CQ ring was full.
    fn increment_overflow(&self) -> Result<(), Errno> {
        let offset = std::mem::offset_of!(ControlHeader, cq_overflow) as u64;
        let mut overflow: u32 = read_object(&self.ring_buffer, offset)?;
        overflow = overflow.saturating_add(1);
        write_object(&self.ring_buffer, offset, &overflow)
    }

    /// Pops the next pending submission, advancing `sq_head`, or returns
    /// `None` when the SQ is empty (head == tail).
    fn pop_sq_entry(&self) -> Result<Option<SqEntry>, Errno> {
        let tail = self.read_sq_tail()?;
        let head = self.read_sq_head()?;
        if head != tail {
            let sq_entry = self.read_sq_entry(head)?;
            self.write_sq_head(head.wrapping_add(1))?;
            Ok(Some(sq_entry))
        } else {
            Ok(None)
        }
    }

    /// Pushes a completion, advancing `cq_tail`. If the CQ ring is full
    /// (head and tail are unequal counters that map to the same slot), the
    /// entry is dropped and the overflow counter is incremented instead of
    /// overwriting an unread completion.
    fn push_cq_entry(&self, entry: &CqEntry) -> Result<(), Errno> {
        let head = self.read_cq_head()?;
        let tail = self.read_cq_tail()?;
        if head != tail
            && self.metadata.cq_entry_offset(tail) == self.metadata.cq_entry_offset(head)
        {
            self.increment_overflow()?;
        } else {
            self.write_cq_entry(tail, entry)?;
            self.write_cq_tail(tail.wrapping_add(1))?;
        }
        Ok(())
    }
}
650
/// File object backing an io_uring instance created by `io_uring_setup(2)`.
pub struct IoUringFileObject {
    // The shared rings and their backing memory.
    queue: IoUringQueue,
    // Buffer-registration state, guarded by an ordered lock.
    state: OrderedMutex<IoUringFileMutableState, TerminalLock>,
    // Validated setup flags; retained but not consulted after creation.
    _flags: IoRingSetupFlags,
}
656
#[derive(Default, Debug)]
struct IoUringFileMutableState {
    // Fixed buffers installed via `register_buffers`.
    registered_buffers: UserBuffers,
    // Provided-buffer rings installed via `register_ring_buffers`.
    registered_iobuffers: Vec<IoUringProviderRingBuffer>,
}
662
663impl IoUringFileObject {
664 pub fn new_file<L>(
665 locked: &mut Locked<L>,
666 current_task: &CurrentTask,
667 entries: u32,
668 params: &mut io_uring_params,
669 ) -> Result<FileHandle, Errno>
670 where
671 L: LockEqualOrBefore<FileOpsCore>,
672 {
673 let flags = IoRingSetupFlags::build_and_validate_from(params.flags)?;
674
675 let sq_entries = entries.next_power_of_two();
676 let cq_entries = if flags.contains(IoRingSetupFlags::CqSize) {
677 UserValue::from_raw(params.cq_entries)
678 .validate(sq_entries..IORING_MAX_CQ_ENTRIES)
679 .ok_or_else(|| errno!(EINVAL))?
680 .next_power_of_two()
681 } else {
682 sq_entries * 2
685 };
686
687 let queue =
688 IoUringQueue::new(IoUringMetadata { sq_entries: sq_entries, cq_entries: cq_entries })?;
689
690 queue.write_header(ControlHeader {
691 sq_ring_mask: sq_entries - 1,
692 cq_ring_mask: cq_entries - 1,
693 sq_ring_entries: sq_entries,
694 cq_ring_entries: cq_entries,
695 ..Default::default()
696 })?;
697
698 params.sq_entries = sq_entries;
699 params.cq_entries = cq_entries;
700 params.features = IORING_FEAT_SINGLE_MMAP;
701 params.sq_off = io_sqring_offsets {
702 head: std::mem::offset_of!(ControlHeader, sq_head) as u32,
703 tail: std::mem::offset_of!(ControlHeader, sq_tail) as u32,
704 ring_mask: std::mem::offset_of!(ControlHeader, sq_ring_mask) as u32,
705 ring_entries: std::mem::offset_of!(ControlHeader, sq_ring_entries) as u32,
706 flags: std::mem::offset_of!(ControlHeader, sq_flags) as u32,
707 dropped: std::mem::offset_of!(ControlHeader, sq_dropped) as u32,
708 array: queue.metadata.array_offset() as u32,
709 ..Default::default()
710 };
711 params.cq_off = io_cqring_offsets {
712 head: std::mem::offset_of!(ControlHeader, cq_head) as u32,
713 tail: std::mem::offset_of!(ControlHeader, cq_tail) as u32,
714 ring_mask: std::mem::offset_of!(ControlHeader, cq_ring_mask) as u32,
715 ring_entries: std::mem::offset_of!(ControlHeader, cq_ring_entries) as u32,
716 overflow: std::mem::offset_of!(ControlHeader, cq_overflow) as u32,
717 cqes: queue.metadata.cqes_offset() as u32,
718 flags: std::mem::offset_of!(ControlHeader, cq_flags) as u32,
719 ..Default::default()
720 };
721
722 let object =
723 Box::new(IoUringFileObject { queue, state: Default::default(), _flags: flags });
724 Anon::new_file(locked, current_task, object, OpenFlags::RDWR, "[io_uring]")
725 }
726
727 pub fn register_buffers(&self, locked: &mut Locked<Unlocked>, buffers: UserBuffers) {
728 self.state.lock(locked).registered_buffers = buffers;
733 }
734
735 pub fn unregister_buffers(&self, locked: &mut Locked<Unlocked>) {
736 self.state.lock(locked).registered_buffers.clear();
737 }
738
739 pub fn register_ring_buffers(
740 &self,
741 locked: &mut Locked<Unlocked>,
742 buffer_definition: uapi::io_uring_buf_reg,
743 ) -> Result<(), Errno> {
744 track_stub!(
745 TODO("https://fxbug.dev/297431387"),
746 "IoUringFileObject::register_ring_buffers"
747 );
748 if !buffer_definition.ring_addr.is_multiple_of(*PAGE_SIZE) {
749 return error!(EINVAL);
750 }
751 if !buffer_definition.ring_entries.is_power_of_two() {
752 return error!(EINVAL);
753 }
754 if buffer_definition.ring_entries > IORING_MAX_ENTRIES {
755 return error!(EINVAL);
756 }
757 self.state
758 .lock(locked)
759 .registered_iobuffers
760 .push(IoUringProviderRingBuffer::new(buffer_definition)?);
761 Ok(())
762 }
763
764 pub fn unregister_ring_buffers(
765 &self,
766 locked: &mut Locked<Unlocked>,
767 buffer_definition: uapi::io_uring_buf_reg,
768 ) -> Result<(), Errno> {
769 if self
770 .state
771 .lock(locked)
772 .registered_iobuffers
773 .extract_if(.., |buffer| buffer.config.bgid == buffer_definition.bgid)
774 .next()
775 .is_none()
776 {
777 return error!(EINVAL);
778 }
779 Ok(())
780 }
781
782 pub fn ring_buffer_status(
783 &self,
784 locked: &mut Locked<Unlocked>,
785 buffer_status: &mut uapi::io_uring_buf_status,
786 ) -> Result<(), Errno> {
787 let state = self.state.lock(locked);
788 let Some(buffer) = state
789 .registered_iobuffers
790 .iter()
791 .find(|buffer| buffer.config.bgid as u32 == buffer_status.buf_group)
792 else {
793 return error!(EINVAL);
794 };
795 buffer_status.head = buffer.head as u32;
796 Ok(())
797 }
798
799 pub fn enter(
800 &self,
801 locked: &mut Locked<Unlocked>,
802 current_task: &CurrentTask,
803 to_submit: u32,
804 _min_complete: u32,
805 _flags: u32,
806 ) -> Result<u32, Errno> {
807 let mut submitted = 0;
808 while let Some(sq_entry) = self.queue.pop_sq_entry()? {
809 submitted += 1;
810 let mut complete_flags: u32 = 0;
812 let result = self.execute(locked, current_task, &sq_entry, &mut complete_flags);
813 let cq_entry = sq_entry.complete(result, complete_flags);
814 self.queue.push_cq_entry(&cq_entry)?;
815 if submitted >= to_submit {
816 break;
817 }
818 }
819 Ok(submitted)
820 }
821
822 fn has_registered_buffers(&self, locked: &mut Locked<Unlocked>) -> bool {
823 !self.state.lock(locked).registered_buffers.is_empty()
824 }
825
826 fn check_buffer(&self, locked: &mut Locked<Unlocked>, entry: &SqEntry) -> Result<(), Errno> {
827 let index = entry.buf_index();
828 let state = self.state.lock(locked);
829 let buffers = &state.registered_buffers;
830 if buffers.is_empty() {
831 return error!(EFAULT);
832 }
833 let buffer = buffers.get(index).ok_or_else(|| errno!(EINVAL))?;
834 if !buffer.contains(entry.address(), entry.length()) { error!(EFAULT) } else { Ok(()) }
835 }
836
837 fn execute(
838 &self,
839 locked: &mut Locked<Unlocked>,
840 current_task: &CurrentTask,
841 entry: &SqEntry,
842 complete_flags: &mut u32,
843 ) -> Result<SyscallResult, Errno> {
844 assert_eq!(*complete_flags, 0);
845
846 let flags = SqEntryFlags::from_bits(entry.flags).ok_or_else(|| errno!(EINVAL))?;
847 match Op::from_code(entry.opcode as io_uring_op)? {
848 Op::NOP => Ok(SUCCESS),
849 Op::ReadV => {
850 if !flags.is_empty() {
851 return error!(EINVAL);
852 }
853 if entry.ioprio != 0 || entry.buf_index() != 0 {
854 return error!(EINVAL);
855 }
856 sys_preadv2(
857 locked,
858 current_task,
859 entry.fd(),
860 entry.iovec_addr(current_task),
861 entry.iovec_count(),
862 entry.offset(),
863 SyscallArg::default(),
864 entry.op_flags,
865 )
866 .map(Into::into)
867 }
868 Op::WriteV => {
869 if !flags.is_empty() {
870 return error!(EINVAL);
871 }
872 if entry.ioprio != 0 || entry.buf_index() != 0 {
873 return error!(EINVAL);
874 }
875 sys_pwritev2(
876 locked,
877 current_task,
878 entry.fd(),
879 entry.iovec_addr(current_task),
880 entry.iovec_count(),
881 entry.offset(),
882 SyscallArg::default(),
883 entry.op_flags,
884 )
885 .map(Into::into)
886 }
887 Op::ReadFixed => {
888 if !flags.is_empty() {
889 return error!(EINVAL);
890 }
891 if entry.ioprio != 0 {
892 return error!(EINVAL);
893 }
894 self.check_buffer(locked, entry)?;
898 do_read(locked, current_task, entry)
899 }
900 Op::WriteFixed => {
901 if !flags.is_empty() {
902 return error!(EINVAL);
903 }
904 if entry.ioprio != 0 {
905 return error!(EINVAL);
906 }
907 self.check_buffer(locked, entry)?;
911 do_write(locked, current_task, entry)
912 }
913 Op::Read => {
914 if !flags.is_empty() {
915 return error!(EINVAL);
916 }
917 if self.has_registered_buffers(locked) {
918 return error!(EINVAL);
919 }
920 do_read(locked, current_task, entry)
921 }
922 Op::Write => {
923 if !flags.is_empty() {
924 return error!(EINVAL);
925 }
926 if self.has_registered_buffers(locked) {
927 return error!(EINVAL);
928 }
929 do_write(locked, current_task, entry)
930 }
931 Op::SendMsg => {
932 if !flags.is_empty() {
933 return error!(EINVAL);
934 }
935 if entry.ioprio != 0 {
936 return error!(EINVAL);
937 }
938 sys_sendmsg(
939 locked,
940 current_task,
941 entry.fd(),
942 MsgHdrPtr::new(current_task, entry.address()),
943 entry.op_flags,
944 )
945 .map(Into::into)
946 }
947 Op::RecvMsg => {
948 struct RecvMsgBufferInfo {
952 buffer: UserBuffer,
953 header: uapi::io_uring_recvmsg_out,
954 buffer_adjustment: usize,
955 }
956 let mut flags = flags;
957 let mut ioprio = entry.ioprio as u32;
958 let msg_hdr_ptr = MsgHdrPtr::new(current_task, entry.address());
959 let (mut msg_hdr_ref, recv_msg_buffer_info): (
960 MsgHdrRef,
961 Option<RecvMsgBufferInfo>,
962 ) = if flags.contains(SqEntryFlags::BUFFER_SELECT) {
963 flags -= SqEntryFlags::BUFFER_SELECT;
964 let buffer = self.claim_next_buffer(
967 locked,
968 current_task,
969 entry.group(),
970 complete_flags,
971 )?;
972 let mut msg_hdr = current_task.read_multi_arch_object(msg_hdr_ptr)?;
973 let headerlen: u32 = std::mem::size_of::<uapi::io_uring_recvmsg_out>() as u32;
979 let namelen: u32 = msg_hdr.name_len.try_into().map_err(|_| errno!(EINVAL))?;
980 let controllen: u32 =
981 msg_hdr.control_len.try_into().map_err(|_| errno!(EINVAL))?;
982 let buffer_adjustment: u32 = headerlen
983 .checked_add(namelen)
984 .and_then(|v| v.checked_add(controllen))
985 .ok_or_else(|| errno!(EINVAL))?;
986 let payloadlen: u32 = (buffer.length as u32)
987 .checked_sub(buffer_adjustment)
988 .ok_or_else(|| errno!(EINVAL))?;
989 let io_uring_hdr = uapi::io_uring_recvmsg_out {
990 namelen,
991 controllen,
992 payloadlen,
993 flags: msg_hdr.flags,
994 };
995
996 let name_addr = (buffer.address + headerlen as usize)?;
997 let control_addr = (name_addr + namelen as usize)?;
998 let payload_addr = (control_addr + controllen as usize)?;
999 msg_hdr.name = name_addr;
1000 msg_hdr.control = control_addr;
1001
1002 current_task.zero(buffer.address, buffer_adjustment as usize)?;
1005
1006 let msg_hdr = WithAlternateBuffer::WithAux(
1007 msg_hdr,
1008 UserBuffer { address: payload_addr, length: payloadlen as usize },
1009 );
1010 (
1011 msg_hdr.into(),
1012 Some(RecvMsgBufferInfo {
1013 buffer,
1014 header: io_uring_hdr,
1015 buffer_adjustment: buffer_adjustment as usize,
1016 }),
1017 )
1018 } else {
1019 (msg_hdr_ptr.into(), None)
1020 };
1021 if ioprio & uapi::IORING_RECV_MULTISHOT > 0 {
1022 ioprio &= !uapi::IORING_RECV_MULTISHOT;
1026 }
1027 if !flags.is_empty() {
1028 return error!(EINVAL);
1029 }
1030 if ioprio != 0 {
1031 return error!(EINVAL);
1032 }
1033 let mut count = recvmsg_impl(
1034 locked,
1035 current_task,
1036 entry.fd(),
1037 &mut msg_hdr_ref,
1038 entry.op_flags,
1039 )?;
1040 if let Some(recv_msg_buffer_info) = recv_msg_buffer_info {
1041 current_task.write_object(
1046 recv_msg_buffer_info.buffer.address.into(),
1047 &recv_msg_buffer_info.header,
1048 )?;
1049 count += recv_msg_buffer_info.buffer_adjustment;
1051 }
1052 Ok(count.into())
1053 }
1054 Op::Send => {
1055 if !flags.is_empty() {
1056 return error!(EINVAL);
1057 }
1058 if entry.ioprio != 0 {
1059 return error!(EINVAL);
1060 }
1061 sys_sendto(
1062 locked,
1063 current_task,
1064 entry.fd(),
1065 entry.address(),
1066 entry.length(),
1067 entry.op_flags,
1068 UserAddress::default(),
1069 socklen_t::default(),
1070 )
1071 .map(Into::into)
1072 }
1073 Op::Recv => {
1074 if !flags.is_empty() {
1075 return error!(EINVAL);
1076 }
1077 if entry.ioprio != 0 {
1078 return error!(EINVAL);
1079 }
1080 sys_recvfrom(
1081 locked,
1082 current_task,
1083 entry.fd(),
1084 entry.address(),
1085 entry.length(),
1086 entry.op_flags,
1087 UserAddress::default(),
1088 UserRef::default(),
1089 )
1090 .map(Into::into)
1091 }
1092 Op::FSync
1093 | Op::PollAdd
1094 | Op::PollRemove
1095 | Op::SyncFileRange
1096 | Op::Timeout
1097 | Op::TimeoutRemove
1098 | Op::Accept
1099 | Op::AsyncCancel
1100 | Op::LinkTimeout
1101 | Op::Connect
1102 | Op::FAllocate
1103 | Op::OpenAt
1104 | Op::Close
1105 | Op::FilesUpdate
1106 | Op::StatX
1107 | Op::FAdvise
1108 | Op::MAdvise
1109 | Op::OpenAt2
1110 | Op::EpollCtl => error!(EOPNOTSUPP),
1111 }
1112 }
1113
1114 fn claim_next_buffer(
1115 &self,
1116 locked: &mut Locked<Unlocked>,
1117 current_task: &CurrentTask,
1118 bgid: u16,
1119 complete_flags: &mut u32,
1120 ) -> Result<UserBuffer, Errno> {
1121 let mut state = self.state.lock(locked);
1122 let Some(buffer) =
1123 state.registered_iobuffers.iter_mut().find(|buffer| buffer.config.bgid == bgid)
1124 else {
1125 return error!(EINVAL);
1126 };
1127 buffer.claim_next(current_task, complete_flags)
1128 }
1129}
1130
#[derive(Debug)]
/// Kernel-side handle to a userspace provided-buffer ring.
struct IoUringProviderRingBuffer {
    // Registration parameters supplied by userspace.
    config: uapi::io_uring_buf_reg,
    // Pointer to the ring's tail index in user memory.
    tail_ptr: UserRef<u16>,
    // Pointer to the first ring entry; slot 0 aliases the header.
    entries_ptr: UserRef<UserRingBufferEntry>,
    // Consumer index, owned by the kernel side.
    head: u16,
}
1138
1139impl IoUringProviderRingBuffer {
1140 fn new(config: uapi::io_uring_buf_reg) -> Result<Self, Errno> {
1141 let ring_addr = UserAddress::from(config.ring_addr);
1142 let tail_ptr =
1143 UserRef::<u16>::from((ring_addr + std::mem::offset_of!(UserRingBufferHeader, tail))?);
1144 let entries_ptr = UserRef::<UserRingBufferEntry>::from(ring_addr);
1145 Ok(Self { config, tail_ptr, entries_ptr, head: 0 })
1146 }
1147
1148 fn claim_next(
1149 &mut self,
1150 current_task: &CurrentTask,
1151 complete_flags: &mut u32,
1152 ) -> Result<UserBuffer, Errno> {
1153 let tail = current_task.read_object(self.tail_ptr)?;
1156 if self.head == tail {
1157 return error!(ENOBUFS);
1158 }
1159 let buffer_info = current_task.read_object(
1160 self.entries_ptr.at((self.head as usize) % (self.config.ring_entries as usize))?,
1161 )?;
1162 self.head += 1;
1163 *complete_flags |=
1164 uapi::IORING_CQE_F_BUFFER | ((buffer_info.bid as u32) << uapi::IORING_CQE_BUFFER_SHIFT);
1165 Ok(UserBuffer { address: buffer_info.addr.into(), length: buffer_info.len as usize })
1166 }
1167}
1168
1169fn do_read(
1170 locked: &mut Locked<Unlocked>,
1171 current_task: &CurrentTask,
1172 entry: &SqEntry,
1173) -> Result<SyscallResult, Errno> {
1174 let offset = entry.offset();
1175 if offset == -1 {
1176 sys_read(locked, current_task, entry.fd(), entry.address(), entry.length()).map(Into::into)
1177 } else {
1178 sys_pread64(locked, current_task, entry.fd(), entry.address(), entry.length(), offset)
1179 .map(Into::into)
1180 }
1181}
1182
1183fn do_write(
1184 locked: &mut Locked<Unlocked>,
1185 current_task: &CurrentTask,
1186 entry: &SqEntry,
1187) -> Result<SyscallResult, Errno> {
1188 let offset = entry.offset();
1189 if offset == -1 {
1190 sys_write(locked, current_task, entry.fd(), entry.address(), entry.length()).map(Into::into)
1191 } else {
1192 sys_pwrite64(
1193 locked,
1194 current_task,
1195 entry.fd(),
1196 entry.address(),
1197 entry.length(),
1198 entry.offset(),
1199 )
1200 .map(Into::into)
1201 }
1202}
1203
impl FileOps for IoUringFileObject {
    fileops_impl_nonseekable!();
    fileops_impl_noop_sync!();
    fileops_impl_dataless!();

    /// Maps one of the ring VMOs into the calling task.
    ///
    /// `memory_offset` is a magic selector (IORING_OFF_*), not a byte
    /// offset; SQ and CQ rings share a single VMO, matching the
    /// IORING_FEAT_SINGLE_MMAP feature advertised at setup.
    fn mmap(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        addr: DesiredAddress,
        memory_offset: u64,
        length: usize,
        prot_flags: ProtectionFlags,
        options: MappingOptions,
        filename: NamespaceNode,
    ) -> Result<UserAddress, Errno> {
        // io_uring mappings must be MAP_SHARED.
        if !options.contains(MappingOptions::SHARED) {
            return error!(EINVAL);
        }
        let magic_offset: u32 = memory_offset.try_into().map_err(|_| errno!(EINVAL))?;
        let memory = match magic_offset {
            IORING_OFF_SQ_RING | IORING_OFF_CQ_RING => self.queue.ring_buffer.clone(),
            IORING_OFF_SQES => self.queue.sq_entries.clone(),
            _ => return error!(EINVAL),
        };
        // The VMO is always mapped from offset 0. NOTE(review): `length` is
        // not clamped to the VMO size here — confirm map_memory validates it.
        current_task.mm()?.map_memory(
            addr,
            memory,
            0,
            length,
            prot_flags,
            Access::rwx(),
            options,
            MappingName::File(filename.into_mapping(None)?),
        )
    }
}
1241}