// starnix_syscall_loop/lib.rs

// Copyright 2025 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5use anyhow::{Error, format_err};
6use extended_pstate::ExtendedPstatePointer;
7use starnix_core::arch::execution::new_syscall;
8use starnix_core::ptrace::{StopState, ptrace_syscall_enter, ptrace_syscall_exit};
9use starnix_core::signals::{
10    SignalInfo, deliver_signal, dequeue_signal, prepare_to_restart_syscall,
11};
12use starnix_core::task::{CurrentTask, ExceptionResult, ExitStatus, SeccompStateValue, TaskFlags};
13use starnix_logging::{
14    CATEGORY_STARNIX, NAME_HANDLE_EXCEPTION, NAME_RESTRICTED_KICK, NAME_RUN_TASK, log_error,
15    log_syscall, log_trace, log_warn, set_current_task_info,
16};
17use starnix_registers::RestrictedState;
18use starnix_sync::{Locked, Unlocked};
19use starnix_syscalls::SyscallResult;
20use starnix_syscalls::decls::{Syscall, SyscallDecl};
21use starnix_uapi::errno;
22use starnix_uapi::errors::Errno;
23use starnix_uapi::signals::SIGKILL;
24use zerocopy::FromZeros;
25
26mod table;
27
28pub fn enter(locked: &mut Locked<Unlocked>, current_task: &mut CurrentTask) -> ExitStatus {
29    // Zircon will populate this report on restricted exception exits. Initialize it to all zero
30    // since we're just reserving storage.
31    let mut exception_report = zx::sys::zx_exception_report_t::new_zeroed();
32    match RestrictedState::bind_and_map(
33        &mut current_task.thread_state.registers,
34        &mut exception_report,
35    ) {
36        Ok(restricted_state) => {
37            match run_task(
38                locked,
39                current_task,
40                restricted_state.bound_state.as_ptr(),
41                &exception_report,
42            ) {
43                Ok(ok) => ok,
44                Err(error) => {
45                    log_warn!("Died unexpectedly from {error:?}! treating as SIGKILL");
46                    ExitStatus::Kill(SignalInfo::kernel(SIGKILL))
47                }
48            }
49        }
50        Err(error) => {
51            log_error!("failed to map mode state vmo, {error:?}! treating as SIGKILL");
52            ExitStatus::Kill(SignalInfo::kernel(SIGKILL))
53        }
54    }
55}
56
57type RestrictedExitCallback = extern "C" fn(
58    *mut RestrictedEnterContext<'_>,
59    zx::sys::zx_restricted_reason_t,
60    *mut ExtendedPstatePointer,
61) -> bool;
62
63unsafe extern "C" {
64    // rustc doesn't like RestrictedEnterContext for FFI but we're just passing it back to
65    // ourselves with extra steps.
66    #[allow(improper_ctypes)]
67    fn restricted_enter_loop(
68        options: u32,
69        restricted_exit_callback: RestrictedExitCallback,
70        restricted_exit_callback_context: *mut RestrictedEnterContext<'_>,
71        restricted_state: *mut zx::sys::zx_restricted_state_t,
72        extended_pstate_ptr_ptr: *mut ExtendedPstatePointer,
73    ) -> zx::sys::zx_status_t;
74}
75
/// Value passed as the `options` argument of `restricted_enter_loop`; no option bits are set.
const RESTRICTED_ENTER_OPTIONS: u32 = 0;
77
78struct RestrictedEnterContext<'a> {
79    current_task: &'a mut CurrentTask,
80    error_context: Option<ErrorContext>,
81    exit_status: Result<ExitStatus, Error>,
82    exception_report_raw: *const zx::sys::zx_exception_report_t,
83}
84
85/// Runs the `current_task` to completion.
86///
87/// The high-level flow of this function looks as follows:
88///
89///   1. Write the restricted state for the current thread to set it up to enter into the restricted
90///      (Linux) part of the address space.
91///   2. Enter restricted mode.
92///   3. Return from restricted mode, reading out the new state of the restricted mode execution.
93///      This state contains the thread's restricted register state, which is used to determine
94///      which system call to dispatch.
95///   4. Dispatch the system call.
96///   5. Handle pending signals.
97///   6. Goto 1.
98fn run_task(
99    locked: &mut Locked<Unlocked>,
100    current_task: &mut CurrentTask,
101    restricted_state_ptr: *mut zx::sys::zx_restricted_state_t,
102    exception_report_raw: *const zx::sys::zx_exception_report_t,
103) -> Result<ExitStatus, Error> {
104    set_current_task_info(
105        current_task.task.command(),
106        current_task.task.thread_group().read().leader_command(),
107        current_task.task.thread_group().leader,
108        current_task.tid,
109    );
110
111    fuchsia_trace::duration!(CATEGORY_STARNIX, NAME_RUN_TASK);
112
113    // This tracks the last failing system call for debugging purposes.
114    let error_context = None;
115
116    // We need to check for exit once, before the task starts executing, in case
117    // the task has already been sent a signal that will cause it to exit.
118    if let Some(exit_status) =
119        process_completed_restricted_exit(locked, current_task, &error_context)?
120    {
121        return Ok(exit_status);
122    }
123
124    // This extended pstate pointer points to the storage for extended processor
125    // state (vector and FP registers).
126    let mut extended_pstate_ptr = current_task.thread_state.extended_pstate.as_ptr();
127
128    let mut restricted_enter_context = RestrictedEnterContext {
129        current_task,
130        error_context,
131        exit_status: Err(errno!(ENOEXEC).into()),
132        exception_report_raw,
133    };
134
135    #[allow(
136        clippy::undocumented_unsafe_blocks,
137        reason = "Force documented unsafe blocks in Starnix"
138    )]
139    let restricted_enter_status = zx::Status::from_raw(unsafe {
140        restricted_enter_loop(
141            RESTRICTED_ENTER_OPTIONS,
142            restricted_exit_callback_c,
143            &mut restricted_enter_context,
144            restricted_state_ptr,
145            &raw mut extended_pstate_ptr,
146        )
147    });
148    if restricted_enter_status != zx::Status::OK {
149        // If restricted_enter_loop failed, it means that we failed to satisfy
150        // a prerequisite of zx_restricted_enter which should never happen.
151        log_error!(
152            "restricted_enter_loop failed: {}, register state: {:?}",
153            restricted_enter_status,
154            restricted_enter_context.current_task.thread_state.registers
155        );
156    }
157    restricted_enter_context.exit_status
158}
159
160extern "C" fn restricted_exit_callback_c(
161    context: *mut RestrictedEnterContext<'_>,
162    reason_code: zx::sys::zx_restricted_reason_t,
163    extended_pstate_ptr_ptr: *mut ExtendedPstatePointer,
164) -> bool {
165    // SAFETY:
166    // `context` is a pointer to a `RestrictedEnterContext` that was passed to
167    // `restricted_enter_loop`.
168    //  `extended_pstate_ptr` is a pointer to the ExtendedPstatePointer instance
169    //  that was passed to `restricted_enter_loop.`
170    // Our restricted return assembly and Zircon together guarantee that this
171    // thread has exclusive access to these variables.
172    let (restricted_context, extended_pstate_ptr) =
173        unsafe { (&mut *context, extended_pstate_ptr_ptr.as_mut_unchecked()) };
174    restricted_exit_callback(
175        reason_code,
176        restricted_context.current_task,
177        &mut restricted_context.error_context,
178        &mut restricted_context.exit_status,
179        extended_pstate_ptr,
180        restricted_context.exception_report_raw,
181    )
182}
183
184fn restricted_exit_callback(
185    reason_code: zx::sys::zx_restricted_reason_t,
186    current_task: &mut CurrentTask,
187    error_context: &mut Option<ErrorContext>,
188    exit_status: &mut Result<ExitStatus, Error>,
189    extended_pstate_ptr: &mut ExtendedPstatePointer,
190    exception_report_raw: *const zx::sys::zx_exception_report_t,
191) -> bool {
192    debug_assert_eq!(
193        current_task.thread_state.restart_code, None,
194        "restart_code should only ever be Some() in normal mode",
195    );
196
197    let ret = match process_restricted_exit(
198        reason_code,
199        current_task,
200        error_context,
201        exception_report_raw,
202    ) {
203        Ok(None) => {
204            // Keep going!
205
206            *extended_pstate_ptr = current_task.thread_state.extended_pstate.as_ptr();
207
208            true
209        }
210        Ok(Some(completed_exit_status)) => {
211            *exit_status = Ok(completed_exit_status);
212            false
213        }
214        Err(error) => {
215            *exit_status = Err(error);
216            false
217        }
218    };
219
220    debug_assert_eq!(
221        current_task.thread_state.restart_code, None,
222        "restart_code should only ever be Some() in normal mode",
223    );
224
225    ret
226}
227
228fn process_restricted_exit(
229    reason_code: zx::sys::zx_restricted_reason_t,
230    current_task: &mut CurrentTask,
231    error_context: &mut Option<ErrorContext>,
232    exception_report_raw: *const zx::sys::zx_exception_report_t,
233) -> Result<Option<ExitStatus>, Error> {
234    // We can't hold any locks entering restricted mode so we can't be holding any locks on exit.
235    #[allow(
236        clippy::undocumented_unsafe_blocks,
237        reason = "Force documented unsafe blocks in Starnix"
238    )]
239    let locked = unsafe { Unlocked::new() };
240
241    current_task.thread_state.registers.sync_stack_ptr();
242
243    match reason_code {
244        zx::sys::ZX_RESTRICTED_REASON_SYSCALL => {
245            let syscall_decl = SyscallDecl::from_number(
246                current_task.thread_state.registers.syscall_register(),
247                current_task.thread_state.arch_width(),
248            );
249
250            if let Some(new_error_context) = execute_syscall(locked, current_task, syscall_decl) {
251                *error_context = Some(new_error_context);
252            }
253        }
254        zx::sys::ZX_RESTRICTED_REASON_EXCEPTION => {
255            fuchsia_trace::duration!(CATEGORY_STARNIX, NAME_HANDLE_EXCEPTION);
256            // SAFETY: `exception_report_raw` was written by Zircon during this restricted exit.
257            let exception_report = unsafe { zx::ExceptionReport::from_raw(*exception_report_raw) };
258            let exception_result = current_task.process_exception(locked, &exception_report);
259            process_completed_exception(locked, current_task, exception_result, exception_report);
260        }
261        zx::sys::ZX_RESTRICTED_REASON_KICK => {
262            fuchsia_trace::instant!(
263                CATEGORY_STARNIX,
264                NAME_RESTRICTED_KICK,
265                fuchsia_trace::Scope::Thread
266            );
267            // Fall through to the post-syscall / post-exception handling logic. We were likely
268            // kicked because a signal is pending deliver or the task has exited. Spurious kicks are
269            // also possible.
270        }
271        _ => {
272            return Err(format_err!("Received unexpected restricted reason code: {}", reason_code));
273        }
274    }
275
276    if let Some(exit_status) =
277        process_completed_restricted_exit(locked, current_task, &error_context)?
278    {
279        return Ok(Some(exit_status));
280    }
281
282    Ok(None)
283}
284
285fn process_completed_exception(
286    locked: &mut Locked<Unlocked>,
287    current_task: &mut CurrentTask,
288    exception_result: ExceptionResult,
289    restricted_exception: zx::ExceptionReport,
290) {
291    match exception_result {
292        ExceptionResult::Handled => {}
293        ExceptionResult::Signal(signal) => {
294            let mut task_state = current_task.task.write();
295            if task_state.ptrace_on_signal_consume() {
296                task_state.set_stopped(
297                    StopState::SignalDeliveryStopping,
298                    Some(signal),
299                    Some(&current_task),
300                    None,
301                );
302                return;
303            }
304
305            if let Some(status) = deliver_signal(
306                current_task.task.as_ref(),
307                current_task.thread_state.arch_width(),
308                task_state,
309                signal.into(),
310                &mut current_task.thread_state.registers,
311                &current_task.thread_state.extended_pstate,
312                Some(restricted_exception),
313            ) {
314                current_task.kill_thread_group(locked, status);
315            }
316        }
317    }
318}
319
320/// Contains context to track the most recently failing system call.
321///
322/// When a task exits with a non-zero exit code, this context is logged to help debugging which
323/// system call may have triggered the failure.
324#[derive(Debug)]
325pub struct ErrorContext {
326    /// The system call that failed.
327    pub syscall: Syscall,
328
329    /// The error that was returned for the system call.
330    pub error: Errno,
331}
332
333/// Executes the provided `syscall` in `current_task`.
334///
335/// Returns an `ErrorContext` if the system call returned an error.
336#[inline(never)] // Inlining this function breaks the CFI directives used to unwind into user code.
337pub fn execute_syscall(
338    locked: &mut Locked<Unlocked>,
339    current_task: &mut CurrentTask,
340    syscall_decl: SyscallDecl,
341) -> Option<ErrorContext> {
342    fuchsia_trace::duration!(CATEGORY_STARNIX, syscall_decl.trace_name());
343    let syscall = new_syscall(syscall_decl, current_task);
344
345    current_task.thread_state.registers.save_registers_for_restart(syscall.decl.number);
346
347    if current_task.trace_syscalls.load(std::sync::atomic::Ordering::Relaxed) {
348        ptrace_syscall_enter(locked, current_task);
349    }
350
351    log_syscall!(current_task, "{syscall:?}");
352
353    let _lockup_detector_guard = starnix_core::task::ThreadLockupDetector::track();
354    let result: Result<SyscallResult, Errno> =
355        if current_task.seccomp_filter_state.get() != SeccompStateValue::None {
356            // Inlined fast path for seccomp, so that we don't incur the cost
357            // of a method call when running the filters.
358            if let Some(res) = current_task.run_seccomp_filters(locked, &syscall) {
359                res
360            } else {
361                table::dispatch_syscall(locked, current_task, &syscall)
362            }
363        } else {
364            table::dispatch_syscall(locked, current_task, &syscall)
365        };
366
367    current_task.trigger_delayed_releaser(locked);
368
369    let return_value = match result {
370        Ok(return_value) => {
371            log_syscall!(current_task, "-> {:#x}", return_value.value());
372            current_task.thread_state.registers.set_return_register(return_value.value());
373            None
374        }
375        Err(errno) => {
376            log_syscall!(current_task, "!-> {errno}");
377            if errno.is_restartable() {
378                current_task.thread_state.restart_code = Some(errno.code);
379            }
380            current_task.thread_state.registers.set_return_register(errno.return_value());
381            Some(ErrorContext { error: errno, syscall })
382        }
383    };
384
385    if current_task.trace_syscalls.load(std::sync::atomic::Ordering::Relaxed) {
386        ptrace_syscall_exit(locked, current_task, return_value.is_some());
387    }
388
389    return_value
390}
391
392/// Finishes `current_task` updates after a restricted mode exit such as a syscall, exception, or kick.
393///
394/// Returns an `ExitStatus` if the task is meant to exit.
395pub fn process_completed_restricted_exit(
396    locked: &mut Locked<Unlocked>,
397    current_task: &mut CurrentTask,
398    error_context: &Option<ErrorContext>,
399) -> Result<Option<ExitStatus>, Errno> {
400    let result;
401    loop {
402        // Checking for a signal might cause the task to exit, so check before processing exit
403        {
404            {
405                if !current_task.is_exitted() {
406                    dequeue_signal(locked, current_task);
407                }
408                // The syscall may need to restart for a non-signal-related
409                // reason. This call does nothing if we aren't restarting.
410                prepare_to_restart_syscall(&mut current_task.thread_state, None);
411            }
412        }
413
414        let exit_status = current_task.exit_status();
415        if let Some(exit_status) = exit_status {
416            log_trace!("exiting with status {:?}", exit_status);
417            if let Some(error_context) = error_context {
418                match exit_status {
419                    ExitStatus::Exit(value) if value == 0 => {}
420                    _ => {
421                        log_trace!(
422                            "last failing syscall before exit: {:?}, failed with {:?}",
423                            error_context.syscall,
424                            error_context.error
425                        );
426                    }
427                };
428            }
429
430            result = Some(exit_status);
431            break;
432        } else {
433            // Block a stopped process after it's had a chance to handle signals, since a signal might
434            // cause it to stop.
435            if current_task.block_if_stopped(locked) {
436                // If the task was stopped and has now woken up (e.g., via SIGCONT or PTRACE_CONT),
437                // loop back to process any pending signals before returning to userspace.
438                continue;
439            }
440            result = None;
441            // Always restore signal mask before returning to userspace.
442            current_task.write().restore_signal_mask();
443            break;
444        }
445    }
446
447    if let Some(ExitStatus::CoreDump(signal_info)) = &result {
448        if current_task.flags().contains(TaskFlags::DUMP_ON_EXIT) {
449            // Avoid taking a backtrace if the signal was sent by the same task.
450            if !signal_info.is_sent_by(&current_task.weak_task()) {
451                // Request a backtrace before reporting the crash to increase chance of a backtrace
452                // in logs. This call is kept as far up in the call stack as possible to avoid
453                // additional frames that are always the same and not relevant to users.
454                // TODO(https://fxbug.dev/356732164) collect a backtrace ourselves
455                debug::backtrace_request_current_thread();
456            }
457
458            if let Some(pending_report) =
459                current_task.kernel().crash_reporter.begin_crash_report(&current_task)
460            {
461                current_task.kernel().crash_reporter.handle_core_dump(
462                    &current_task,
463                    signal_info,
464                    pending_report,
465                );
466            }
467        }
468    }
469    return Ok(result);
470}