Skip to main content

starnix_syscall_loop/
lib.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use anyhow::{Error, format_err};
6use extended_pstate::ExtendedPstatePointer;
7use starnix_core::arch::execution::new_syscall;
8use starnix_core::ptrace::{PtraceStatus, StopState, ptrace_syscall_enter, ptrace_syscall_exit};
9use starnix_core::signals::{
10    SignalInfo, deliver_signal, dequeue_signal, prepare_to_restart_syscall,
11};
12use starnix_core::task::{CurrentTask, ExceptionResult, ExitStatus, SeccompStateValue, TaskFlags};
13use starnix_logging::{
14    CATEGORY_STARNIX, NAME_HANDLE_EXCEPTION, NAME_RESTRICTED_KICK, NAME_RUN_TASK,
15    firehose_trace_duration, firehose_trace_instant, log_error, log_syscall, log_trace, log_warn,
16    set_current_task_info,
17};
18use starnix_registers::RestrictedState;
19use starnix_sync::{Locked, Unlocked};
20use starnix_syscalls::SyscallResult;
21use starnix_syscalls::decls::{Syscall, SyscallDecl};
22use starnix_uapi::errno;
23use starnix_uapi::errors::Errno;
24use starnix_uapi::signals::SIGKILL;
25use zerocopy::FromZeros;
26
27mod table;
28
29pub fn enter(locked: &mut Locked<Unlocked>, current_task: &mut CurrentTask) -> ExitStatus {
30    // Zircon will populate this report on restricted exception exits. Initialize it to all zero
31    // since we're just reserving storage.
32    let mut exception_report = zx::sys::zx_exception_report_t::new_zeroed();
33    match RestrictedState::bind_and_map(
34        &mut current_task.thread_state.registers,
35        &mut exception_report,
36    ) {
37        Ok(restricted_state) => {
38            match run_task(
39                locked,
40                current_task,
41                restricted_state.bound_state.as_ptr(),
42                &exception_report,
43            ) {
44                Ok(ok) => ok,
45                Err(error) => {
46                    log_warn!("Died unexpectedly from {error:?}! treating as SIGKILL");
47                    ExitStatus::Kill(SignalInfo::kernel(SIGKILL))
48                }
49            }
50        }
51        Err(error) => {
52            log_error!("failed to map mode state vmo, {error:?}! treating as SIGKILL");
53            ExitStatus::Kill(SignalInfo::kernel(SIGKILL))
54        }
55    }
56}
57
58type RestrictedExitCallback = extern "C" fn(
59    *mut RestrictedEnterContext<'_>,
60    zx::sys::zx_restricted_reason_t,
61    *mut ExtendedPstatePointer,
62) -> bool;
63
// Declaration of the restricted-mode entry loop, which is defined outside this file.
// NOTE(review): the SAFETY comment on `restricted_exit_callback_c` mentions "our restricted
// return assembly", so this is presumably implemented in assembly — confirm.
//
// `run_task` calls it with the options word, the exit callback, a pointer to the
// `RestrictedEnterContext` that is handed back to the callback, the restricted state pointer,
// and a pointer to the extended-pstate pointer. It returns a Zircon status; `run_task` treats
// anything other than `ZX_OK` as a failure to satisfy a `zx_restricted_enter` prerequisite.
64unsafe extern "C" {
65    // rustc doesn't like RestrictedEnterContext for FFI but we're just passing it back to
66    // ourselves with extra steps.
67    #[allow(improper_ctypes)]
68    fn restricted_enter_loop(
69        options: u32,
70        restricted_exit_callback: RestrictedExitCallback,
71        restricted_exit_callback_context: *mut RestrictedEnterContext<'_>,
72        restricted_state: *mut zx::sys::zx_restricted_state_t,
73        extended_pstate_ptr_ptr: *mut ExtendedPstatePointer,
74    ) -> zx::sys::zx_status_t;
75}
76
/// Options word passed as the first argument of `restricted_enter_loop`; no option bits are set.
77const RESTRICTED_ENTER_OPTIONS: u32 = 0;
78
/// State shared between `run_task` and the restricted-exit callback.
///
/// `run_task` builds one instance and passes a pointer to it into `restricted_enter_loop`;
/// `restricted_exit_callback_c` turns that pointer back into a reference on every exit.
79struct RestrictedEnterContext<'a> {
    /// The task being run; forwarded to `restricted_exit_callback` on every restricted exit.
80    current_task: &'a mut CurrentTask,
    /// The most recent failing system call, if any; updated by `process_restricted_exit`.
81    error_context: Option<ErrorContext>,
    /// Final outcome of the loop. `run_task` seeds it with an `ENOEXEC` error and returns
    /// whatever the callback stored here when it asked the loop to stop.
82    exit_status: Result<ExitStatus, Error>,
    /// Pointer to the exception report storage supplied by the caller of `run_task`; it is
    /// dereferenced when an exit has reason `ZX_RESTRICTED_REASON_EXCEPTION`.
83    exception_report_raw: *const zx::sys::zx_exception_report_t,
84}
85
86/// Runs the `current_task` to completion.
87///
88/// The high-level flow of this function looks as follows:
89///
90///   1. Write the restricted state for the current thread to set it up to enter into the restricted
91///      (Linux) part of the address space.
92///   2. Enter restricted mode.
93///   3. Return from restricted mode, reading out the new state of the restricted mode execution.
94///      This state contains the thread's restricted register state, which is used to determine
95///      which system call to dispatch.
96///   4. Dispatch the system call.
97///   5. Handle pending signals.
98///   6. Goto 1.
99fn run_task(
100    locked: &mut Locked<Unlocked>,
101    current_task: &mut CurrentTask,
102    restricted_state_ptr: *mut zx::sys::zx_restricted_state_t,
103    exception_report_raw: *const zx::sys::zx_exception_report_t,
104) -> Result<ExitStatus, Error> {
105    set_current_task_info(
106        current_task.task.command(),
107        current_task.task.thread_group().read().leader_command(),
108        current_task.task.thread_group().leader,
109        current_task.tid,
110    );
111
112    firehose_trace_duration!(CATEGORY_STARNIX, NAME_RUN_TASK);
113
114    // This tracks the last failing system call for debugging purposes.
115    let error_context = None;
116
117    // We need to check for exit once, before the task starts executing, in case
118    // the task has already been sent a signal that will cause it to exit.
119    if let Some(exit_status) =
120        process_completed_restricted_exit(locked, current_task, &error_context)?
121    {
122        return Ok(exit_status);
123    }
124
125    // This extended pstate pointer points to the storage for extended processor
126    // state (vector and FP registers).
127    let mut extended_pstate_ptr = current_task.thread_state.extended_pstate.as_ptr();
128
129    let mut restricted_enter_context = RestrictedEnterContext {
130        current_task,
131        error_context,
132        exit_status: Err(errno!(ENOEXEC).into()),
133        exception_report_raw,
134    };
135
136    #[allow(
137        clippy::undocumented_unsafe_blocks,
138        reason = "Force documented unsafe blocks in Starnix"
139    )]
140    let restricted_enter_status = zx::Status::from_raw(unsafe {
141        restricted_enter_loop(
142            RESTRICTED_ENTER_OPTIONS,
143            restricted_exit_callback_c,
144            &mut restricted_enter_context,
145            restricted_state_ptr,
146            &raw mut extended_pstate_ptr,
147        )
148    });
149    if restricted_enter_status != zx::Status::OK {
150        // If restricted_enter_loop failed, it means that we failed to satisfy
151        // a prerequisite of zx_restricted_enter which should never happen.
152        log_error!(
153            "restricted_enter_loop failed: {}, register state: {:?}",
154            restricted_enter_status,
155            restricted_enter_context.current_task.thread_state.registers
156        );
157    }
158    restricted_enter_context.exit_status
159}
160
161extern "C" fn restricted_exit_callback_c(
162    context: *mut RestrictedEnterContext<'_>,
163    reason_code: zx::sys::zx_restricted_reason_t,
164    extended_pstate_ptr_ptr: *mut ExtendedPstatePointer,
165) -> bool {
166    // SAFETY:
167    // `context` is a pointer to a `RestrictedEnterContext` that was passed to
168    // `restricted_enter_loop`.
169    //  `extended_pstate_ptr` is a pointer to the ExtendedPstatePointer instance
170    //  that was passed to `restricted_enter_loop.`
171    // Our restricted return assembly and Zircon together guarantee that this
172    // thread has exclusive access to these variables.
173    let (restricted_context, extended_pstate_ptr) =
174        unsafe { (&mut *context, extended_pstate_ptr_ptr.as_mut_unchecked()) };
175    restricted_exit_callback(
176        reason_code,
177        restricted_context.current_task,
178        &mut restricted_context.error_context,
179        &mut restricted_context.exit_status,
180        extended_pstate_ptr,
181        restricted_context.exception_report_raw,
182    )
183}
184
185fn restricted_exit_callback(
186    reason_code: zx::sys::zx_restricted_reason_t,
187    current_task: &mut CurrentTask,
188    error_context: &mut Option<ErrorContext>,
189    exit_status: &mut Result<ExitStatus, Error>,
190    extended_pstate_ptr: &mut ExtendedPstatePointer,
191    exception_report_raw: *const zx::sys::zx_exception_report_t,
192) -> bool {
193    debug_assert_eq!(
194        current_task.thread_state.restart_code, None,
195        "restart_code should only ever be Some() in normal mode",
196    );
197
198    let ret = match process_restricted_exit(
199        reason_code,
200        current_task,
201        error_context,
202        exception_report_raw,
203    ) {
204        Ok(None) => {
205            // Keep going!
206
207            *extended_pstate_ptr = current_task.thread_state.extended_pstate.as_ptr();
208
209            true
210        }
211        Ok(Some(completed_exit_status)) => {
212            *exit_status = Ok(completed_exit_status);
213            false
214        }
215        Err(error) => {
216            *exit_status = Err(error);
217            false
218        }
219    };
220
221    debug_assert_eq!(
222        current_task.thread_state.restart_code, None,
223        "restart_code should only ever be Some() in normal mode",
224    );
225
226    ret
227}
228
229fn process_restricted_exit(
230    reason_code: zx::sys::zx_restricted_reason_t,
231    current_task: &mut CurrentTask,
232    error_context: &mut Option<ErrorContext>,
233    exception_report_raw: *const zx::sys::zx_exception_report_t,
234) -> Result<Option<ExitStatus>, Error> {
235    // We can't hold any locks entering restricted mode so we can't be holding any locks on exit.
236    #[allow(
237        clippy::undocumented_unsafe_blocks,
238        reason = "Force documented unsafe blocks in Starnix"
239    )]
240    let locked = unsafe { Unlocked::new() };
241
242    current_task.thread_state.registers.sync_stack_ptr();
243
244    match reason_code {
245        zx::sys::ZX_RESTRICTED_REASON_SYSCALL => {
246            let syscall_decl = SyscallDecl::from_number(
247                current_task.thread_state.registers.syscall_register(),
248                current_task.thread_state.arch_width(),
249            );
250
251            if let Some(new_error_context) = execute_syscall(locked, current_task, syscall_decl) {
252                *error_context = Some(new_error_context);
253            }
254        }
255        zx::sys::ZX_RESTRICTED_REASON_EXCEPTION => {
256            firehose_trace_duration!(CATEGORY_STARNIX, NAME_HANDLE_EXCEPTION);
257            // SAFETY: `exception_report_raw` was written by Zircon during this restricted exit.
258            let exception_report = unsafe { zx::ExceptionReport::from_raw(*exception_report_raw) };
259            let exception_result = current_task.process_exception(locked, &exception_report);
260            process_completed_exception(locked, current_task, exception_result, exception_report);
261        }
262        zx::sys::ZX_RESTRICTED_REASON_KICK => {
263            firehose_trace_instant!(
264                CATEGORY_STARNIX,
265                NAME_RESTRICTED_KICK,
266                fuchsia_trace::Scope::Thread
267            );
268            // Fall through to the post-syscall / post-exception handling logic. We were likely
269            // kicked because a signal is pending deliver or the task has exited. Spurious kicks are
270            // also possible.
271        }
272        _ => {
273            return Err(format_err!("Received unexpected restricted reason code: {}", reason_code));
274        }
275    }
276
277    if let Some(exit_status) =
278        process_completed_restricted_exit(locked, current_task, &error_context)?
279    {
280        return Ok(Some(exit_status));
281    }
282
283    Ok(None)
284}
285
286fn process_completed_exception(
287    locked: &mut Locked<Unlocked>,
288    current_task: &mut CurrentTask,
289    exception_result: ExceptionResult,
290    restricted_exception: zx::ExceptionReport,
291) {
292    match exception_result {
293        ExceptionResult::Handled => {}
294        ExceptionResult::Signal(signal) => {
295            let mut task_state = current_task.task.write();
296            if task_state.ptrace_on_signal_consume() {
297                task_state.set_stopped(
298                    StopState::SignalDeliveryStopping,
299                    Some(signal),
300                    Some(&current_task),
301                    None,
302                );
303                return;
304            }
305
306            if let Some(status) = deliver_signal(
307                current_task.task.as_ref(),
308                current_task.thread_state.arch_width(),
309                task_state,
310                signal.into(),
311                &mut current_task.thread_state.registers,
312                &current_task.thread_state.extended_pstate,
313                Some(restricted_exception),
314            ) {
315                current_task.thread_group_exit(locked, status);
316            }
317        }
318    }
319}
320
321/// Contains context to track the most recently failing system call.
322///
323/// When a task exits with a non-zero exit code, this context is logged to help debugging which
324/// system call may have triggered the failure.
///
/// Values are produced by `execute_syscall` whenever a system call returns an error, and read
/// by `process_completed_restricted_exit`, which logs the context for every exit status other
/// than `ExitStatus::Exit(0)`.
325#[derive(Debug)]
326pub struct ErrorContext {
327    /// The system call that failed.
328    pub syscall: Syscall,
329
330    /// The error that was returned for the system call.
331    pub error: Errno,
332}
333
334/// Executes the provided `syscall` in `current_task`.
335///
336/// Returns an `ErrorContext` if the system call returned an error.
337#[inline(never)] // Inlining this function breaks the CFI directives used to unwind into user code.
338pub fn execute_syscall(
339    locked: &mut Locked<Unlocked>,
340    current_task: &mut CurrentTask,
341    syscall_decl: SyscallDecl,
342) -> Option<ErrorContext> {
343    firehose_trace_duration!(CATEGORY_STARNIX, syscall_decl.trace_name());
344    let syscall = new_syscall(syscall_decl, current_task);
345
346    current_task.thread_state.registers.save_registers_for_restart(syscall.decl.number);
347
348    if current_task.trace_syscalls.load(std::sync::atomic::Ordering::Relaxed) {
349        ptrace_syscall_enter(locked, current_task);
350    }
351
352    log_syscall!(current_task, "{syscall:?}");
353
354    let result: Result<SyscallResult, Errno> =
355        if current_task.seccomp_filter_state.get() != SeccompStateValue::None {
356            // Inlined fast path for seccomp, so that we don't incur the cost
357            // of a method call when running the filters.
358            if let Some(res) = current_task.run_seccomp_filters(locked, &syscall) {
359                res
360            } else {
361                table::dispatch_syscall(locked, current_task, &syscall)
362            }
363        } else {
364            table::dispatch_syscall(locked, current_task, &syscall)
365        };
366
367    current_task.trigger_delayed_releaser(locked);
368
369    let return_value = match result {
370        Ok(return_value) => {
371            log_syscall!(current_task, "-> {:#x}", return_value.value());
372            current_task.thread_state.registers.set_return_register(return_value.value());
373            None
374        }
375        Err(errno) => {
376            log_syscall!(current_task, "!-> {errno}");
377            if errno.is_restartable() {
378                current_task.thread_state.restart_code = Some(errno.code);
379            }
380            current_task.thread_state.registers.set_return_register(errno.return_value());
381            Some(ErrorContext { error: errno, syscall })
382        }
383    };
384
385    if current_task.trace_syscalls.load(std::sync::atomic::Ordering::Relaxed) {
386        ptrace_syscall_exit(locked, current_task, return_value.is_some());
387    }
388
389    return_value
390}
391
392/// Finishes `current_task` updates after a restricted mode exit such as a syscall, exception, or kick.
393///
394/// Returns an `ExitStatus` if the task is meant to exit.
395pub fn process_completed_restricted_exit(
396    locked: &mut Locked<Unlocked>,
397    current_task: &mut CurrentTask,
398    error_context: &Option<ErrorContext>,
399) -> Result<Option<ExitStatus>, Errno> {
400    let result;
401    loop {
402        // Checking for a signal might cause the task to exit, so check before processing exit
403        {
404            {
405                if !current_task.is_exitted() {
406                    dequeue_signal(locked, current_task);
407                }
408                // The syscall may need to restart for a non-signal-related
409                // reason. This call does nothing if we aren't restarting.
410                prepare_to_restart_syscall(&mut current_task.thread_state, None);
411            }
412        }
413
414        let exit_status = current_task.exit_status();
415        if let Some(exit_status) = exit_status {
416            log_trace!("exiting with status {:?}", exit_status);
417            if let Some(error_context) = error_context {
418                match exit_status {
419                    ExitStatus::Exit(value) if value == 0 => {}
420                    _ => {
421                        log_trace!(
422                            "last failing syscall before exit: {:?}, failed with {:?}",
423                            error_context.syscall,
424                            error_context.error
425                        );
426                    }
427                };
428            }
429
430            result = Some(exit_status);
431            break;
432        } else {
433            // Block a stopped process after it's had a chance to handle signals, since a signal might
434            // cause it to stop.
435            current_task.block_while_stopped(locked);
436            // If ptrace_cont has sent a signal, process it immediately.  This
437            // seems to match Linux behavior.
438
439            let mut task_state = current_task.write();
440            if task_state
441                .ptrace
442                .as_ref()
443                .is_some_and(|ptrace| ptrace.stop_status == PtraceStatus::Continuing)
444                && task_state.is_any_signal_pending()
445                && !current_task.is_exitted()
446            {
447                continue;
448            }
449            result = None;
450            // Always restore signal mask before returning to userspace.
451            task_state.restore_signal_mask();
452            break;
453        }
454    }
455
456    if let Some(ExitStatus::CoreDump(signal_info)) = &result {
457        if current_task.flags().contains(TaskFlags::DUMP_ON_EXIT) {
458            // Avoid taking a backtrace if the signal was sent by the same task.
459            if !signal_info.is_sent_by(&current_task.weak_task()) {
460                // Request a backtrace before reporting the crash to increase chance of a backtrace
461                // in logs. This call is kept as far up in the call stack as possible to avoid
462                // additional frames that are always the same and not relevant to users.
463                // TODO(https://fxbug.dev/356732164) collect a backtrace ourselves
464                debug::backtrace_request_current_thread();
465            }
466
467            if let Some(pending_report) =
468                current_task.kernel().crash_reporter.begin_crash_report(&current_task)
469            {
470                current_task.kernel().crash_reporter.handle_core_dump(
471                    &current_task,
472                    signal_info,
473                    pending_report,
474                );
475            }
476        }
477    }
478    return Ok(result);
479}