Skip to main content

starnix_syscall_loop/
lib.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use anyhow::{Error, format_err};
6use extended_pstate::ExtendedPstatePointer;
7use starnix_core::arch::execution::new_syscall;
8use starnix_core::ptrace::{PtraceStatus, StopState, ptrace_syscall_enter, ptrace_syscall_exit};
9use starnix_core::signals::{
10    SignalInfo, deliver_signal, dequeue_signal, prepare_to_restart_syscall,
11};
12use starnix_core::task::{CurrentTask, ExceptionResult, ExitStatus, SeccompStateValue, TaskFlags};
13use starnix_logging::{
14    CATEGORY_STARNIX, NAME_HANDLE_EXCEPTION, NAME_RESTRICTED_KICK, NAME_RUN_TASK,
15    firehose_trace_duration, firehose_trace_instant, log_error, log_syscall, log_trace, log_warn,
16    set_current_task_info,
17};
18use starnix_registers::RestrictedState;
19use starnix_sync::{Locked, Unlocked};
20use starnix_syscalls::SyscallResult;
21use starnix_syscalls::decls::{Syscall, SyscallDecl};
22use starnix_uapi::errno;
23use starnix_uapi::errors::Errno;
24use starnix_uapi::signals::SIGKILL;
25
26mod table;
27
28pub fn enter(locked: &mut Locked<Unlocked>, current_task: &mut CurrentTask) -> ExitStatus {
29    match RestrictedState::bind_and_map(&mut current_task.thread_state.registers) {
30        Ok(restricted_state) => match run_task(locked, current_task, restricted_state) {
31            Ok(ok) => ok,
32            Err(error) => {
33                log_warn!("Died unexpectedly from {error:?}! treating as SIGKILL");
34                ExitStatus::Kill(SignalInfo::kernel(SIGKILL))
35            }
36        },
37        Err(error) => {
38            log_error!("failed to map mode state vmo, {error:?}! treating as SIGKILL");
39            ExitStatus::Kill(SignalInfo::kernel(SIGKILL))
40        }
41    }
42}
43
/// Signature of the callback handed to `restricted_enter_loop` as its
/// `restricted_exit_callback` argument.
///
/// The arguments are, in order: a pointer to the `RestrictedEnterContext`, the reason code for
/// the restricted-mode exit, and a pointer to an `ExtendedPstatePointer`.
///
/// NOTE(review): judging from `restricted_exit_callback`, a return value of `true` means "keep
/// running the task" and `false` means "stop, an exit status has been recorded" — confirm
/// against the implementation of `restricted_enter_loop`.
type RestrictedExitCallback = extern "C" fn(
    *mut RestrictedEnterContext<'_>,
    zx::sys::zx_restricted_reason_t,
    *mut ExtendedPstatePointer,
) -> bool;
49
unsafe extern "C" {
    /// Externally implemented entry point that `run_task` calls to run the task in restricted
    /// mode.
    ///
    /// * `options` - `run_task` passes `RESTRICTED_ENTER_OPTIONS`.
    /// * `restricted_exit_callback` - callback of type `RestrictedExitCallback`; `run_task`
    ///   passes `restricted_exit_callback_c`.
    /// * `restricted_exit_callback_context` - pointer handed back to the callback as its first
    ///   argument.
    /// * `restricted_state` - pointer to the bound restricted state.
    /// * `extended_pstate_ptr_ptr` - pointer to the `ExtendedPstatePointer` that is handed back
    ///   to the callback as its third argument.
    ///
    /// Returns a raw Zircon status; `run_task` treats anything other than `ZX_OK` as a failure
    /// and logs it.
    ///
    /// NOTE(review): the implementation is not visible in this file; presumably it repeatedly
    /// enters restricted mode and invokes the callback on each exit until the callback returns
    /// `false` — confirm against the implementation.
    // rustc doesn't like RestrictedEnterContext for FFI but we're just passing it back to
    // ourselves with extra steps.
    #[allow(improper_ctypes)]
    fn restricted_enter_loop(
        options: u32,
        restricted_exit_callback: RestrictedExitCallback,
        restricted_exit_callback_context: *mut RestrictedEnterContext<'_>,
        restricted_state: *mut zx::sys::zx_restricted_exception_t,
        extended_pstate_ptr_ptr: *mut ExtendedPstatePointer,
    ) -> zx::sys::zx_status_t;
}
62
/// Value passed as the `options` argument of `restricted_enter_loop`; currently zero.
const RESTRICTED_ENTER_OPTIONS: u32 = 0;
64
/// State shared between `run_task` and the restricted-exit callback.
///
/// `run_task` builds one instance, passes a pointer to it through `restricted_enter_loop`, and
/// reads `exit_status` back out once the loop returns.
struct RestrictedEnterContext<'a> {
    /// The task being run.
    current_task: &'a mut CurrentTask,
    /// The bound restricted state; the callback reads exception reports from it.
    restricted_state: RestrictedState,
    /// The most recent failing system call, if any, kept for debugging purposes.
    error_context: Option<ErrorContext>,
    /// The final result of running the task. Written by the callback when it decides to stop,
    /// and returned by `run_task`.
    exit_status: Result<ExitStatus, Error>,
}
71
/// Runs the `current_task` to completion.
///
/// The high-level flow of this function looks as follows:
///
///   1. Write the restricted state for the current thread to set it up to enter into the restricted
///      (Linux) part of the address space.
///   2. Enter restricted mode.
///   3. Return from restricted mode, reading out the new state of the restricted mode execution.
///      This state contains the thread's restricted register state, which is used to determine
///      which system call to dispatch.
///   4. Dispatch the system call.
///   5. Handle pending signals.
///   6. Goto 1.
///
/// The work done on each restricted-mode exit lives in `restricted_exit_callback_c`, which is
/// handed to `restricted_enter_loop` together with a `RestrictedEnterContext`.
///
/// # Returns
///
/// * The exit status found by the initial `process_completed_restricted_exit` check, if the task
///   is already due to exit before it ever runs.
/// * Otherwise the `exit_status` stored in the `RestrictedEnterContext`. That field starts out
///   as an `ENOEXEC` error, so this is what is returned if the callback never overwrites it.
///
/// A non-OK status from `restricted_enter_loop` is only logged; it does not by itself change the
/// returned value.
fn run_task(
    locked: &mut Locked<Unlocked>,
    current_task: &mut CurrentTask,
    restricted_state: RestrictedState,
) -> Result<ExitStatus, Error> {
    set_current_task_info(
        current_task.task.command(),
        current_task.task.thread_group().read().leader_command(),
        current_task.task.thread_group().leader,
        current_task.tid,
    );

    firehose_trace_duration!(CATEGORY_STARNIX, NAME_RUN_TASK);

    // This tracks the last failing system call for debugging purposes.
    let error_context = None;

    // We need to check for exit once, before the task starts executing, in case
    // the task has already been sent a signal that will cause it to exit.
    if let Some(exit_status) =
        process_completed_restricted_exit(locked, current_task, &error_context)?
    {
        return Ok(exit_status);
    }

    // The restricted_state_ptr points at our bound state. It will remain the
    // same value for the duration of the restricted loop. The value it points
    // to will be mutated by restricted_enter_loop.
    let restricted_state_ptr = restricted_state.bound_state.as_ptr();

    // This extended pstate pointer points to the storage for extended processor
    // state (vector and FP registers). The callback rewrites it each time it decides to keep
    // going.
    let mut extended_pstate_ptr = current_task.thread_state.extended_pstate.as_ptr();

    // Everything the callback needs is bundled here. `exit_status` is seeded with an error so
    // that a loop which ends without the callback recording a result is reported as a failure.
    let mut restricted_enter_context = RestrictedEnterContext {
        current_task,
        restricted_state,
        error_context,
        exit_status: Err(errno!(ENOEXEC).into()),
    };

    // Both `restricted_enter_context` and `extended_pstate_ptr` are locals of this function, so
    // the pointers passed below stay valid for the whole duration of the call.
    #[allow(
        clippy::undocumented_unsafe_blocks,
        reason = "Force documented unsafe blocks in Starnix"
    )]
    let restricted_enter_status = zx::Status::from_raw(unsafe {
        restricted_enter_loop(
            RESTRICTED_ENTER_OPTIONS,
            restricted_exit_callback_c,
            &mut restricted_enter_context,
            restricted_state_ptr,
            &raw mut extended_pstate_ptr,
        )
    });
    if restricted_enter_status != zx::Status::OK {
        // If restricted_enter_loop failed, it means that we failed to satisfy
        // a prerequisite of zx_restricted_enter which should never happen.
        log_error!(
            "restricted_enter_loop failed: {}, register state: {:?}",
            restricted_enter_status,
            restricted_enter_context.current_task.thread_state.registers
        );
    }
    restricted_enter_context.exit_status
}
150
/// C-ABI shim of type `RestrictedExitCallback` that `run_task` passes to
/// `restricted_enter_loop`.
///
/// Converts the raw `context` and `extended_pstate_ptr_ptr` pointers back into mutable
/// references and forwards to `restricted_exit_callback`, returning its result unchanged.
extern "C" fn restricted_exit_callback_c(
    context: *mut RestrictedEnterContext<'_>,
    reason_code: zx::sys::zx_restricted_reason_t,
    extended_pstate_ptr_ptr: *mut ExtendedPstatePointer,
) -> bool {
    // SAFETY:
    // `context` is a pointer to a `RestrictedEnterContext` that was passed to
    // `restricted_enter_loop`.
    // `extended_pstate_ptr_ptr` is a pointer to the `ExtendedPstatePointer` instance
    // that was passed to `restricted_enter_loop`.
    // Our restricted return assembly and Zircon together guarantee that this
    // thread has exclusive access to these variables.
    let (restricted_context, extended_pstate_ptr) =
        unsafe { (&mut *context, extended_pstate_ptr_ptr.as_mut_unchecked()) };
    // The context is split into disjoint field borrows so the safe callback can take each piece
    // as its own argument.
    restricted_exit_callback(
        reason_code,
        restricted_context.current_task,
        &mut restricted_context.restricted_state,
        &mut restricted_context.error_context,
        &mut restricted_context.exit_status,
        extended_pstate_ptr,
    )
}
174
175fn restricted_exit_callback(
176    reason_code: zx::sys::zx_restricted_reason_t,
177    current_task: &mut CurrentTask,
178    restricted_state: &mut RestrictedState,
179    error_context: &mut Option<ErrorContext>,
180    exit_status: &mut Result<ExitStatus, Error>,
181    extended_pstate_ptr: &mut ExtendedPstatePointer,
182) -> bool {
183    debug_assert_eq!(
184        current_task.thread_state.restart_code, None,
185        "restart_code should only ever be Some() in normal mode",
186    );
187
188    let ret =
189        match process_restricted_exit(reason_code, current_task, restricted_state, error_context) {
190            Ok(None) => {
191                // Keep going!
192
193                *extended_pstate_ptr = current_task.thread_state.extended_pstate.as_ptr();
194
195                true
196            }
197            Ok(Some(completed_exit_status)) => {
198                *exit_status = Ok(completed_exit_status);
199                false
200            }
201            Err(error) => {
202                *exit_status = Err(error);
203                false
204            }
205        };
206
207    debug_assert_eq!(
208        current_task.thread_state.restart_code, None,
209        "restart_code should only ever be Some() in normal mode",
210    );
211
212    ret
213}
214
215fn process_restricted_exit(
216    reason_code: zx::sys::zx_restricted_reason_t,
217    current_task: &mut CurrentTask,
218    restricted_state: &mut RestrictedState,
219    error_context: &mut Option<ErrorContext>,
220) -> Result<Option<ExitStatus>, Error> {
221    // We can't hold any locks entering restricted mode so we can't be holding any locks on exit.
222    #[allow(
223        clippy::undocumented_unsafe_blocks,
224        reason = "Force documented unsafe blocks in Starnix"
225    )]
226    let locked = unsafe { Unlocked::new() };
227
228    current_task.thread_state.registers.sync_stack_ptr();
229
230    match reason_code {
231        zx::sys::ZX_RESTRICTED_REASON_SYSCALL => {
232            let syscall_decl = SyscallDecl::from_number(
233                current_task.thread_state.registers.syscall_register(),
234                current_task.thread_state.arch_width(),
235            );
236
237            if let Some(new_error_context) = execute_syscall(locked, current_task, syscall_decl) {
238                *error_context = Some(new_error_context);
239            }
240        }
241        zx::sys::ZX_RESTRICTED_REASON_EXCEPTION => {
242            firehose_trace_duration!(CATEGORY_STARNIX, NAME_HANDLE_EXCEPTION);
243            let restricted_exception = restricted_state.read_exception();
244            let exception_result = current_task.process_exception(locked, &restricted_exception);
245            process_completed_exception(
246                locked,
247                current_task,
248                exception_result,
249                restricted_exception,
250            );
251        }
252        zx::sys::ZX_RESTRICTED_REASON_KICK => {
253            firehose_trace_instant!(
254                CATEGORY_STARNIX,
255                NAME_RESTRICTED_KICK,
256                fuchsia_trace::Scope::Thread
257            );
258            // Fall through to the post-syscall / post-exception handling logic. We were likely
259            // kicked because a signal is pending deliver or the task has exited. Spurious kicks are
260            // also possible.
261        }
262        _ => {
263            return Err(format_err!("Received unexpected restricted reason code: {}", reason_code));
264        }
265    }
266
267    if let Some(exit_status) =
268        process_completed_restricted_exit(locked, current_task, &error_context)?
269    {
270        return Ok(Some(exit_status));
271    }
272
273    Ok(None)
274}
275
/// Acts on the outcome of processing a restricted-mode exception.
///
/// * `ExceptionResult::Handled` - nothing further to do.
/// * `ExceptionResult::Signal(signal)` - if `ptrace_on_signal_consume()` reports `true`, the
///   task is put into `StopState::SignalDeliveryStopping` with the signal attached and the
///   function returns without delivering it. Otherwise the signal is delivered right away via
///   `deliver_signal`, and if that yields an exit status the thread group is exited with it.
fn process_completed_exception(
    locked: &mut Locked<Unlocked>,
    current_task: &mut CurrentTask,
    exception_result: ExceptionResult,
    restricted_exception: zx::ExceptionReport,
) {
    match exception_result {
        ExceptionResult::Handled => {}
        ExceptionResult::Signal(signal) => {
            // The write guard is held across the ptrace check and is then handed over to
            // `deliver_signal` by value.
            let mut task_state = current_task.task.write();
            if task_state.ptrace_on_signal_consume() {
                task_state.set_stopped(
                    StopState::SignalDeliveryStopping,
                    Some(signal),
                    Some(&current_task),
                    None,
                );
                return;
            }

            if let Some(status) = deliver_signal(
                current_task.task.as_ref(),
                current_task.thread_state.arch_width(),
                task_state,
                signal.into(),
                &mut current_task.thread_state.registers,
                &current_task.thread_state.extended_pstate,
                Some(restricted_exception),
            ) {
                // `task_state` was moved into `deliver_signal`, so the guard is no longer held
                // by this function at this point.
                current_task.thread_group_exit(locked, status);
            }
        }
    }
}
310
/// Contains context to track the most recently failing system call.
///
/// When a task exits with a non-zero exit code, this context is logged to help debugging which
/// system call may have triggered the failure.
///
/// Instances are produced by `execute_syscall` whenever a system call returns an error.
#[derive(Debug)]
pub struct ErrorContext {
    /// The system call that failed.
    pub syscall: Syscall,

    /// The error that was returned for the system call.
    pub error: Errno,
}
323
/// Executes the system call described by `syscall_decl` in `current_task`.
///
/// The call proceeds in this order:
///
///   1. Build the `Syscall` from the declaration and the task.
///   2. Save the registers needed to restart the call.
///   3. If syscall tracing is enabled for the task, report syscall entry to ptrace.
///   4. If a seccomp state other than `SeccompStateValue::None` is set, run the seccomp filters;
///      when they produce a result it is used instead of dispatching the call.
///   5. Otherwise dispatch the call through the syscall table.
///   6. Trigger the delayed releaser.
///   7. Write the result (the value on success, the errno's return value on failure) into the
///      return register. A restartable errno is also recorded in `thread_state.restart_code`.
///   8. If syscall tracing is enabled for the task, report syscall exit to ptrace.
///
/// Returns an `ErrorContext` if the system call returned an error, `None` otherwise.
#[inline(never)] // Inlining this function breaks the CFI directives used to unwind into user code.
pub fn execute_syscall(
    locked: &mut Locked<Unlocked>,
    current_task: &mut CurrentTask,
    syscall_decl: SyscallDecl,
) -> Option<ErrorContext> {
    firehose_trace_duration!(CATEGORY_STARNIX, syscall_decl.trace_name());
    let syscall = new_syscall(syscall_decl, current_task);

    current_task.thread_state.registers.save_registers_for_restart(syscall.decl.number);

    if current_task.trace_syscalls.load(std::sync::atomic::Ordering::Relaxed) {
        ptrace_syscall_enter(locked, current_task);
    }

    log_syscall!(current_task, "{syscall:?}");

    let result: Result<SyscallResult, Errno> =
        if current_task.seccomp_filter_state.get() != SeccompStateValue::None {
            // Inlined fast path for seccomp, so that we don't incur the cost
            // of a method call when running the filters.
            if let Some(res) = current_task.run_seccomp_filters(locked, &syscall) {
                // The filters supplied the syscall's result; the call itself is not dispatched.
                res
            } else {
                table::dispatch_syscall(locked, current_task, &syscall)
            }
        } else {
            table::dispatch_syscall(locked, current_task, &syscall)
        };

    current_task.trigger_delayed_releaser(locked);

    // `return_value` is `None` on success and `Some(ErrorContext)` on failure.
    let return_value = match result {
        Ok(return_value) => {
            log_syscall!(current_task, "-> {:#x}", return_value.value());
            current_task.thread_state.registers.set_return_register(return_value.value());
            None
        }
        Err(errno) => {
            log_syscall!(current_task, "!-> {errno}");
            if errno.is_restartable() {
                current_task.thread_state.restart_code = Some(errno.code);
            }
            current_task.thread_state.registers.set_return_register(errno.return_value());
            Some(ErrorContext { error: errno, syscall })
        }
    };

    if current_task.trace_syscalls.load(std::sync::atomic::Ordering::Relaxed) {
        // The last argument is `true` when the system call failed.
        ptrace_syscall_exit(locked, current_task, return_value.is_some());
    }

    return_value
}
381
382/// Finishes `current_task` updates after a restricted mode exit such as a syscall, exception, or kick.
383///
384/// Returns an `ExitStatus` if the task is meant to exit.
385pub fn process_completed_restricted_exit(
386    locked: &mut Locked<Unlocked>,
387    current_task: &mut CurrentTask,
388    error_context: &Option<ErrorContext>,
389) -> Result<Option<ExitStatus>, Errno> {
390    let result;
391    loop {
392        // Checking for a signal might cause the task to exit, so check before processing exit
393        {
394            {
395                if !current_task.is_exitted() {
396                    dequeue_signal(locked, current_task);
397                }
398                // The syscall may need to restart for a non-signal-related
399                // reason. This call does nothing if we aren't restarting.
400                prepare_to_restart_syscall(&mut current_task.thread_state, None);
401            }
402        }
403
404        let exit_status = current_task.exit_status();
405        if let Some(exit_status) = exit_status {
406            log_trace!("exiting with status {:?}", exit_status);
407            if let Some(error_context) = error_context {
408                match exit_status {
409                    ExitStatus::Exit(value) if value == 0 => {}
410                    _ => {
411                        log_trace!(
412                            "last failing syscall before exit: {:?}, failed with {:?}",
413                            error_context.syscall,
414                            error_context.error
415                        );
416                    }
417                };
418            }
419
420            result = Some(exit_status);
421            break;
422        } else {
423            // Block a stopped process after it's had a chance to handle signals, since a signal might
424            // cause it to stop.
425            current_task.block_while_stopped(locked);
426            // If ptrace_cont has sent a signal, process it immediately.  This
427            // seems to match Linux behavior.
428
429            let task_state = current_task.read();
430            if task_state
431                .ptrace
432                .as_ref()
433                .is_some_and(|ptrace| ptrace.stop_status == PtraceStatus::Continuing)
434                && task_state.is_any_signal_pending()
435                && !current_task.is_exitted()
436            {
437                continue;
438            }
439            result = None;
440            break;
441        }
442    }
443
444    if let Some(ExitStatus::CoreDump(signal_info)) = &result {
445        if current_task.flags().contains(TaskFlags::DUMP_ON_EXIT) {
446            // Avoid taking a backtrace if the signal was sent by the same task.
447            if !signal_info.is_sent_by(&current_task.weak_task()) {
448                // Request a backtrace before reporting the crash to increase chance of a backtrace
449                // in logs. This call is kept as far up in the call stack as possible to avoid
450                // additional frames that are always the same and not relevant to users.
451                // TODO(https://fxbug.dev/356732164) collect a backtrace ourselves
452                debug::backtrace_request_current_thread();
453            }
454
455            if let Some(pending_report) =
456                current_task.kernel().crash_reporter.begin_crash_report(&current_task)
457            {
458                current_task.kernel().crash_reporter.handle_core_dump(
459                    &current_task,
460                    signal_info,
461                    pending_report,
462                );
463            }
464        }
465    }
466    return Ok(result);
467}