// starnix_syscall_loop/lib.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use anyhow::{Error, format_err};
6use extended_pstate::ExtendedPstateState;
7use starnix_core::arch::execution::new_syscall;
8use starnix_core::ptrace::{PtraceStatus, StopState, ptrace_syscall_enter, ptrace_syscall_exit};
9use starnix_core::signals::{
10    SignalInfo, deliver_signal, dequeue_signal, prepare_to_restart_syscall,
11};
12use starnix_core::task::{CurrentTask, ExceptionResult, ExitStatus, SeccompStateValue, TaskFlags};
13use starnix_logging::{
14    CATEGORY_STARNIX, NAME_HANDLE_EXCEPTION, NAME_RESTRICTED_KICK, NAME_RUN_TASK,
15    firehose_trace_duration, firehose_trace_instant, log_error, log_syscall, log_trace, log_warn,
16    set_current_task_info,
17};
18use starnix_registers::RestrictedState;
19use starnix_sync::{Locked, Unlocked};
20use starnix_syscalls::SyscallResult;
21use starnix_syscalls::decls::{Syscall, SyscallDecl};
22use starnix_uapi::errno;
23use starnix_uapi::errors::Errno;
24use starnix_uapi::signals::SIGKILL;
25
26mod table;
27
28pub fn enter(locked: &mut Locked<Unlocked>, current_task: &mut CurrentTask) -> ExitStatus {
29    match RestrictedState::bind_and_map(&mut current_task.thread_state.registers) {
30        Ok(restricted_state) => match run_task(locked, current_task, restricted_state) {
31            Ok(ok) => ok,
32            Err(error) => {
33                log_warn!("Died unexpectedly from {error:?}! treating as SIGKILL");
34                ExitStatus::Kill(SignalInfo::kernel(SIGKILL))
35            }
36        },
37        Err(error) => {
38            log_error!("failed to map mode state vmo, {error:?}! treating as SIGKILL");
39            ExitStatus::Kill(SignalInfo::kernel(SIGKILL))
40        }
41    }
42}
43
unsafe extern "C" {
    // rustc doesn't like RestrictedEnterContext for FFI but we're just passing it back to
    // ourselves with extra steps.
    #[allow(improper_ctypes)]
    // Repeatedly enters restricted (Linux) mode, invoking `restricted_exit_callback` with
    // `restricted_exit_callback_context` on each restricted-mode exit until the callback
    // returns false, then returns a zx_status_t describing whether the loop ran at all.
    // NOTE(review): implemented outside this crate (assembly/C); the description above is
    // inferred from the call site in `run_task` — confirm against the implementation.
    fn restricted_enter_loop(
        options: u32,
        restricted_exit_callback: extern "C" fn(*mut RestrictedEnterContext<'_>, u64) -> bool,
        restricted_exit_callback_context: *mut RestrictedEnterContext<'_>,
        restricted_state: *mut zx::sys::zx_restricted_exception_t,
        extended_pstate: *mut ExtendedPstateState,
    ) -> zx::sys::zx_status_t;
}
56
57const RESTRICTED_ENTER_OPTIONS: u32 = 0;
58
/// State threaded through the FFI boundary for the duration of one `restricted_enter_loop`
/// call; `restricted_exit_callback_c` unpacks it on every restricted-mode exit.
struct RestrictedEnterContext<'a> {
    /// The task being executed.
    current_task: &'a mut CurrentTask,
    /// The bound and mapped restricted-mode state for this thread.
    restricted_state: RestrictedState,
    /// The most recent failing system call, if any, kept for debug logging on exit.
    error_context: Option<ErrorContext>,
    /// The task's final status, written by the exit callback before the loop stops.
    exit_status: Result<ExitStatus, Error>,
}
65
/// Runs the `current_task` to completion.
///
/// The high-level flow of this function looks as follows:
///
///   1. Write the restricted state for the current thread to set it up to enter into the restricted
///      (Linux) part of the address space.
///   2. Enter restricted mode.
///   3. Return from restricted mode, reading out the new state of the restricted mode execution.
///      This state contains the thread's restricted register state, which is used to determine
///      which system call to dispatch.
///   4. Dispatch the system call.
///   5. Handle pending signals.
///   6. Goto 1.
fn run_task(
    locked: &mut Locked<Unlocked>,
    current_task: &mut CurrentTask,
    restricted_state: RestrictedState,
) -> Result<ExitStatus, Error> {
    // Tag tracing/logging on this thread with the task's identity.
    set_current_task_info(
        current_task.task.command(),
        current_task.task.thread_group().read().leader_command(),
        current_task.task.thread_group().leader,
        current_task.tid,
    );

    firehose_trace_duration!(CATEGORY_STARNIX, NAME_RUN_TASK);

    // This tracks the last failing system call for debugging purposes.
    let error_context = None;

    // We need to check for exit once, before the task starts executing, in case
    // the task has already been sent a signal that will cause it to exit.
    if let Some(exit_status) =
        process_completed_restricted_exit(locked, current_task, &error_context)?
    {
        return Ok(exit_status);
    }

    // The restricted_state_ptr points at our bound state. It will remain the
    // same value for the duration of the restricted loop. The value it points
    // out will be mutated by restricted_enter_loop.
    let restricted_state_ptr = restricted_state.bound_state.as_ptr();
    // This extended pstate pointer points to the storage for extended processor
    // state (vector and FP registers).
    let extended_pstate_ptr = current_task.thread_state.extended_pstate.as_ptr();

    // Ownership of the task, state, and error context moves into the context that is handed
    // to the FFI loop; the exit callback reads and mutates it on every restricted-mode exit.
    let mut restricted_enter_context = RestrictedEnterContext {
        current_task,
        restricted_state,
        error_context,
        // Placeholder; overwritten by the exit callback before the loop returns normally.
        exit_status: Err(errno!(ENOEXEC).into()),
    };

    #[allow(
        clippy::undocumented_unsafe_blocks,
        reason = "Force documented unsafe blocks in Starnix"
    )]
    // SAFETY: the context, bound restricted state, and extended pstate storage all live across
    // this call, and the raw pointers are only used for the duration of the loop.
    let restricted_enter_status = zx::Status::from_raw(unsafe {
        restricted_enter_loop(
            RESTRICTED_ENTER_OPTIONS,
            restricted_exit_callback_c,
            &mut restricted_enter_context,
            restricted_state_ptr,
            extended_pstate_ptr,
        )
    });
    if restricted_enter_status != zx::Status::OK {
        // If restricted_enter_loop failed, it means that we failed to satisfy
        // a prerequisite of zx_restricted_enter which should never happen.
        log_error!(
            "restricted_enter_loop failed: {}, register state: {:?}",
            restricted_enter_status,
            restricted_enter_context.current_task.thread_state.registers
        );
    }
    restricted_enter_context.exit_status
}
143
/// C-ABI trampoline invoked by `restricted_enter_loop` on every restricted-mode exit.
///
/// Unpacks the `RestrictedEnterContext` and forwards to the safe Rust handler. Returns `true`
/// to re-enter restricted mode, `false` to stop the loop.
extern "C" fn restricted_exit_callback_c(
    context: *mut RestrictedEnterContext<'_>,
    reason_code: zx::sys::zx_restricted_reason_t,
) -> bool {
    // SAFETY: `context` is a pointer to a `RestrictedEnterContext` that was passed to
    // `restricted_enter_loop`. Our restricted return assembly and Zircon together guarantee that
    // this thread has exclusive access to the restricted enter context.
    let restricted_context = unsafe { &mut *context };
    restricted_exit_callback(
        reason_code,
        restricted_context.current_task,
        &mut restricted_context.restricted_state,
        &mut restricted_context.error_context,
        &mut restricted_context.exit_status,
    )
}
160
161fn restricted_exit_callback(
162    reason_code: zx::sys::zx_restricted_reason_t,
163    current_task: &mut CurrentTask,
164    restricted_state: &mut RestrictedState,
165    error_context: &mut Option<ErrorContext>,
166    exit_status: &mut Result<ExitStatus, Error>,
167) -> bool {
168    debug_assert_eq!(
169        current_task.thread_state.restart_code, None,
170        "restart_code should only ever be Some() in normal mode",
171    );
172
173    let ret =
174        match process_restricted_exit(reason_code, current_task, restricted_state, error_context) {
175            Ok(None) => {
176                // Keep going!
177                true
178            }
179            Ok(Some(completed_exit_status)) => {
180                *exit_status = Ok(completed_exit_status);
181                false
182            }
183            Err(error) => {
184                *exit_status = Err(error);
185                false
186            }
187        };
188
189    debug_assert_eq!(
190        current_task.thread_state.restart_code, None,
191        "restart_code should only ever be Some() in normal mode",
192    );
193
194    ret
195}
196
/// Handles a single exit from restricted mode, dispatching on `reason_code`.
///
/// Returns `Ok(Some(exit_status))` when the task should exit, `Ok(None)` when the task should
/// re-enter restricted mode, and `Err(_)` for an unrecognized reason code.
fn process_restricted_exit(
    reason_code: zx::sys::zx_restricted_reason_t,
    current_task: &mut CurrentTask,
    restricted_state: &mut RestrictedState,
    error_context: &mut Option<ErrorContext>,
) -> Result<Option<ExitStatus>, Error> {
    // We can't hold any locks entering restricted mode so we can't be holding any locks on exit.
    #[allow(
        clippy::undocumented_unsafe_blocks,
        reason = "Force documented unsafe blocks in Starnix"
    )]
    // SAFETY: per the comment above, no locks are held at this point, so materializing a fresh
    // `Unlocked` token here cannot violate lock ordering.
    let locked = unsafe { Unlocked::new() };

    // Re-synchronize our view of the stack pointer with the restricted register state.
    current_task.thread_state.registers.sync_stack_ptr();

    match reason_code {
        zx::sys::ZX_RESTRICTED_REASON_SYSCALL => {
            // Decode the requested syscall from the task's syscall register, honoring the
            // task's architecture width.
            let syscall_decl = SyscallDecl::from_number(
                current_task.thread_state.registers.syscall_register(),
                current_task.thread_state.arch_width(),
            );

            // Remember the most recent failing syscall for post-exit debug logging.
            if let Some(new_error_context) = execute_syscall(locked, current_task, syscall_decl) {
                *error_context = Some(new_error_context);
            }
        }
        zx::sys::ZX_RESTRICTED_REASON_EXCEPTION => {
            firehose_trace_duration!(CATEGORY_STARNIX, NAME_HANDLE_EXCEPTION);
            let restricted_exception = restricted_state.read_exception();
            let exception_result = current_task.process_exception(locked, &restricted_exception);
            process_completed_exception(
                locked,
                current_task,
                exception_result,
                restricted_exception,
            );
        }
        zx::sys::ZX_RESTRICTED_REASON_KICK => {
            firehose_trace_instant!(
                CATEGORY_STARNIX,
                NAME_RESTRICTED_KICK,
                fuchsia_trace::Scope::Thread
            );
            // Fall through to the post-syscall / post-exception handling logic. We were likely
            // kicked because a signal is pending deliver or the task has exited. Spurious kicks are
            // also possible.
        }
        _ => {
            return Err(format_err!("Received unexpected restricted reason code: {}", reason_code));
        }
    }

    // Common post-exit handling: pending signals, stops, and exit conditions.
    if let Some(exit_status) =
        process_completed_restricted_exit(locked, current_task, &error_context)?
    {
        return Ok(Some(exit_status));
    }

    Ok(None)
}
257
258fn process_completed_exception(
259    locked: &mut Locked<Unlocked>,
260    current_task: &mut CurrentTask,
261    exception_result: ExceptionResult,
262    restricted_exception: zx::ExceptionReport,
263) {
264    match exception_result {
265        ExceptionResult::Handled => {}
266        ExceptionResult::Signal(signal) => {
267            let mut task_state = current_task.task.write();
268            if task_state.ptrace_on_signal_consume() {
269                task_state.set_stopped(
270                    StopState::SignalDeliveryStopping,
271                    Some(signal),
272                    Some(&current_task),
273                    None,
274                );
275                return;
276            }
277
278            if let Some(status) = deliver_signal(
279                current_task.task.as_ref(),
280                current_task.thread_state.arch_width(),
281                task_state,
282                signal.into(),
283                &mut current_task.thread_state.registers,
284                &current_task.thread_state.extended_pstate,
285                Some(restricted_exception),
286            ) {
287                current_task.thread_group_exit(locked, status);
288            }
289        }
290    }
291}
292
/// Contains context to track the most recently failing system call.
///
/// When a task exits with a non-zero exit code, this context is logged to help debugging which
/// system call may have triggered the failure. See `process_completed_restricted_exit` for
/// where it is reported.
#[derive(Debug)]
pub struct ErrorContext {
    /// The system call that failed.
    pub syscall: Syscall,

    /// The error that was returned for the system call.
    pub error: Errno,
}
305
/// Executes the provided `syscall` in `current_task`.
///
/// Returns an `ErrorContext` if the system call returned an error.
#[inline(never)] // Inlining this function breaks the CFI directives used to unwind into user code.
pub fn execute_syscall(
    locked: &mut Locked<Unlocked>,
    current_task: &mut CurrentTask,
    syscall_decl: SyscallDecl,
) -> Option<ErrorContext> {
    firehose_trace_duration!(CATEGORY_STARNIX, syscall_decl.trace_name());
    // Capture the syscall arguments from the task's current register state.
    let syscall = new_syscall(syscall_decl, current_task);

    // Snapshot registers so the syscall can be transparently restarted later.
    current_task.thread_state.registers.save_registers_for_restart(syscall.decl.number);

    // Notify a ptrace tracer of syscall entry when syscall tracing is enabled.
    if current_task.trace_syscalls.load(std::sync::atomic::Ordering::Relaxed) {
        ptrace_syscall_enter(locked, current_task);
    }

    log_syscall!(current_task, "{syscall:?}");

    let result: Result<SyscallResult, Errno> =
        if current_task.seccomp_filter_state.get() != SeccompStateValue::None {
            // Inlined fast path for seccomp, so that we don't incur the cost
            // of a method call when running the filters.
            if let Some(res) = current_task.run_seccomp_filters(locked, &syscall) {
                res
            } else {
                table::dispatch_syscall(locked, current_task, &syscall)
            }
        } else {
            table::dispatch_syscall(locked, current_task, &syscall)
        };

    // Run any object releases that were deferred while the syscall executed.
    current_task.trigger_delayed_releaser(locked);

    // Write the result into the task's return register; on error, additionally record a
    // restart code for restartable errnos and build the ErrorContext for debug logging.
    let return_value = match result {
        Ok(return_value) => {
            log_syscall!(current_task, "-> {:#x}", return_value.value());
            current_task.thread_state.registers.set_return_register(return_value.value());
            None
        }
        Err(errno) => {
            log_syscall!(current_task, "!-> {errno}");
            if errno.is_restartable() {
                current_task.thread_state.restart_code = Some(errno.code);
            }
            current_task.thread_state.registers.set_return_register(errno.return_value());
            Some(ErrorContext { error: errno, syscall })
        }
    };

    // Notify the tracer of syscall exit, flagging whether the syscall failed.
    if current_task.trace_syscalls.load(std::sync::atomic::Ordering::Relaxed) {
        ptrace_syscall_exit(locked, current_task, return_value.is_some());
    }

    return_value
}
363
364/// Finishes `current_task` updates after a restricted mode exit such as a syscall, exception, or kick.
365///
366/// Returns an `ExitStatus` if the task is meant to exit.
367pub fn process_completed_restricted_exit(
368    locked: &mut Locked<Unlocked>,
369    current_task: &mut CurrentTask,
370    error_context: &Option<ErrorContext>,
371) -> Result<Option<ExitStatus>, Errno> {
372    let result;
373    loop {
374        // Checking for a signal might cause the task to exit, so check before processing exit
375        {
376            {
377                if !current_task.is_exitted() {
378                    dequeue_signal(locked, current_task);
379                }
380                // The syscall may need to restart for a non-signal-related
381                // reason. This call does nothing if we aren't restarting.
382                prepare_to_restart_syscall(&mut current_task.thread_state, None);
383            }
384        }
385
386        let exit_status = current_task.exit_status();
387        if let Some(exit_status) = exit_status {
388            log_trace!("exiting with status {:?}", exit_status);
389            if let Some(error_context) = error_context {
390                match exit_status {
391                    ExitStatus::Exit(value) if value == 0 => {}
392                    _ => {
393                        log_trace!(
394                            "last failing syscall before exit: {:?}, failed with {:?}",
395                            error_context.syscall,
396                            error_context.error
397                        );
398                    }
399                };
400            }
401
402            result = Some(exit_status);
403            break;
404        } else {
405            // Block a stopped process after it's had a chance to handle signals, since a signal might
406            // cause it to stop.
407            current_task.block_while_stopped(locked);
408            // If ptrace_cont has sent a signal, process it immediately.  This
409            // seems to match Linux behavior.
410
411            let task_state = current_task.read();
412            if task_state
413                .ptrace
414                .as_ref()
415                .is_some_and(|ptrace| ptrace.stop_status == PtraceStatus::Continuing)
416                && task_state.is_any_signal_pending()
417                && !current_task.is_exitted()
418            {
419                continue;
420            }
421            result = None;
422            break;
423        }
424    }
425
426    if let Some(ExitStatus::CoreDump(signal_info)) = &result {
427        if current_task.flags().contains(TaskFlags::DUMP_ON_EXIT) {
428            // Avoid taking a backtrace if the signal was sent by the same task.
429            if !signal_info.is_sent_by(&current_task.weak_task()) {
430                // Request a backtrace before reporting the crash to increase chance of a backtrace
431                // in logs. This call is kept as far up in the call stack as possible to avoid
432                // additional frames that are always the same and not relevant to users.
433                // TODO(https://fxbug.dev/356732164) collect a backtrace ourselves
434                debug::backtrace_request_current_thread();
435            }
436
437            if let Some(pending_report) =
438                current_task.kernel().crash_reporter.begin_crash_report(&current_task)
439            {
440                current_task.kernel().crash_reporter.handle_core_dump(
441                    &current_task,
442                    signal_info,
443                    pending_report,
444                );
445            }
446        }
447    }
448    return Ok(result);
449}