Skip to main content

starnix_core/task/
kernel.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::bpf::EbpfState;
6use crate::device::remote_block_device::RemoteBlockDeviceRegistry;
7use crate::device::{DeviceMode, DeviceRegistry};
8use crate::execution::CrashReporter;
9use crate::mm::{FutexTable, MappingSummary, MlockPinFlavor, SharedFutexKey};
10use crate::power::SuspendResumeManagerHandle;
11use crate::ptrace::StopState;
12use crate::security::{self, AuditLogger};
13use crate::task::container_namespace::ContainerNamespace;
14use crate::task::limits::SystemLimits;
15use crate::task::memory_attribution::MemoryAttributionManager;
16use crate::task::net::NetstackDevices;
17use crate::task::tracing::PidToKoidMap;
18use crate::task::{
19    AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, DelayedReleaser,
20    IpTables, KernelCgroups, KernelStats, KernelThreads, PidTable, SchedulerManager, Syslog,
21    ThreadGroup, UtsNamespace, UtsNamespaceHandle,
22};
23use crate::time::{HrTimerManager, HrTimerManagerHandle};
24use crate::vdso::vdso_loader::Vdso;
25use crate::vfs::fs_args::MountParams;
26use crate::vfs::socket::{
27    GenericMessage, GenericNetlink, NetlinkAccessControl, NetlinkContextImpl,
28    NetlinkToClientSender, SocketAddress, SocketTokensStore,
29};
30use crate::vfs::{CacheConfig, FileOps, FsNodeHandle, FsString, Mounts, NamespaceNode};
31use bstr::{BString, ByteSlice};
32use devicetree::types::Devicetree;
33use expando::Expando;
34use fidl::endpoints::{
35    ClientEnd, ControlHandle, DiscoverableProtocolMarker, ProtocolMarker, create_endpoints,
36};
37use fidl_fuchsia_component_runner::{ComponentControllerControlHandle, ComponentStopInfo};
38use fidl_fuchsia_feedback::CrashReporterProxy;
39use fidl_fuchsia_time_external::AdjustSynchronousProxy;
40use fuchsia_inspect::ArrayProperty;
41use futures::FutureExt;
42use netlink::interfaces::InterfacesHandler;
43use netlink::{NETLINK_LOG_TAG, Netlink};
44use once_cell::sync::OnceCell;
45use starnix_lifecycle::{AtomicU32Counter, AtomicU64Counter};
46use starnix_logging::{SyscallLogFilter, log_debug, log_error, log_info, log_warn};
47use starnix_sync::{
48    FileOpsCore, KernelSwapFiles, LockEqualOrBefore, Locked, Mutex, OrderedMutex, RwLock,
49};
50use starnix_types::ownership::TempRef;
51use starnix_uapi::device_type::DeviceType;
52use starnix_uapi::errors::{Errno, errno};
53use starnix_uapi::open_flags::OpenFlags;
54use starnix_uapi::{VMADDR_CID_HOST, from_status_like_fdio};
55use std::borrow::Cow;
56use std::collections::{HashMap, HashSet};
57use std::num::NonZeroU64;
58use std::path::PathBuf;
59use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU16, Ordering};
60use std::sync::{Arc, OnceLock, Weak};
61use zx::CpuFeatureFlags;
62use {
63    fidl_fuchsia_io as fio, fidl_fuchsia_memory_attribution as fattribution,
64    fuchsia_async as fasync,
65};
66
/// Feature switches and defaults for a single Starnix kernel instance.
///
/// Kernel features are specified in the component manifest of the starnix container
/// or explicitly provided to the kernel constructor in tests.
#[derive(Debug, Default, Clone)]
pub struct KernelFeatures {
    /// NOTE(review): undocumented in the original. Presumably selects a second-generation BPF
    /// implementation -- confirm against the code that reads this flag.
    pub bpf_v2: bool,

    /// Whether the kernel supports the S_ISUID and S_ISGID bits.
    ///
    /// For example, these bits are used by `sudo`.
    ///
    /// Enabling this feature is potentially a security risk because these bits allow privilege
    /// escalation.
    pub enable_suid: bool,

    /// Whether io_uring is enabled.
    ///
    /// TODO(https://fxbug.dev/297431387): Enable by default once the feature is completed.
    pub io_uring: bool,

    /// Whether the kernel should return an error to userspace, rather than panicking, if `reboot()`
    /// is requested but cannot be enacted because the kernel lacks the relevant capabilities.
    pub error_on_failed_reboot: bool,

    /// The default seclabel that is applied to components that are run in this kernel.
    ///
    /// Components can override this by setting the `seclabel` field in their program block.
    pub default_seclabel: Option<String>,

    /// Whether the kernel is being used to run the SELinux Test Suite.
    ///
    /// TODO: https://fxbug.dev/388077431 - remove this once we no longer need workarounds for the
    /// SELinux Test Suite.
    pub selinux_test_suite: bool,

    /// The default mount options to use when mounting directories from a component's namespace.
    ///
    /// The key is the path in the component's namespace, and the value is the mount options
    /// string. Looked up by `KernelFeatures::ns_mount_options`.
    pub default_ns_mount_options: Option<HashMap<String, String>>,

    /// The default uid that is applied to components that are run in this kernel.
    ///
    /// Components can override this by setting the `uid` field in their program block.
    pub default_uid: u32,

    /// When set, mlock() never prefaults pages.
    pub mlock_always_onfault: bool,

    /// Implementation of mlock() to use for this kernel instance.
    pub mlock_pin_flavor: MlockPinFlavor,

    /// Whether excessive crash reports should be throttled.
    pub crash_report_throttling: bool,

    /// Whether or not to serve wifi support to Android.
    pub wifi: bool,

    /// The number of bytes to cache in pages for reading zx::MapInfo from VMARs.
    pub cached_zx_map_info_bytes: u32,

    /// The size of the Dirent LRU cache.
    pub dirent_cache_size: u32,

    /// Whether to expose a stub '/dev/ion' node, as a temporary workaround for compatibility.
    // TODO(https://fxbug.dev/485370648) remove when unnecessary
    pub fake_ion: bool,
}
134
135impl KernelFeatures {
136    /// Returns the `MountParams` to use when mounting the specified path from a component's
137    /// namespace.  This mechanism is also used to specified options for mounts created via
138    /// container features, by specifying a pseudo-path e.g. "#container".
139    pub fn ns_mount_options(&self, ns_path: &str) -> Result<MountParams, Errno> {
140        if let Some(all_options) = &self.default_ns_mount_options {
141            if let Some(options) = all_options.get(ns_path) {
142                return MountParams::parse(options.as_bytes().into());
143            }
144        }
145        Ok(MountParams::default())
146    }
147}
148
/// A single kernel command line argument, borrowed from the underlying command line text.
pub struct ArgNameAndValue<'a> {
    /// The argument's name.
    pub name: &'a str,
    /// The argument's value, or `None` when the argument has no value.
    pub value: Option<&'a str>,
}
154
/// The shared, mutable state for the entire Starnix kernel.
///
/// The `Kernel` object holds all kernel threads, userspace tasks, and file system resources for a
/// single instance of the Starnix kernel. In production, there is one instance of this object for
/// the entire Starnix kernel. However, multiple instances of this object can be created in one
/// process during unit testing.
///
/// The structure of this object will likely need to evolve as we implement more namespacing and
/// isolation mechanisms, such as `namespaces(7)` and `pid_namespaces(7)`.
pub struct Kernel {
    /// Weak reference to self, so that APIs do not have to take `&Arc<Kernel>`.
    pub weak_self: Weak<Kernel>,

    /// The kernel threads running on behalf of this kernel.
    pub kthreads: KernelThreads,

    /// The features enabled for this kernel.
    pub features: KernelFeatures,

    /// The processes and threads running in this kernel, organized by pid_t.
    pub pids: RwLock<PidTable>,

    /// Used to record the pid/tid to Koid mappings. Set when collecting trace data.
    pub pid_to_koid_mapping: Arc<RwLock<Option<PidToKoidMap>>>,

    /// Subsystem-specific properties that hang off the Kernel object.
    ///
    /// Instead of adding yet another property to the Kernel object, consider storing the property
    /// in an expando if that property is only used by one part of the system, such as a module.
    pub expando: Expando,

    /// The default namespace for abstract AF_UNIX sockets in this kernel.
    ///
    /// Rather than use this default namespace, abstract socket addresses
    /// should be looked up in the AbstractSocketNamespace on each Task
    /// object because some Task objects might have a non-default namespace.
    pub default_abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,

    /// The default namespace for abstract AF_VSOCK sockets in this kernel.
    pub default_abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,

    /// The kernel command line. Shows up in /proc/cmdline.
    pub cmdline: BString,

    /// The devicetree handed to `Kernel::new`, if one was provided.
    pub device_tree: Option<Devicetree>,

    /// Global state held by the Linux Security Modules subsystem.
    pub security_state: security::KernelState,

    /// The registry of device drivers.
    pub device_registry: DeviceRegistry,

    /// Mapping of top-level namespace entries to an associated proxy.
    /// For example, "/svc" to the respective proxy. Only the namespace entries
    /// which were known at component startup will be available by the kernel.
    pub container_namespace: ContainerNamespace,

    /// The registry of block devices backed by a remote fuchsia.io file.
    pub remote_block_device_registry: Arc<RemoteBlockDeviceRegistry>,

    /// The iptables used for filtering network packets.
    ///
    /// Held in a `OnceLock` that `Kernel::new` leaves unset.
    iptables: OnceLock<IpTables>,

    /// The futexes shared across processes.
    pub shared_futexes: Arc<FutexTable<SharedFutexKey>>,

    /// The default UTS namespace for all tasks.
    ///
    /// Because each task can have its own UTS namespace, you probably want to use
    /// the UTS namespace handle of the task, which may/may not point to this one.
    pub root_uts_ns: UtsNamespaceHandle,

    /// A struct containing a VMO with a vDSO implementation, if implemented for a given
    /// architecture, and possibly an offset for a sigreturn function.
    pub vdso: Vdso,

    /// A struct containing a VMO with an arch32 vDSO implementation, if implemented for a given
    /// architecture.
    // TODO(https://fxbug.dev/380431743) This could be made less clunky -- maybe a Vec<Vdso> above or
    // something else
    pub vdso_arch32: Option<Vdso>,

    /// The table of devices installed on the netstack and their associated
    /// state local to this `Kernel`.
    pub netstack_devices: Arc<NetstackDevices>,

    /// Files that are currently available for swapping.
    /// Note: Starnix never actually swaps memory to these files. We just need to track them
    /// to pass conformance tests.
    pub swap_files: OrderedMutex<Vec<FsNodeHandle>, KernelSwapFiles>,

    /// The implementation of generic Netlink protocol families.
    ///
    /// Populated on first use by `Kernel::generic_netlink`.
    generic_netlink: OnceLock<GenericNetlink<NetlinkToClientSender<GenericMessage>>>,

    /// The implementation of networking-related Netlink protocol families.
    ///
    /// Populated on first use by `Kernel::network_netlink`.
    network_netlink: OnceLock<Netlink<NetlinkContextImpl>>,

    /// Inspect instrumentation for this kernel instance.
    pub inspect_node: fuchsia_inspect::Node,

    /// The kinds of seccomp action that get logged, stored as a bit vector.
    /// Each potential SeccompAction gets a bit in the vector, as specified by
    /// SeccompAction::logged_bit_offset.  If the bit is set, that means the
    /// action should be logged when it is taken, subject to the caveats
    /// described in seccomp(2).  The value of the bit vector is exposed to users
    /// in a text form in the file /proc/sys/kernel/seccomp/actions_logged.
    pub actions_logged: AtomicU16,

    /// The manager for suspend/resume.
    pub suspend_resume_manager: SuspendResumeManagerHandle,

    /// Unique IDs for new mounts and mount namespaces.
    pub next_mount_id: AtomicU64Counter,
    /// NOTE(review): judging by the name, unique IDs for mount peer groups -- confirm.
    pub next_peer_group_id: AtomicU64Counter,
    /// NOTE(review): judging by the name, unique IDs for mount namespaces -- confirm.
    pub next_namespace_id: AtomicU64Counter,

    /// Unique IDs for file objects.
    pub next_file_object_id: AtomicU64Counter,

    /// Unique cookie used to link two inotify events, usually an IN_MOVE_FROM/IN_MOVE_TO pair.
    pub next_inotify_cookie: AtomicU32Counter,

    /// Controls which processes a process is allowed to ptrace.  See
    /// Documentation/security/Yama.txt
    pub ptrace_scope: AtomicU8,

    /// The Fuchsia build version returned by `fuchsia.buildinfo.Provider`.
    pub build_version: OnceCell<String>,

    /// Shared `KernelStats` for this kernel instance.
    pub stats: Arc<KernelStats>,

    /// Resource limits that are exposed, for example, via sysctl.
    pub system_limits: SystemLimits,

    /// The service to handle delayed releases. This is required for elements that need to
    /// execute some code when released and require a known context (both in terms of lock
    /// context and `CurrentTask`).
    pub delayed_releaser: DelayedReleaser,

    /// Manages task priorities.
    pub scheduler: SchedulerManager,

    /// The syslog manager.
    pub syslog: Syslog,

    /// All mounts.
    pub mounts: Mounts,

    /// The manager for creating and managing high-resolution timers.
    pub hrtimer_manager: HrTimerManagerHandle,

    /// The manager for monitoring and reporting resources used by the kernel.
    pub memory_attribution_manager: MemoryAttributionManager,

    /// Handler for crashing Linux processes.
    pub crash_reporter: CrashReporter,

    /// Whether this kernel is shutting down. When shutting down, new processes may not be spawned.
    ///
    /// Set by `run_shutdown` and read through `is_shutting_down`.
    shutting_down: AtomicBool,

    /// True to disable syslog access to unprivileged callers.  This also controls whether read
    /// access to /dev/kmsg requires privileged capabilities.
    pub restrict_dmesg: AtomicBool,

    /// Determines whether unprivileged BPF is permitted, or can be re-enabled.
    ///   0 - Unprivileged BPF is permitted.
    ///   1 - Unprivileged BPF is not permitted, and cannot be enabled.
    ///   2 - Unprivileged BPF is not permitted, but can be enabled by a privileged task.
    pub disable_unprivileged_bpf: AtomicU8,

    /// Control handle to the running container's ComponentController.
    ///
    /// Taken (and thereby cleared) by `run_shutdown` when it reports the stop to the controller.
    pub container_control_handle: Mutex<Option<ComponentControllerControlHandle>>,

    /// eBPF state: loaded programs, eBPF maps, etc.
    pub ebpf_state: EbpfState,

    /// Cgroups of the kernel.
    pub cgroups: KernelCgroups,

    /// Used to communicate requests to adjust system time from within a Starnix
    /// container. Used from syscalls.
    pub time_adjustment_proxy: Option<AdjustSynchronousProxy>,

    /// Used to store tokens for sockets, particularly per-uid sharing domain sockets.
    pub socket_tokens_store: SocketTokensStore,

    /// Hardware capabilities to push onto stack when loading an ELF binary.
    pub hwcaps: HwCaps,

    /// Filters for syscall logging. Processes with names matching these filters will have syscalls
    /// logged at INFO level.
    pub syscall_log_filters: Mutex<Vec<SyscallLogFilter>>,
}
345
/// Hardware capabilities for one ELF flavor, as the pair of values reported through the
/// `AT_HWCAP` and `AT_HWCAP2` auxiliary vector entries.
#[derive(Debug, Clone, Copy, Default)]
pub struct HwCap {
    /// The value for `AT_HWCAP`.
    pub hwcap: u32,
    /// The value for `AT_HWCAP2`.
    pub hwcap2: u32,
}
354
/// Hardware capabilities for both 32-bit and 64-bit ELF binaries.
///
/// The 32-bit entry only exists when compiling for `aarch64`.
#[derive(Debug, Clone, Copy, Default)]
pub struct HwCaps {
    /// For 32-bit binaries.
    #[cfg(target_arch = "aarch64")]
    pub arch32: HwCap,
    /// For 64-bit binaries.
    pub arch64: HwCap,
}
364
/// An implementation of [`InterfacesHandler`] that forwards link events to the owning
/// [`Kernel`]'s `netstack_devices`.
///
/// This holds a `Weak<Kernel>` because it is held within a [`Netlink`] which
/// is itself held within an `Arc<Kernel>`. Holding an `Arc<T>` within an
/// `Arc<T>` prevents the `Arc`'s ref count from ever reaching 0, causing a
/// leak.
struct InterfacesHandlerImpl(Weak<Kernel>);
372
373impl InterfacesHandlerImpl {
374    fn kernel(&self) -> Option<Arc<Kernel>> {
375        self.0.upgrade()
376    }
377}
378
379impl InterfacesHandler for InterfacesHandlerImpl {
380    fn handle_new_link(&mut self, name: &str, interface_id: NonZeroU64) {
381        if let Some(kernel) = self.kernel() {
382            kernel.netstack_devices.add_device(&kernel, name.into(), interface_id);
383        }
384    }
385
386    fn handle_deleted_link(&mut self, name: &str) {
387        if let Some(kernel) = self.kernel() {
388            kernel.netstack_devices.remove_device(&kernel, name.into());
389        }
390    }
391
392    fn handle_idle_event(&mut self) {
393        let Some(kernel) = self.kernel() else {
394            log_error!("kernel went away while netlink is initializing");
395            return;
396        };
397        let (initialized, wq) = &kernel.netstack_devices.initialized_and_wq;
398        if initialized.swap(true, Ordering::SeqCst) {
399            log_error!("netlink initial devices should only be reported once");
400            return;
401        }
402        wq.notify_all()
403    }
404}
405
406impl Kernel {
    /// Builds a new `Kernel` and returns it wrapped in an `Arc`.
    ///
    /// The arguments are stored in the correspondingly named fields, except for
    /// `crash_reporter_proxy`, which is handed to the `CrashReporter`. Everything else starts
    /// from its default or initial value. After construction this initializes the device
    /// registry's objects and records two lazy inspect children, "thread_groups" and
    /// "cgroupv2", on `inspect_node`.
    ///
    /// As written, the only return is `Ok(this)`; no code path here produces an `Err`.
    pub fn new(
        cmdline: BString,
        features: KernelFeatures,
        system_limits: SystemLimits,
        container_namespace: ContainerNamespace,
        scheduler: SchedulerManager,
        crash_reporter_proxy: Option<CrashReporterProxy>,
        inspect_node: fuchsia_inspect::Node,
        security_state: security::KernelState,
        time_adjustment_proxy: Option<AdjustSynchronousProxy>,
        device_tree: Option<Devicetree>,
    ) -> Result<Arc<Kernel>, zx::Status> {
        // Factories handed to the abstract socket namespaces to wrap a raw name or port in the
        // matching `SocketAddress` variant. Vsock addresses always use `VMADDR_CID_HOST`.
        let unix_address_maker =
            Box::new(|x: FsString| -> SocketAddress { SocketAddress::Unix(x) });
        let vsock_address_maker = Box::new(|x: u32| -> SocketAddress {
            SocketAddress::Vsock { port: x, cid: VMADDR_CID_HOST }
        });

        // These borrow `inspect_node`, so they are built before it is moved into the struct.
        // NOTE(review): the 8-minute duration is presumably the crash-report throttling window
        // -- confirm against `CrashReporter::new`.
        let crash_reporter = CrashReporter::new(
            &inspect_node,
            crash_reporter_proxy,
            zx::Duration::from_minutes(8),
            features.crash_report_throttling,
        );
        let hrtimer_manager = HrTimerManager::new(&inspect_node);

        // Fall back to an empty flag set when the feature flags cannot be queried.
        let cpu_feature_flags =
            zx::system_get_feature_flags::<CpuFeatureFlags>().unwrap_or_else(|e| {
                log_debug!("CPU feature flags are only supported on ARM64: {}, reporting 0", e);
                CpuFeatureFlags::empty()
            });
        let hwcaps = HwCaps::from_cpu_feature_flags(cpu_feature_flags);

        // `Arc::new_cyclic` hands the closure a `Weak<Kernel>` (`kernel`) so that `weak_self` and
        // the subsystems that need a back-reference can be set up during construction.
        let this = Arc::new_cyclic(|kernel| Kernel {
            weak_self: kernel.clone(),
            kthreads: KernelThreads::new(kernel.clone()),
            features,
            pids: Default::default(),
            pid_to_koid_mapping: Arc::new(RwLock::new(None)),
            expando: Default::default(),
            default_abstract_socket_namespace: AbstractUnixSocketNamespace::new(unix_address_maker),
            default_abstract_vsock_namespace: AbstractVsockSocketNamespace::new(
                vsock_address_maker,
            ),
            cmdline,
            device_tree,
            security_state,
            device_registry: Default::default(),
            container_namespace,
            remote_block_device_registry: Default::default(),
            iptables: OnceLock::new(),
            shared_futexes: Arc::<FutexTable<SharedFutexKey>>::default(),
            root_uts_ns: Arc::new(RwLock::new(UtsNamespace::default())),
            vdso: Vdso::new(),
            vdso_arch32: Vdso::new_arch32(),
            netstack_devices: Arc::default(),
            swap_files: Default::default(),
            generic_netlink: OnceLock::new(),
            network_netlink: OnceLock::new(),
            inspect_node,
            actions_logged: AtomicU16::new(0),
            suspend_resume_manager: Default::default(),
            next_mount_id: AtomicU64Counter::new(1),
            next_peer_group_id: AtomicU64Counter::new(1),
            next_namespace_id: AtomicU64Counter::new(1),
            next_inotify_cookie: AtomicU32Counter::new(1),
            next_file_object_id: Default::default(),
            system_limits,
            ptrace_scope: AtomicU8::new(0), // Disable YAMA checks by default.
            restrict_dmesg: AtomicBool::new(false),
            disable_unprivileged_bpf: AtomicU8::new(0), // Enable unprivileged BPF by default.
            build_version: OnceCell::new(),
            stats: Arc::new(KernelStats::default()),
            delayed_releaser: Default::default(),
            scheduler,
            syslog: Default::default(),
            mounts: Mounts::new(),
            hrtimer_manager,
            memory_attribution_manager: MemoryAttributionManager::new(kernel.clone()),
            crash_reporter,
            shutting_down: AtomicBool::new(false),
            container_control_handle: Mutex::new(None),
            ebpf_state: Default::default(),
            cgroups: Default::default(),
            time_adjustment_proxy,
            socket_tokens_store: Default::default(),
            hwcaps,
            syscall_log_filters: Default::default(),
        });

        // Initialize the device registry before registering any devices.
        //
        // We will create sysfs recursively within this function.
        this.device_registry.objects.init(&mut this.kthreads.unlocked_for_async(), &this);

        // Make a copy of this Arc for the inspect lazy node to use but don't create an Arc cycle
        // because the inspect node that owns this reference is owned by the kernel.
        let kernel = Arc::downgrade(&this);
        this.inspect_node.record_lazy_child("thread_groups", move || {
            if let Some(kernel) = kernel.upgrade() {
                let inspector = kernel.get_thread_groups_inspect();
                async move { Ok(inspector) }.boxed()
            } else {
                async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed()
            }
        });

        // Same weak-reference pattern for the cgroup v2 inspect data.
        let kernel = Arc::downgrade(&this);
        this.inspect_node.record_lazy_child("cgroupv2", move || {
            if let Some(kernel) = kernel.upgrade() {
                async move { Ok(kernel.cgroups.cgroup2.get_cgroup_inspect()) }.boxed()
            } else {
                async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed()
            }
        });

        Ok(this)
    }
525
526    /// Shuts down userspace and the kernel in an orderly fashion, eventually terminating the root
527    /// kernel process.
528    pub fn shut_down(self: &Arc<Self>) {
529        // Run shutdown code on a kthread in the main process so that it can be the last process
530        // alive.
531        self.kthreads.spawn_future(
532            {
533                let kernel = self.clone();
534                move || async move {
535                    kernel.run_shutdown().await;
536                }
537            },
538            "run_shutdown",
539        );
540    }
541
542    /// Starts shutting down the Starnix kernel and any running container. Only one thread can drive
543    /// shutdown at a time. This function will return immediately if shut down is already under way.
544    ///
545    /// Shutdown happens in several phases:
546    ///
547    /// 1. Disable launching new processes
548    /// 2. Shut down individual ThreadGroups until only the init and system tasks remain
549    /// 3. Repeat the above for the init task
550    /// 4. Clean up kernel-internal structures that can hold processes alive
551    /// 5. Ensure this process is the only one running in the kernel job.
552    /// 6. Unmounts the kernel's mounts' FileSystems.
553    /// 7. Tell CF the container component has stopped
554    /// 8. Exit this process
555    ///
556    /// If a ThreadGroup does not shut down on its own (including after SIGKILL), that phase of
557    /// shutdown will hang. To gracefully shut down any further we need the other kernel processes
558    /// to do controlled exits that properly release access to shared state. If our orderly shutdown
559    /// does hang, eventually CF will kill the container component which will lead to the job of
560    /// this process being killed and shutdown will still complete.
561    async fn run_shutdown(&self) {
562        const INIT_PID: i32 = 1;
563        const SYSTEM_TASK_PID: i32 = 2;
564
565        // Step 1: Prevent new processes from being created once they observe this update. We don't
566        // want the thread driving shutdown to be racing with other threads creating new processes.
567        if self
568            .shutting_down
569            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
570            .is_err()
571        {
572            log_info!("Additional thread tried to initiate shutdown while already in-progress.");
573            return;
574        }
575
576        log_info!("Shutting down Starnix kernel.");
577
578        // Step 2: Shut down thread groups in a loop until init and the system task are all that
579        // remain.
580        loop {
581            let tgs = {
582                // Exiting thread groups need to acquire a write lock for the pid table to
583                // successfully exit so we need to acquire that lock in a reduced scope.
584                self.pids
585                    .read()
586                    .get_thread_groups()
587                    .filter(|tg| tg.leader != SYSTEM_TASK_PID && tg.leader != INIT_PID)
588                    .collect::<Vec<_>>()
589            };
590            if tgs.is_empty() {
591                log_info!("pid table is empty except init and system task");
592                break;
593            }
594
595            log_info!(tgs:?; "shutting down thread groups");
596            let mut tasks = vec![];
597            for tg in tgs {
598                let task = fasync::Task::local(ThreadGroup::shut_down(Arc::downgrade(&tg)));
599                tasks.push(task);
600            }
601            futures::future::join_all(tasks).await;
602        }
603
604        // Step 3: Terminate the init process.
605        let maybe_init = {
606            // Exiting thread groups need to acquire a write lock for the pid table to successfully
607            // exit so we need to acquire that lock in a reduced scope.
608            self.pids.read().get_thread_group(1).map(|tg| Arc::downgrade(&tg))
609        };
610        if let Some(init) = maybe_init {
611            log_info!("shutting down init");
612            ThreadGroup::shut_down(init).await;
613        } else {
614            log_info!("init already terminated");
615        }
616
617        // Step 4: Clean up any structures that can keep non-Linux processes live in our job.
618        log_info!("cleaning up pinned memory");
619        self.expando.remove::<crate::mm::InfoCacheShadowProcess>();
620        self.expando.remove::<crate::mm::MlockShadowProcess>();
621
622        // Step 5: Make sure this is the only process running in the job. We already should have
623        // cleared up all processes other than the system task at this point, but wait on any that
624        // might be around for good measure.
625        //
626        // Use unwrap liberally since we're shutting down anyway and errors will still tear down the
627        // kernel.
628        let kernel_job = fuchsia_runtime::job_default();
629        assert_eq!(kernel_job.children().unwrap(), &[], "starnix does not create any child jobs");
630        let own_koid = fuchsia_runtime::process_self().koid().unwrap();
631
632        log_info!("waiting for this to be the only process in the job");
633        loop {
634            let mut remaining_processes = kernel_job
635                .processes()
636                .unwrap()
637                .into_iter()
638                // Don't wait for ourselves to exit.
639                .filter(|pid| pid != &own_koid)
640                .peekable();
641            if remaining_processes.peek().is_none() {
642                log_info!("No stray Zircon processes.");
643                break;
644            }
645
646            let mut terminated_signals = vec![];
647            for pid in remaining_processes {
648                let handle = match kernel_job
649                    .get_child(&pid, zx::Rights::BASIC | zx::Rights::PROPERTY | zx::Rights::DESTROY)
650                {
651                    Ok(h) => h,
652                    Err(e) => {
653                        log_info!(pid:?, e:?; "failed to get child process from job");
654                        continue;
655                    }
656                };
657                log_info!(
658                    pid:?,
659                    name:? = handle.get_name();
660                    "waiting on process terminated signal"
661                );
662                terminated_signals
663                    .push(fuchsia_async::OnSignals::new(handle, zx::Signals::PROCESS_TERMINATED));
664            }
665            log_info!("waiting on process terminated signals");
666            futures::future::join_all(terminated_signals).await;
667        }
668
669        // Step 6: Forcibly unmounts the mounts' FileSystems.
670        log_info!("clearing mounts");
671        self.mounts.clear();
672
673        // Step 7: Tell CF the container stopped.
674        log_info!("all non-root processes killed, notifying CF container is stopped");
675        if let Some(control_handle) = self.container_control_handle.lock().take() {
676            log_info!("Notifying CF that the container has stopped.");
677            control_handle
678                .send_on_stop(ComponentStopInfo {
679                    termination_status: Some(zx::Status::OK.into_raw()),
680                    exit_code: Some(0),
681                    ..ComponentStopInfo::default()
682                })
683                .unwrap();
684            control_handle.shutdown_with_epitaph(zx::Status::OK);
685        } else {
686            log_warn!("Shutdown invoked without a container controller control handle.");
687        }
688
689        // Step 8: exiting this process.
690        log_info!("All tasks killed, exiting Starnix kernel root process.");
691        // Normally a Rust program exits its process by calling `std::process::exit()` which goes
692        // through libc to exit the program. This runs drop impls on any thread-local variables
693        // which can cause issues during Starnix shutdown when we haven't yet integrated every
694        // subsystem with the shutdown flow. While those issues are indicative of underlying
695        // problems, we can't solve them without finishing the implementation of graceful shutdown.
696        // Instead, ask Zircon to exit our process directly, bypassing any libc atexit handlers.
697        // TODO(https://fxbug.dev/295073633) return from main instead of avoiding atexit handlers
698        zx::Process::exit(0);
699    }
700
701    pub fn is_shutting_down(&self) -> bool {
702        self.shutting_down.load(Ordering::Acquire)
703    }
704
705    /// Opens a device file (driver) identified by `dev`.
706    pub fn open_device<L>(
707        &self,
708        locked: &mut Locked<L>,
709        current_task: &CurrentTask,
710        node: &NamespaceNode,
711        flags: OpenFlags,
712        dev: DeviceType,
713        mode: DeviceMode,
714    ) -> Result<Box<dyn FileOps>, Errno>
715    where
716        L: LockEqualOrBefore<FileOpsCore>,
717    {
718        self.device_registry.open_device(locked, current_task, node, flags, dev, mode)
719    }
720
721    /// Return a reference to the Audit Framework
722    ///
723    /// This function follows the lazy initialization pattern.
724    pub fn audit_logger(&self) -> Arc<AuditLogger> {
725        self.expando.get_or_init(|| AuditLogger::new(self))
726    }
727
728    /// Return a reference to the GenericNetlink implementation.
729    ///
730    /// This function follows the lazy initialization pattern, where the first
731    /// call will instantiate the Generic Netlink server in a separate kthread.
732    pub fn generic_netlink(&self) -> &GenericNetlink<NetlinkToClientSender<GenericMessage>> {
733        self.generic_netlink.get_or_init(|| {
734            let (generic_netlink, worker_params) = GenericNetlink::new();
735            let enable_nl80211 = self.features.wifi;
736            self.kthreads.spawn_future(
737                move || async move {
738                    crate::vfs::socket::run_generic_netlink_worker(worker_params, enable_nl80211)
739                        .await;
740                    log_error!("Generic Netlink future unexpectedly exited");
741                },
742                "generic_netlink_worker",
743            );
744            generic_netlink
745        })
746    }
747
748    /// Return a reference to the [`netlink::Netlink`] implementation.
749    ///
750    /// This function follows the lazy initialization pattern, where the first
751    /// call will instantiate the Netlink implementation.
752    pub fn network_netlink(self: &Arc<Self>) -> &Netlink<NetlinkContextImpl> {
753        self.network_netlink.get_or_init(|| {
754            let (network_netlink, worker_params) =
755                Netlink::new(InterfacesHandlerImpl(self.weak_self.clone()));
756
757            let kernel = self.clone();
758            self.kthreads.spawn_future(
759                move || async move {
760                    netlink::run_netlink_worker(
761                        worker_params,
762                        NetlinkAccessControl::new(kernel.kthreads.system_task()),
763                    )
764                    .await;
765                    log_error!(tag = NETLINK_LOG_TAG; "Netlink async worker unexpectedly exited");
766                },
767                "network_netlink_worker",
768            );
769            network_netlink
770        })
771    }
772
773    pub fn iptables(&self) -> &IpTables {
774        self.iptables.get_or_init(|| IpTables::new())
775    }
776
777    /// Returns a Proxy to the service used by the container at `filename`.
778    #[allow(unused)]
779    pub fn connect_to_named_protocol_at_container_svc<P: ProtocolMarker>(
780        &self,
781        filename: &str,
782    ) -> Result<ClientEnd<P>, Errno> {
783        match self.container_namespace.get_namespace_channel("/svc") {
784            Ok(channel) => {
785                let (client_end, server_end) = create_endpoints::<P>();
786                fdio::service_connect_at(channel.as_ref(), filename, server_end.into_channel())
787                    .map_err(|status| from_status_like_fdio!(status))?;
788                Ok(client_end)
789            }
790            Err(err) => {
791                log_error!("Unable to get /svc namespace channel! {}", err);
792                Err(errno!(ENOENT))
793            }
794        }
795    }
796
797    /// Returns a Proxy to the service `P` used by the container.
798    pub fn connect_to_protocol_at_container_svc<P: DiscoverableProtocolMarker>(
799        &self,
800    ) -> Result<ClientEnd<P>, Errno> {
801        self.connect_to_named_protocol_at_container_svc::<P>(P::PROTOCOL_NAME)
802    }
803
804    pub fn add_syscall_log_filter(&self, name: &str) {
805        let filter = SyscallLogFilter::new(name.to_string());
806        {
807            let mut filters = self.syscall_log_filters.lock();
808            if filters.contains(&filter) {
809                return;
810            }
811            filters.push(filter);
812        }
813        for headers in self.pids.read().get_thread_groups() {
814            headers.sync_syscall_log_level();
815        }
816    }
817
818    pub fn clear_syscall_log_filters(&self) {
819        {
820            let mut filters = self.syscall_log_filters.lock();
821            if filters.is_empty() {
822                return;
823            }
824            filters.clear();
825        }
826        for headers in self.pids.read().get_thread_groups() {
827            headers.sync_syscall_log_level();
828        }
829    }
830
831    fn get_thread_groups_inspect(&self) -> fuchsia_inspect::Inspector {
832        let inspector = fuchsia_inspect::Inspector::default();
833
834        let thread_groups = inspector.root();
835        let mut mm_summary = MappingSummary::default();
836        let mut mms_summarized = HashSet::new();
837
838        // Avoid holding locks for the entire iteration.
839        let all_thread_groups = {
840            let pid_table = self.pids.read();
841            pid_table.get_thread_groups().collect::<Vec<_>>()
842        };
843        for thread_group in all_thread_groups {
844            // Avoid holding the state lock while summarizing.
845            let (ppid, tasks) = {
846                let tg = thread_group.read();
847                (tg.get_ppid() as i64, tg.tasks().map(TempRef::into_static).collect::<Vec<_>>())
848            };
849
850            let tg_node = thread_groups.create_child(format!("{}", thread_group.leader));
851            if let Ok(koid) = &thread_group.process.koid() {
852                tg_node.record_int("koid", koid.raw_koid() as i64);
853            }
854            tg_node.record_int("pid", thread_group.leader as i64);
855            tg_node.record_int("ppid", ppid);
856            tg_node.record_bool("stopped", thread_group.load_stopped() == StopState::GroupStopped);
857
858            let tasks_node = tg_node.create_child("tasks");
859            for task in tasks {
860                if let Ok(mm) = task.mm() {
861                    if mms_summarized.insert(Arc::as_ptr(&mm) as usize) {
862                        mm.summarize(&mut mm_summary);
863                    }
864                }
865                let set_properties = |node: &fuchsia_inspect::Node| {
866                    node.record_string("command", task.command().to_string());
867
868                    let scheduler_state = task.read().scheduler_state;
869                    if !scheduler_state.is_default() {
870                        node.record_child("sched", |node| {
871                            node.record_string(
872                                "role_name",
873                                self.scheduler
874                                    .role_name(&task)
875                                    .map(|n| Cow::Borrowed(n))
876                                    .unwrap_or_else(|e| Cow::Owned(e.to_string())),
877                            );
878                            node.record_string("state", format!("{scheduler_state:?}"));
879                        });
880                    }
881                };
882                if task.tid == thread_group.leader {
883                    let mut argv = task.read_argv(256).unwrap_or_default();
884
885                    // Any runtime that overwrites argv is likely to leave a lot of trailing
886                    // nulls, no need to print those in inspect.
887                    argv.retain(|arg| !arg.is_empty());
888
889                    let inspect_argv = tg_node.create_string_array("argv", argv.len());
890                    for (i, arg) in argv.iter().enumerate() {
891                        inspect_argv.set(i, arg.to_string());
892                    }
893                    tg_node.record(inspect_argv);
894
895                    set_properties(&tg_node);
896                } else {
897                    tasks_node.record_child(task.tid.to_string(), |task_node| {
898                        set_properties(task_node);
899                    });
900                };
901            }
902            tg_node.record(tasks_node);
903            thread_groups.record(tg_node);
904        }
905
906        thread_groups.record_child("memory_managers", |node| mm_summary.record(node));
907
908        inspector
909    }
910
911    pub fn new_memory_attribution_observer(
912        &self,
913        control_handle: fattribution::ProviderControlHandle,
914    ) -> attribution_server::Observer {
915        self.memory_attribution_manager.new_observer(control_handle)
916    }
917
918    /// Opens and returns a directory proxy from the container's namespace, at
919    /// the requested path, using the provided flags. This method will open the
920    /// closest existing path from the namespace hierarchy, and then attempt
921    /// initialize an open on the remaining subdirectory path, using the given open_flags.
922    ///
923    /// For example, given the parameter provided is `/path/to/foo/bar` and there
924    /// are namespace entries already for `/path/to/foo` and `/path/to`. The entry
925    /// for /path/to/foo will be opened, and then the /bar will attempt to be opened
926    /// underneath that directory with the given open_flags. The returned value
927    /// will be the proxy to the parent (/path/to/foo) and the string to the child
928    /// path (/bar). The caller of this method can expect /bar to be initialized.
929    pub fn open_ns_dir(
930        &self,
931        path: &str,
932        open_flags: fio::Flags,
933    ) -> Result<(fio::DirectorySynchronousProxy, String), Errno> {
934        let ns_path = PathBuf::from(path);
935        match self.container_namespace.find_closest_channel(&ns_path) {
936            Ok((root_channel, remaining_subdir)) => {
937                let (_, server_end) = create_endpoints::<fio::DirectoryMarker>();
938                fdio::open_at(
939                    &root_channel,
940                    &remaining_subdir,
941                    open_flags,
942                    server_end.into_channel(),
943                )
944                .map_err(|e| {
945                    log_error!("Failed to intialize the subdirs: {}", e);
946                    errno!(EIO)
947                })?;
948
949                Ok((fio::DirectorySynchronousProxy::new(root_channel), remaining_subdir))
950            }
951            Err(err) => {
952                log_error!(
953                    "Unable to find a channel for {}. Received error: {}",
954                    ns_path.display(),
955                    err
956                );
957                Err(errno!(ENOENT))
958            }
959        }
960    }
961
962    /// Returns an iterator of the command line arguments.
963    pub fn cmdline_args_iter(&self) -> impl Iterator<Item = ArgNameAndValue<'_>> {
964        parse_cmdline(self.cmdline.to_str().unwrap_or_default()).filter_map(|arg| {
965            arg.split_once('=')
966                .map(|(name, value)| ArgNameAndValue { name: name, value: Some(value) })
967                .or(Some(ArgNameAndValue { name: arg, value: None }))
968        })
969    }
970
971    /// Returns the container-configured CacheConfig.
972    pub fn fs_cache_config(&self) -> CacheConfig {
973        CacheConfig { capacity: self.features.dirent_cache_size as usize }
974    }
975}
976
/// Splits a command line into its space-separated arguments.
///
/// A space inside a double-quoted section does not end the argument, and a `"` that directly
/// follows a backslash does not change the quoting state. Quotes and backslashes are left
/// untouched in the returned slices, and runs of spaces between arguments produce no empty
/// arguments. An unterminated quote extends the last argument to the end of the input.
pub fn parse_cmdline(cmdline: &str) -> impl Iterator<Item = &str> {
    let mut tokens = Vec::new();
    // Byte offset at which the argument currently being scanned begins, if any.
    let mut token_start = None;
    let mut quoted = false;
    let mut prev = ' ';

    for (offset, ch) in cmdline.char_indices() {
        match token_start {
            None => {
                if ch != ' ' {
                    token_start = Some(offset);
                    if ch == '"' {
                        quoted = true;
                    }
                }
            }
            Some(begin) => {
                if ch == ' ' && !quoted {
                    tokens.push(&cmdline[begin..offset]);
                    token_start = None;
                } else if ch == '"' && prev != '\\' {
                    quoted = !quoted;
                }
            }
        }
        prev = ch;
    }
    // Flush the argument that runs up to the end of the input, if there is one.
    tokens.extend(token_start.map(|begin| &cmdline[begin..]));
    tokens.into_iter()
}
1008
1009impl std::fmt::Debug for Kernel {
1010    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1011        f.debug_struct("Kernel").finish()
1012    }
1013}
1014
// TODO(https://fxbug.dev/380427153): move arch dependent code to `kernel/core/arch/*`.
/// Computes the hardware capability words reported to 32-bit ARM (compat) programs.
///
/// `hwcap` starts from a fixed baseline set of capabilities; every recognized flag in
/// `cpu_feature_flags` then contributes bits to either `hwcap` or `hwcap2`. Unrecognized
/// flags contribute nothing.
#[cfg(target_arch = "aarch64")]
fn arm32_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
    use starnix_uapi::arch32;
    const COMPAT_ARM32_ELF_HWCAP: u32 = arch32::HWCAP_HALF
        | arch32::HWCAP_THUMB
        | arch32::HWCAP_FAST_MULT
        | arch32::HWCAP_EDSP
        | arch32::HWCAP_TLS
        | arch32::HWCAP_IDIV // == IDIVA | IDIVT.
        | arch32::HWCAP_LPAE
        | arch32::HWCAP_EVTSTRM;

    let mut hwcap = COMPAT_ARM32_ELF_HWCAP;
    let mut hwcap2 = 0;
    for flag in cpu_feature_flags.iter() {
        // Bits this flag adds to (`hwcap`, `hwcap2`).
        let (bits, bits2) = match flag {
            CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => (arch32::HWCAP_NEON, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_AES => (0, arch32::HWCAP2_AES),
            CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => (0, arch32::HWCAP2_PMULL),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => (0, arch32::HWCAP2_SHA1),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => (0, arch32::HWCAP2_SHA2),
            CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => (0, arch32::HWCAP2_CRC32),
            CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => (arch32::HWCAP_I8MM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => (arch32::HWCAP_ASIMDFHM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_DP => (arch32::HWCAP_ASIMDDP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_FP => {
                (arch32::HWCAP_VFP | arch32::HWCAP_VFPv3 | arch32::HWCAP_VFPv4, 0)
            }
            _ => (0, 0),
        };
        hwcap |= bits;
        hwcap2 |= bits2;
    }
    HwCap { hwcap, hwcap2 }
}
1049
/// Computes the hardware capability words reported to 64-bit ARM programs.
///
/// Every recognized flag in `cpu_feature_flags` contributes bits to either `hwcap` or
/// `hwcap2`; unrecognized flags contribute nothing.
///
/// See https://docs.kernel.org/arch/arm64/elf_hwcaps.html for details.
#[cfg(target_arch = "aarch64")]
fn arm64_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
    use starnix_uapi as uapi;
    let mut hwcap = 0;
    let mut hwcap2 = 0;

    for flag in cpu_feature_flags.iter() {
        // Bits this flag adds to (`hwcap`, `hwcap2`).
        let (bits, bits2) = match flag {
            CpuFeatureFlags::ARM64_FEATURE_ISA_FP => (uapi::HWCAP_FP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => (uapi::HWCAP_ASIMD, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_AES => (uapi::HWCAP_AES, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => (uapi::HWCAP_PMULL, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => (uapi::HWCAP_SHA1, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => (uapi::HWCAP_SHA2, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => (uapi::HWCAP_CRC32, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => (0, uapi::HWCAP2_I8MM),
            CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => (uapi::HWCAP_ASIMDFHM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_DP => (uapi::HWCAP_ASIMDDP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SM3 => (uapi::HWCAP_SM3, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SM4 => (uapi::HWCAP_SM4, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA3 => (uapi::HWCAP_SHA3, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA512 => (uapi::HWCAP_SHA512, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_ATOMICS => (uapi::HWCAP_ATOMICS, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_RDM => (uapi::HWCAP_ASIMDRDM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_TS => (uapi::HWCAP_FLAGM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_DPB => (uapi::HWCAP_DCPOP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_RNDR => (0, uapi::HWCAP2_RNG),
            _ => (0, 0),
        };
        hwcap |= bits;
        hwcap2 |= bits2;
    }
    HwCap { hwcap, hwcap2 }
}
1083
1084impl HwCaps {
1085    #[cfg(target_arch = "aarch64")]
1086    pub fn from_cpu_feature_flags(cpu_feature_flags: CpuFeatureFlags) -> Self {
1087        Self { arch32: arm32_hwcap(cpu_feature_flags), arch64: arm64_hwcap(cpu_feature_flags) }
1088    }
1089
1090    #[cfg(not(target_arch = "aarch64"))]
1091    pub fn from_cpu_feature_flags(_cpu_feature_flags: CpuFeatureFlags) -> Self {
1092        Self { arch64: HwCap::default() }
1093    }
1094}
1095
#[cfg(test)]
mod test {
    use super::parse_cmdline;

    /// Quoted sections and escaped quotes must stay inside a single argument, with the quote
    /// and backslash characters preserved.
    #[test]
    fn test_parse_cmdline() {
        let input =
            r#"first second=third "fourth fifth" sixth="seventh eighth" "ninth\" tenth" eleventh"#;
        let parsed: Vec<&str> = parse_cmdline(input).collect();
        assert_eq!(
            parsed,
            [
                "first",
                "second=third",
                r#""fourth fifth""#,
                r#"sixth="seventh eighth""#,
                r#""ninth\" tenth""#,
                "eleventh",
            ]
        );
    }
}