starnix_kernel_runner/container.rs

1// Copyright 2022 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::{
6    Features, MountAction, expose_root, parse_features, parse_numbered_handles,
7    run_container_features, serve_component_runner, serve_container_controller,
8    serve_graphical_presenter, serve_lutex_controller,
9};
10use anyhow::{Context, Error, anyhow, bail};
11use bootreason::get_or_init_android_bootreason;
12use bstr::{BString, ByteSlice};
13use devicetree::parser::parse_devicetree;
14use devicetree::types::Devicetree;
15use fidl::endpoints::{ControlHandle, RequestStream, ServerEnd};
16use fidl_fuchsia_component_runner::{TaskProviderRequest, TaskProviderRequestStream};
17use fidl_fuchsia_feedback::CrashReporterMarker;
18use fidl_fuchsia_time_external::AdjustMarker;
19use fuchsia_async::DurationExt;
20use fuchsia_component::client::{connect_to_protocol, connect_to_protocol_sync};
21use fuchsia_component::server::ServiceFs;
22use futures::channel::oneshot;
23use futures::{FutureExt, StreamExt, TryStreamExt};
24use serde::Deserialize;
25use starnix_container_structured_config::Config as ContainerStructuredConfig;
26use starnix_core::device::remote_block_device::remote_block_device_init;
27use starnix_core::execution::{
28    create_init_process, create_system_task, execute_task_with_prerun_result,
29};
30use starnix_core::fs::fuchsia::new_remotefs_in_root;
31use starnix_core::fs::tmpfs::TmpFs;
32use starnix_core::security;
33use starnix_core::task::container_namespace::ContainerNamespace;
34use starnix_core::task::{
35    CurrentTask, ExitStatus, Kernel, RoleOverrides, SchedulerManager, parse_cmdline,
36};
37use starnix_core::vfs::{FileSystemOptions, FsContext, LookupContext, Namespace, WhatToMount};
38use starnix_logging::{
39    CATEGORY_STARNIX, NAME_CREATE_CONTAINER, log_debug, log_error, log_info, log_warn,
40    trace_duration,
41};
42use starnix_modules::{init_common_devices, register_common_file_systems};
43use starnix_modules_layeredfs::LayeredFs;
44use starnix_modules_magma::get_magma_params;
45use starnix_modules_overlayfs::OverlayStack;
46use starnix_modules_rtc::rtc_device_init;
47use starnix_sync::{Locked, Unlocked};
48use starnix_task_command::TaskCommand;
49use starnix_uapi::errors::{ENOENT, SourceContext};
50use starnix_uapi::open_flags::OpenFlags;
51use starnix_uapi::resource_limits::Resource;
52use starnix_uapi::{errno, tid_t};
53use std::collections::BTreeMap;
54use std::ffi::CString;
55use std::ops::DerefMut;
56use std::sync::Arc;
57use zx::Task as _;
58use {
59    fidl_fuchsia_boot as fboot, fidl_fuchsia_component as fcomponent,
60    fidl_fuchsia_component_runner as frunner, fidl_fuchsia_element as felement,
61    fidl_fuchsia_io as fio, fidl_fuchsia_mem as fmem,
62    fidl_fuchsia_memory_attribution as fattribution, fidl_fuchsia_starnix_binder as fbinder,
63    fidl_fuchsia_starnix_container as fstarcontainer, fuchsia_async as fasync,
64    fuchsia_inspect as inspect, fuchsia_runtime as fruntime, fuchsia_zbi as zbi,
65};
66
67use std::sync::Weak;
68
69use crate::serve_memory_attribution_provider_container;
70use attribution_server::{AttributionServer, AttributionServerHandle};
71use fidl::HandleBased;
72
/// Manages the memory attribution protocol for a Starnix container.
///
/// Constructed from a weak kernel reference so that attribution reporting stops
/// cleanly once the kernel is dropped.
struct ContainerMemoryAttributionManager {
    /// Holds state for the hanging-get attribution protocol.
    memory_attribution_server: AttributionServerHandle,
}
78
79impl ContainerMemoryAttributionManager {
80    /// Creates a new [ContainerMemoryAttributionManager] from a Starnix kernel and the moniker
81    /// token of the container component.
82    pub fn new(kernel: Weak<Kernel>, component_instance: zx::Event) -> Self {
83        let memory_attribution_server = AttributionServer::new(Box::new(move || {
84            let kernel_ref = match kernel.upgrade() {
85                None => return vec![],
86                Some(k) => k,
87            };
88            attribution_info_for_kernel(kernel_ref.as_ref(), &component_instance)
89        }));
90
91        ContainerMemoryAttributionManager { memory_attribution_server }
92    }
93
94    /// Creates a new observer for the attribution information from this container.
95    pub fn new_observer(
96        &self,
97        control_handle: fattribution::ProviderControlHandle,
98    ) -> attribution_server::Observer {
99        self.memory_attribution_server.new_observer(control_handle)
100    }
101}
102
103/// Generates the attribution information for the Starnix kernel ELF component. The attribution
104/// information for the container is handled by the container component, not the kernel
105/// component itself, even if both are hosted within the same kernel process.
106fn attribution_info_for_kernel(
107    kernel: &Kernel,
108    component_instance: &zx::Event,
109) -> Vec<fattribution::AttributionUpdate> {
110    // Start the server to handle the memory attribution requests for the container, and provide
111    // a handle to get detailed attribution. We start a new task as each incoming connection is
112    // independent.
113    let (client_end, server_end) =
114        fidl::endpoints::create_request_stream::<fattribution::ProviderMarker>();
115    fuchsia_async::Task::spawn(serve_memory_attribution_provider_container(server_end, kernel))
116        .detach();
117
118    let starnix_kernel_id = Some(1);
119    let starnix_kernel_principal = fattribution::NewPrincipal {
120        identifier: starnix_kernel_id,
121        description: Some(fattribution::Description::Part("starnix_kernel".to_string())),
122        principal_type: Some(fattribution::PrincipalType::Part),
123        // This part is created for accounting. It holds the resource used for starnix
124        // kernel operation. It neither has sub-principals, nor publishes attribution,
125        // hence it does not need to be tied to a provider server end.
126        detailed_attribution: None,
127        ..Default::default()
128    };
129
130    let starnix_kernel_attribution = fattribution::UpdatedPrincipal {
131        identifier: starnix_kernel_id, // Recipient.
132        resources: Some(fattribution::Resources::Data(fattribution::Data {
133            resources: vec![fattribution::Resource::ProcessMapped(fattribution::ProcessMapped {
134                process: fuchsia_runtime::process_self().koid().unwrap().raw_koid(),
135                base: 0, // Attribute all the range.
136                len: u64::max_value(),
137                hint_skip_handle_table: false,
138            })],
139        })),
140        ..Default::default()
141    };
142
143    let container_id = Some(2);
144    let new_principal = fattribution::NewPrincipal {
145        identifier: container_id,
146        description: Some(fattribution::Description::Component(
147            component_instance.duplicate_handle(zx::Rights::SAME_RIGHTS).unwrap(),
148        )),
149        principal_type: Some(fattribution::PrincipalType::Runnable),
150        detailed_attribution: Some(client_end),
151        ..Default::default()
152    };
153    let attribution = fattribution::UpdatedPrincipal {
154        identifier: container_id,
155        resources: Some(fattribution::Resources::Data(fattribution::Data {
156            resources: vec![fattribution::Resource::KernelObject(
157                fuchsia_runtime::job_default().koid().unwrap().raw_koid(),
158            )],
159        })),
160        ..Default::default()
161    };
162
163    vec![
164        fattribution::AttributionUpdate::Add(new_principal),
165        fattribution::AttributionUpdate::Add(starnix_kernel_principal),
166        fattribution::AttributionUpdate::Update(attribution),
167        fattribution::AttributionUpdate::Update(starnix_kernel_attribution),
168    ]
169}
170
/// Parsed form of the `fuchsia.component.runner/ComponentStartInfo` handed to the runner when a
/// container component is started.
#[derive(Debug)]
pub struct ContainerStartInfo {
    /// Configuration specified by the component's `program` block.
    pub program: ContainerProgram,

    /// Structured configuration decoded from the component's encoded config VMO/bytes.
    pub config: ContainerStructuredConfig,

    /// The outgoing directory of the container, used to serve protocols on behalf of the container.
    /// For example, the starnix_kernel serves a component runner in the containers' outgoing
    /// directory.
    outgoing_dir: Option<zx::Channel>,

    /// Mapping of top-level namespace entries to an associated channel.
    /// For example, "/svc" to the respective channel.
    pub container_namespace: ContainerNamespace,

    /// The runtime directory of the container, used to provide CF introspection.
    runtime_dir: Option<ServerEnd<fio::DirectoryMarker>>,

    /// An eventpair that debuggers can use to defer the launch of the container.
    break_on_start: Option<zx::EventPair>,

    /// Component moniker token for the container component. This token is used in various protocols
    /// to uniquely identify a component.
    component_instance: Option<zx::Event>,
}
197
/// Error context attached when the container's structured-config data is missing from the
/// component start info. The line continuations keep the value identical to the previous
/// `concat!` form.
const MISSING_CONFIG_VMO_CONTEXT: &str = "Retrieving container config VMO. \
    If this fails, make sure your container CML includes \
    //src/starnix/containers/container.shard.cml.";
203
204impl ContainerStartInfo {
205    fn new(mut start_info: frunner::ComponentStartInfo) -> Result<Self, Error> {
206        let program = start_info.program.as_ref().context("retrieving program block")?;
207        let program: ContainerProgram =
208            runner::serde::deserialize_program(&program).context("parsing program block")?;
209
210        let encoded_config =
211            start_info.encoded_config.as_ref().context(MISSING_CONFIG_VMO_CONTEXT)?;
212        let config = match encoded_config {
213            fmem::Data::Bytes(b) => ContainerStructuredConfig::from_bytes(b),
214            fmem::Data::Buffer(b) => ContainerStructuredConfig::from_vmo(&b.vmo),
215            other => anyhow::bail!("unknown Data variant {other:?}"),
216        }
217        .context("parsing container structured config")?;
218
219        let ns = start_info.ns.take().context("retrieving container namespace")?;
220        let container_namespace = ContainerNamespace::from(ns);
221
222        let outgoing_dir = start_info.outgoing_dir.take().map(|dir| dir.into_channel());
223        let component_instance = start_info.component_instance;
224
225        Ok(Self {
226            program,
227            config,
228            outgoing_dir,
229            container_namespace,
230            component_instance,
231            break_on_start: start_info.break_on_start,
232            runtime_dir: start_info.runtime_dir,
233        })
234    }
235}
236
/// The `program` block of a container component's manifest, deserialized via serde.
/// Unknown keys are rejected so manifest typos fail loudly.
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ContainerProgram {
    /// The name of this container.
    name: String,

    /// The command line for the initial process for this container.
    init: Vec<String>,

    /// The command line for the kernel.
    #[serde(default)]
    kernel_cmdline: String,

    /// The specifications for the file system mounts for this container.
    #[serde(default)]
    mounts: Vec<String>,

    /// The features enabled for this container.
    #[serde(default)]
    pub features: Vec<String>,

    /// The resource limits to apply to this container.
    #[serde(default)]
    rlimits: Vec<String>,

    /// The path that the container will wait until exists before considering itself to have started.
    #[serde(default)]
    startup_file_path: String,

    /// The default seclabel that is applied to components that are instantiated in this container.
    ///
    /// Components can override this by setting the `seclabel` field in their program block.
    #[serde(default)]
    pub default_seclabel: Option<String>,

    /// The default uid that is applied to components that are instantiated in this container.
    ///
    /// Components can override this by setting the `uid` field in their program block.
    #[serde(default = "default_uid")]
    pub default_uid: runner::serde::StoreAsString<u32>,

    /// The default mount options to use when mounting directories from a component's namespace.
    ///
    /// Each string is expected to follow the format: "<namespace_path>:<mount_options>".
    pub default_ns_mount_options: Option<Vec<String>>,

    /// Specifies role names to use for "realtime" tasks based on their process & thread names.
    ///
    /// Zircon's scheduler doesn't support configuring tasks to always preempt non-"realtime"
    /// tasks without specifying a constant bandwidth profile. These profiles specify the period and
    /// expected runtime of a "realtime" task, bounding the amount of work it is allowed to perform
    /// at an elevated "realtime" priority.
    ///
    /// Because constant bandwidth profiles require workload-specific tuning, we can't uniformly
    /// apply a single profile for all "realtime" tasks. Instead, this container configuration
    /// allows us to specify different constant bandwidth profiles for different workloads.
    #[serde(default)]
    task_role_overrides: Vec<TaskSchedulerMapping>,
}
296
/// Specifies a role override for a class of tasks whose process and thread names match provided
/// patterns.
///
/// Deserialized from the `task_role_overrides` entries in a container's `program` block.
#[derive(Default, Deserialize)]
struct TaskSchedulerMapping {
    /// The role name to use for tasks matching the provided patterns.
    role: String,
    /// A regular expression that will be matched against the process' command.
    process: String,
    /// A regular expression that will be matched against the thread's command.
    thread: String,
}
308
309impl std::fmt::Debug for TaskSchedulerMapping {
310    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
311        write!(f, "process `{}` thread `{}` role `{}`", self.process, self.thread, self.role)
312    }
313}
314
/// Serde default for `ContainerProgram::default_uid`: components without an explicit `uid` in
/// their program block run as uid 42.
fn default_uid() -> runner::serde::StoreAsString<u32> {
    runner::serde::StoreAsString(42)
}
318
/// Creates a `CString` from a `&str`.
///
/// # Panics
///
/// Panics if `str` contains an interior NUL byte, which a `CString` cannot represent.
fn to_cstr(str: &str) -> CString {
    // `CString::new` accepts `&str` directly (via `Into<Vec<u8>>`), so the intermediate
    // `String` allocation the previous version made is unnecessary.
    CString::new(str).unwrap()
}
323
/// Handles the container must serve after creation; passed to `Container::serve`.
#[must_use = "The container must run serve on this config"]
pub struct ContainerServiceConfig {
    /// Start info for the container, including its outgoing directory channel.
    start_info: ContainerStartInfo,
    /// Request stream for the container component's controller protocol.
    request_stream: frunner::ComponentControllerRequestStream,
    /// Resolves with the exit status (or error) of the container's init task.
    receiver: oneshot::Receiver<Result<ExitStatus, Error>>,
}
330
/// A running Starnix container and the kernel state associated with it.
pub struct Container {
    /// The `Kernel` object that is associated with the container.
    pub kernel: Arc<Kernel>,

    /// Serves the hanging-get memory attribution protocol for this container.
    memory_attribution_manager: ContainerMemoryAttributionManager,

    /// Inspect node holding information about the state of the container.
    _node: inspect::Node,

    /// Until negative trait bound are implemented, using `*mut u8` to prevent transferring
    /// Container across threads.
    _thread_bound: std::marker::PhantomData<*mut u8>,
}
344
impl Container {
    /// Returns the kernel's system task, used to act on the container's behalf.
    pub fn system_task(&self) -> &CurrentTask {
        self.kernel.kthreads.system_task()
    }

    /// Serves the container component's outgoing directory: the protocols listed in
    /// [ExposedServices] under `svc/`, plus the container's filesystem root under `fs_root`.
    /// Completes when the directory connection is done (or immediately if no directory was
    /// provided).
    async fn serve_outgoing_directory(
        &self,
        outgoing_dir: Option<zx::Channel>,
    ) -> Result<(), Error> {
        if let Some(outgoing_dir) = outgoing_dir {
            // Add `ComponentRunner` to the exposed services of the container, and then serve the
            // outgoing directory.
            let mut fs = ServiceFs::new_local();
            fs.dir("svc")
                .add_fidl_service(ExposedServices::ComponentRunner)
                .add_fidl_service(ExposedServices::ContainerController)
                .add_fidl_service(ExposedServices::GraphicalPresenter)
                .add_fidl_service(ExposedServices::LutexController);

            // Expose the root of the container's filesystem.
            let (fs_root, fs_root_server_end) = fidl::endpoints::create_proxy();
            fs.add_remote("fs_root", fs_root);
            expose_root(
                self.kernel.kthreads.unlocked_for_async().deref_mut(),
                self.system_task(),
                fs_root_server_end,
            )?;

            fs.serve_connection(outgoing_dir.into()).map_err(|_| errno!(EINVAL))?;

            // Dispatch all incoming connections concurrently; a failed component-runner stream
            // is logged and does not tear down the other services.
            fs.for_each_concurrent(None, |request_stream| async {
                match request_stream {
                    ExposedServices::ComponentRunner(request_stream) => {
                        match serve_component_runner(request_stream, self.system_task()).await {
                            Ok(_) => {}
                            Err(e) => {
                                log_error!("Error serving component runner: {:?}", e);
                            }
                        }
                    }
                    ExposedServices::ContainerController(request_stream) => {
                        serve_container_controller(request_stream, self.system_task())
                            .await
                            .expect("failed to start container.")
                    }
                    ExposedServices::GraphicalPresenter(request_stream) => {
                        serve_graphical_presenter(request_stream, &self.kernel)
                            .await
                            .expect("failed to start GraphicalPresenter.")
                    }
                    ExposedServices::LutexController(request_stream) => {
                        serve_lutex_controller(request_stream, self.system_task())
                            .await
                            .expect("failed to start LutexController.")
                    }
                }
            })
            .await
        }
        Ok(())
    }

    /// Runs the container's two long-lived serving loops concurrently: the outgoing directory
    /// and the component controller. Returns the outgoing-directory result.
    pub async fn serve(&self, service_config: ContainerServiceConfig) -> Result<(), Error> {
        let (r, _) = futures::join!(
            self.serve_outgoing_directory(service_config.start_info.outgoing_dir),
            server_component_controller(
                self.kernel.clone(),
                service_config.request_stream,
                service_config.receiver
            )
        );
        r
    }

    /// Creates a new observer for this container's memory attribution information.
    pub fn new_memory_attribution_observer(
        &self,
        control_handle: fattribution::ProviderControlHandle,
    ) -> attribution_server::Observer {
        self.memory_attribution_manager.new_observer(control_handle)
    }
}
426
/// The services that are exposed in the container component's outgoing directory.
enum ExposedServices {
    /// `fuchsia.component.runner/ComponentRunner`: runs components inside the container.
    ComponentRunner(frunner::ComponentRunnerRequestStream),
    /// `fuchsia.starnix.container/Controller`: container control operations.
    ContainerController(fstarcontainer::ControllerRequestStream),
    /// `fuchsia.element/GraphicalPresenter`: presents views from the container.
    GraphicalPresenter(felement::GraphicalPresenterRequestStream),
    /// `fuchsia.starnix.binder/LutexController`: binder lutex operations.
    LutexController(fbinder::LutexControllerRequestStream),
}
434
/// Outcome of the container's init task: its exit status, or the error that prevented it.
type TaskResult = Result<ExitStatus, Error>;
436
/// Serves the container component's `ComponentController` protocol while also watching for
/// completion of the init task. Any event on either source — a Stop/Kill request, a channel
/// error, or init exiting — triggers kernel shutdown.
async fn server_component_controller(
    kernel: Arc<Kernel>,
    request_stream: frunner::ComponentControllerRequestStream,
    task_complete: oneshot::Receiver<TaskResult>,
) {
    // Stash the control handle so other kernel code can signal the component framework.
    *kernel.container_control_handle.lock() = Some(request_stream.control_handle());

    // Tags which of the two sources an item came from once the streams are merged.
    enum Event<T, U> {
        Controller(T),
        Completion(U),
    }

    // Merge controller requests and init-task completion into a single stream.
    let mut stream = futures::stream::select(
        request_stream.map(Event::Controller),
        task_complete.into_stream().map(Event::Completion),
    );

    while let Some(event) = stream.next().await {
        match event {
            Event::Controller(Ok(frunner::ComponentControllerRequest::Stop { .. })) => {
                log_info!("Stopping the container.");
            }
            Event::Controller(Ok(frunner::ComponentControllerRequest::Kill { control_handle })) => {
                log_info!("Killing the container's job.");
                // Send the epitaph before killing the job.
                control_handle.shutdown_with_epitaph(zx::Status::from_raw(
                    fcomponent::Error::InstanceDied.into_primitive() as i32,
                ));
                fruntime::job_default().kill().expect("Failed to kill job");
            }
            Event::Controller(Ok(frunner::ComponentControllerRequest::_UnknownMethod {
                ordinal,
                method_type,
                ..
            })) => {
                log_error!(ordinal, method_type:?; "Unknown component controller request received.");
            }
            Event::Controller(Err(e)) => {
                log_warn!(e:?; "Container component controller channel encountered an error.");
            }
            Event::Completion(result) => {
                log_info!(result:?; "init process exited.");
            }
        }

        // We treat any event in the stream as an invitation to shut down.
        if !kernel.is_shutting_down() {
            kernel.shut_down();
        }
    }

    log_debug!("done listening for container-terminating events");

    // In case the stream ended without an event, shut down the kernel here.
    if !kernel.is_shutting_down() {
        kernel.shut_down();
    }
}
494
495pub async fn create_component_from_stream(
496    mut request_stream: frunner::ComponentRunnerRequestStream,
497    kernel_extra_features: Vec<String>,
498) -> Result<(Container, ContainerServiceConfig), Error> {
499    if let Some(event) = request_stream.try_next().await? {
500        match event {
501            frunner::ComponentRunnerRequest::Start { start_info, controller, .. } => {
502                let request_stream = controller.into_stream();
503                let mut start_info = ContainerStartInfo::new(start_info)?;
504                let (sender, receiver) = oneshot::channel::<TaskResult>();
505                let container = create_container(&mut start_info, &kernel_extra_features, sender)
506                    .await
507                    .with_source_context(|| {
508                        format!("creating container \"{}\"", start_info.program.name)
509                    })?;
510                let service_config =
511                    ContainerServiceConfig { start_info, request_stream, receiver };
512                return Ok((container, service_config));
513            }
514            frunner::ComponentRunnerRequest::_UnknownMethod { ordinal, .. } => {
515                log_warn!("Unknown ComponentRunner request: {ordinal}");
516            }
517        }
518    }
519    bail!("did not receive Start request");
520}
521
522async fn get_bootargs(device_tree: &Devicetree) -> Result<String, Error> {
523    device_tree
524        .root_node
525        .find("chosen")
526        .and_then(|n| {
527            n.get_property("bootargs").map(|p| {
528                let end =
529                    if p.value.last() == Some(&0) { p.value.len() - 1 } else { p.value.len() };
530                match std::str::from_utf8(&p.value[..end]) {
531                    Ok(s) => Ok(s.to_owned()),
532                    Err(e) => {
533                        log_warn!("Bootargs are not valid UTF-8: {e}");
534                        Err(anyhow!("Bootargs are not valid UTF-8"))
535                    }
536                }
537            })
538        })
539        .context("Couldn't find bootargs")?
540}
541
542async fn get_bootitems() -> Result<std::vec::Vec<u8>, Error> {
543    let items =
544        connect_to_protocol::<fboot::ItemsMarker>().context("Failed to connect to boot items")?;
545
546    let items_response = items
547        .get2(zbi::ZbiType::DeviceTree.into_raw(), None)
548        .await
549        .context("FIDL: Failed to get devicetree item")?
550        .map_err(|e| anyhow!("Failed to get devicetree item {:?}", e))?;
551
552    let Some(item) = items_response.last() else {
553        return Err(anyhow!("Failed to get items"));
554    };
555
556    let devicetree_vmo = &item.payload;
557    let bytes = devicetree_vmo
558        .read_to_vec(0, item.length as u64)
559        .context("Failed to read devicetree vmo")?;
560
561    Ok(bytes)
562}
563
564async fn create_container(
565    start_info: &mut ContainerStartInfo,
566    kernel_extra_features: &[String],
567    task_complete: oneshot::Sender<TaskResult>,
568) -> Result<Container, Error> {
569    trace_duration!(CATEGORY_STARNIX, NAME_CREATE_CONTAINER);
570    const DEFAULT_INIT: &str = "/container/init";
571
572    let pkg_channel = start_info.container_namespace.get_namespace_channel("/pkg").unwrap();
573    let pkg_dir_proxy = fio::DirectorySynchronousProxy::new(pkg_channel);
574
575    let device_tree: Option<Devicetree> = match get_bootitems().await {
576        Ok(items) => match parse_devicetree(&items) {
577            Ok(device_tree) => Some(device_tree),
578            Err(e) => {
579                log_warn!("Failed to parse devicetree: {e:?}");
580                None
581            }
582        },
583        Err(e) => {
584            log_warn!("Failed to get boot items for devicetree: {e:?}");
585            None
586        }
587    };
588    let mut features = parse_features(&start_info, kernel_extra_features)?;
589
590    log_debug!("Creating container with {:#?}", features);
591    let mut kernel_cmdline = BString::from(start_info.program.kernel_cmdline.as_bytes());
592    let mut android_provided_bootreason = None;
593
594    if features.android_serialno {
595        if let Some(device_tree) = &device_tree {
596            match get_bootargs(device_tree).await {
597                Ok(args) => {
598                    for item in parse_cmdline(&args) {
599                        if item.starts_with("androidboot.force_normal_boot") {
600                            // TODO(https://fxbug.dev/424152964): Support force_normal_boot.
601                            continue;
602                        }
603                        if item.starts_with("androidboot.bootreason") && features.android_bootreason
604                        {
605                            // androidboot.bootreason is sourced from the Fuchsia reboot reason.
606                            // It is still useful to log it from userspace to learn what the
607                            // possible values are.
608                            log_info!("Original devicetree bootarg {:?}", item);
609                            if let Some((_, v)) = item.split_once('=') {
610                                android_provided_bootreason = Some(v.to_string());
611                            }
612                            continue;
613                        }
614                        kernel_cmdline.extend(b" ");
615                        kernel_cmdline.extend(item.bytes());
616                    }
617                }
618                Err(err) => log_warn!("could not get bootargs: {err:?}"),
619            }
620        } else {
621            log_warn!("No devicetree available to get bootargs for android.serialno");
622        }
623    }
624    if features.android_bootreason {
625        kernel_cmdline.extend(b" androidboot.bootreason=");
626
627        let tmp_channel = start_info.container_namespace.get_namespace_channel("/tmp_lifecycle");
628        let tmp_proxy = match tmp_channel {
629            Ok(channel) => {
630                Some(fio::DirectoryProxy::new(fidl::AsyncChannel::from_channel(channel)))
631            }
632            _ => None,
633        };
634
635        match get_or_init_android_bootreason(tmp_proxy, android_provided_bootreason).await {
636            Ok(reason) => {
637                kernel_cmdline.extend(reason.bytes());
638            }
639            Err(err) => {
640                log_warn!("could not get android bootreason: {err:?}. falling back to 'unknown'");
641                kernel_cmdline.extend(b"unknown");
642            }
643        }
644    }
645    if let Some(supported_vendors) = &features.magma_supported_vendors {
646        kernel_cmdline.extend(b" ");
647        let params = get_magma_params(supported_vendors);
648        kernel_cmdline.extend(&*params);
649    }
650
651    // Check whether we actually have access to a role manager by trying to set our own
652    // thread's role.
653    let mut task_mappings = RoleOverrides::new();
654    for m in &start_info.program.task_role_overrides {
655        task_mappings.add(m.process.clone(), m.thread.clone(), m.role.clone());
656    }
657    let task_mappings = task_mappings.build().context("adding custom task role")?;
658    let scheduler_manager = SchedulerManager::new(task_mappings);
659
660    let crash_reporter = connect_to_protocol::<CrashReporterMarker>().unwrap();
661
662    let node = inspect::component::inspector().root().create_child("container");
663    let kernel_node = node.create_child("kernel");
664    kernel_node.record_int("created_at", zx::MonotonicInstant::get().into_nanos());
665    features.record_inspect(&kernel_node);
666
667    let security_state = security::kernel_init_security(
668        features.selinux.enabled,
669        features.selinux.options.clone(),
670        features.selinux.exceptions.clone(),
671        &kernel_node,
672    );
673
674    // `config.enable_utc_time_adjustment` is set through config capability
675    // `fuchsia.time.config.WritableUTCTime`.
676    let time_adjustment_proxy = if features.enable_utc_time_adjustment {
677        connect_to_protocol_sync::<AdjustMarker>()
678            .map_err(|e| log_error!("could not connect to fuchsia.time.external/Adjust: {:?}", e))
679            .ok()
680    } else {
681        // See the comment above. UTC adjustment is a per-product setting.
682        log_info!("UTC adjustment is forbidden.");
683        None
684    };
685
686    log_info!("final kernel cmdline: {kernel_cmdline:?}");
687    kernel_node.record_string("cmdline", kernel_cmdline.to_str_lossy());
688
689    let kernel = Kernel::new(
690        kernel_cmdline,
691        features.kernel.clone(),
692        std::mem::take(&mut features.system_limits),
693        start_info.container_namespace.try_clone()?,
694        scheduler_manager,
695        Some(crash_reporter),
696        kernel_node,
697        security_state,
698        time_adjustment_proxy,
699        device_tree,
700    )
701    .with_source_context(|| format!("creating Kernel: {}", &start_info.program.name))?;
702    let fs_context = create_fs_context(
703        kernel.kthreads.unlocked_for_async().deref_mut(),
704        &kernel,
705        &features,
706        start_info,
707        &pkg_dir_proxy,
708    )
709    .source_context("creating FsContext")?;
710    let init_pid = kernel.pids.write().allocate_pid();
711    // Lots of software assumes that the pid for the init process is 1.
712    debug_assert_eq!(init_pid, 1);
713
714    let system_task = create_system_task(
715        kernel.kthreads.unlocked_for_async().deref_mut(),
716        &kernel,
717        Arc::clone(&fs_context),
718    )
719    .source_context("create system task")?;
720    // The system task gives pid 2. This value is less critical than giving
721    // pid 1 to init, but this value matches what is supposed to happen.
722    debug_assert_eq!(system_task.tid, 2);
723
724    kernel.kthreads.init(system_task).source_context("initializing kthreads")?;
725    let system_task = kernel.kthreads.system_task();
726
727    kernel.syslog.init(&system_task).source_context("initializing syslog")?;
728
729    kernel.hrtimer_manager.init(system_task).source_context("initializing HrTimer manager")?;
730
731    log_info!("Initializing suspend resume manager.");
732    if let Err(e) = kernel.suspend_resume_manager.init(&system_task) {
733        log_warn!("Suspend/Resume manager initialization failed: ({e:?})");
734    }
735
736    // Real Time clock is present in all configuration.
737    log_info!("Initializing RTC device.");
738    rtc_device_init(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
739        .context("in starnix_kernel_runner, while initializing RTC")?;
740
741    // Register common devices and add them in sysfs and devtmpfs.
742    log_info!("Registering devices and filesystems.");
743    init_common_devices(kernel.kthreads.unlocked_for_async().deref_mut(), &kernel)?;
744    register_common_file_systems(kernel.kthreads.unlocked_for_async().deref_mut(), &kernel);
745
746    log_info!("Mounting filesystems.");
747    mount_filesystems(
748        kernel.kthreads.unlocked_for_async().deref_mut(),
749        &system_task,
750        start_info,
751        &pkg_dir_proxy,
752    )
753    .source_context("mounting filesystems")?;
754
755    // Run all common features that were specified in the .cml.
756    {
757        log_info!("Running container features.");
758        run_container_features(
759            kernel.kthreads.unlocked_for_async().deref_mut(),
760            &system_task,
761            &features,
762        )?;
763    }
764
765    log_info!("Initializing remote block devices.");
766    init_remote_block_devices(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
767        .source_context("initalizing remote block devices")?;
768
769    // If there is an init binary path, run it, optionally waiting for the
770    // startup_file_path to be created. The task struct is still used
771    // to initialize the system up until this point, regardless of whether
772    // or not there is an actual init to be run.
773    let argv = if start_info.program.init.is_empty() {
774        vec![DEFAULT_INIT.to_string()]
775    } else {
776        start_info.program.init.clone()
777    }
778    .iter()
779    .map(|s| to_cstr(s))
780    .collect::<Vec<_>>();
781
782    log_info!("Opening start_info file.");
783    let executable = system_task
784        .open_file(
785            kernel.kthreads.unlocked_for_async().deref_mut(),
786            argv[0].as_bytes().into(),
787            OpenFlags::RDONLY,
788        )
789        .with_source_context(|| format!("opening init: {:?}", &argv[0]))?;
790
791    let initial_name = if start_info.program.init.is_empty() {
792        TaskCommand::default()
793    } else {
794        TaskCommand::new(start_info.program.init[0].as_bytes())
795    };
796
797    let rlimits = parse_rlimits(&start_info.program.rlimits)?;
798
799    // Serve the runtime directory.
800    log_info!("Starting runtime directory.");
801    if let Some(runtime_dir) = start_info.runtime_dir.take() {
802        kernel.kthreads.spawn_future(
803            move || async move { serve_runtime_dir(runtime_dir).await },
804            "serve_runtime_dir",
805        );
806    }
807
808    // At this point the runtime environment has been prepared but nothing is actually running yet.
809    // Pause here if a debugger needs time to attach to the job.
810    if let Some(break_on_start) = start_info.break_on_start.take() {
811        log_info!("Waiting for signal from debugger before spawning init process...");
812        if let Err(e) =
813            fuchsia_async::OnSignals::new(break_on_start, zx::Signals::EVENTPAIR_PEER_CLOSED).await
814        {
815            log_warn!(e:%; "Received break_on_start eventpair but couldn't wait for PEER_CLOSED.");
816        }
817    }
818
819    log_info!("Creating init process.");
820    let init_task = create_init_process(
821        kernel.kthreads.unlocked_for_async().deref_mut(),
822        &kernel,
823        init_pid,
824        initial_name,
825        Arc::clone(&fs_context),
826        &rlimits,
827    )
828    .with_source_context(|| format!("creating init task: {:?}", &start_info.program.init))?;
829
830    execute_task_with_prerun_result(
831        kernel.kthreads.unlocked_for_async().deref_mut(),
832        init_task,
833        move |locked, init_task| {
834            parse_numbered_handles(locked, init_task, None, &init_task.live().files).expect("");
835            init_task.exec(locked, executable, argv[0].clone(), argv.clone(), vec![])
836        },
837        move |result| {
838            log_info!("Finished running init process: {:?}", result);
839            let _ = task_complete.send(result);
840        },
841        None,
842    )?;
843
844    if !start_info.program.startup_file_path.is_empty() {
845        wait_for_init_file(&start_info.program.startup_file_path, &system_task, init_pid).await?;
846    };
847
848    let memory_attribution_manager = ContainerMemoryAttributionManager::new(
849        Arc::downgrade(&kernel),
850        start_info.component_instance.take().ok_or_else(|| Error::msg("No component instance"))?,
851    );
852
853    Ok(Container {
854        kernel,
855        memory_attribution_manager,
856        _node: node,
857        _thread_bound: Default::default(),
858    })
859}
860
/// Builds the container's initial [`FsContext`] from the mount specs in `start_info`.
///
/// The first mount spec must be the root (`/`). Feature-gated filesystems
/// (`/container`, `/custom_artifacts`, `/test_data`) are layered on top of the
/// root via a `LayeredFs`, and the whole root can optionally be wrapped in a
/// writable overlay when the `rootfs_rw` feature is enabled.
fn create_fs_context(
    locked: &mut Locked<Unlocked>,
    kernel: &Kernel,
    features: &Features,
    start_info: &ContainerStartInfo,
    pkg_dir_proxy: &fio::DirectorySynchronousProxy,
) -> Result<Arc<FsContext>, Error> {
    // The mounts are applied in the order listed. Mounting will fail if the designated mount
    // point doesn't exist in a previous mount. The root must be first so other mounts can be
    // applied on top of it.
    let mut mounts_iter =
        start_info.program.mounts.iter().chain(start_info.config.additional_mounts.iter());
    let mut root = MountAction::new_for_root(
        locked,
        kernel,
        pkg_dir_proxy,
        mounts_iter.next().ok_or_else(|| anyhow!("Mounts list is empty"))?,
    )?;
    if root.path != "/" {
        anyhow::bail!("First mount in mounts list is not the root");
    }

    // Create a layered fs to handle /container and /container/component
    let mut mappings = vec![];
    if features.container {
        // /container/component will be a tmpfs where components using the starnix kernel will
        // have their package mounted.
        let component_tmpfs_options = FileSystemOptions {
            params: kernel
                .features
                .ns_mount_options("#component_tmpfs")
                .context("#component_tmpfs options")?,
            ..Default::default()
        };
        let component_tmpfs = TmpFs::new_fs_with_options(locked, kernel, component_tmpfs_options)?;

        // /container will mount the container pkg
        let container_remotefs_options = FileSystemOptions {
            source: "data".into(),
            params: kernel.features.ns_mount_options("#container").context("#container options")?,
            ..Default::default()
        };
        let container_remotefs = new_remotefs_in_root(
            locked,
            kernel,
            pkg_dir_proxy,
            container_remotefs_options,
            fio::PERM_READABLE | fio::PERM_EXECUTABLE,
        )?;

        // Layer the component tmpfs under "component" inside the container remotefs.
        let container_fs = LayeredFs::new_fs(
            locked,
            kernel,
            container_remotefs,
            BTreeMap::from([("component".into(), component_tmpfs)]),
        );
        mappings.push(("container".into(), container_fs));
    }
    if features.custom_artifacts {
        let mount_options = FileSystemOptions {
            params: kernel
                .features
                .ns_mount_options("#custom_artifacts")
                .context("#custom_artifacts options")?,
            ..Default::default()
        };
        mappings.push((
            "custom_artifacts".into(),
            TmpFs::new_fs_with_options(locked, kernel, mount_options)?,
        ));
    }
    if features.test_data {
        let mount_options = FileSystemOptions {
            params: kernel.features.ns_mount_options("#test_data").context("#test_data options")?,
            ..Default::default()
        };
        mappings
            .push(("test_data".into(), TmpFs::new_fs_with_options(locked, kernel, mount_options)?));
    }

    if !mappings.is_empty() {
        // If this container has enabled any features that mount directories that might not exist
        // in the root file system, we add a LayeredFs to hold these mappings.
        root.fs = LayeredFs::new_fs(locked, kernel, root.fs, mappings.into_iter().collect());
    }
    if features.rootfs_rw {
        root.fs = OverlayStack::wrap_fs_in_writable_layer(locked, kernel, root.fs)?;
    }
    Ok(FsContext::new(Namespace::new_with_flags(root.fs, root.flags)))
}
951
952fn parse_rlimits(rlimits: &[String]) -> Result<Vec<(Resource, u64)>, Error> {
953    let mut res = Vec::new();
954
955    for rlimit in rlimits {
956        let (key, value) =
957            rlimit.split_once('=').ok_or_else(|| anyhow!("Invalid rlimit: {rlimit}"))?;
958        let value = value.parse::<u64>()?;
959        let kv = match key {
960            "RLIMIT_NOFILE" => (Resource::NOFILE, value),
961            "RLIMIT_RTPRIO" => (Resource::RTPRIO, value),
962            _ => bail!("Unknown rlimit: {key}"),
963        };
964        res.push(kv);
965    }
966
967    Ok(res)
968}
969
970fn mount_filesystems(
971    locked: &mut Locked<Unlocked>,
972    system_task: &CurrentTask,
973    start_info: &ContainerStartInfo,
974    pkg_dir_proxy: &fio::DirectorySynchronousProxy,
975) -> Result<(), Error> {
976    // Skip the first mount, that was used to create the root filesystem.
977    let mut mounts_iter =
978        start_info.program.mounts.iter().chain(start_info.config.additional_mounts.iter());
979    let _ = mounts_iter.next();
980    for mount_spec in mounts_iter {
981        let action = MountAction::from_spec(locked, system_task, pkg_dir_proxy, mount_spec)
982            .with_source_context(|| format!("creating filesystem from spec: {}", &mount_spec))?;
983        let mount_point = system_task
984            .lookup_path_from_root(locked, action.path.as_ref())
985            .with_source_context(|| format!("lookup path from root: {}", action.path))?;
986        mount_point.mount(WhatToMount::Fs(action.fs), action.flags)?;
987    }
988    Ok(())
989}
990
991fn init_remote_block_devices(
992    locked: &mut Locked<Unlocked>,
993    system_task: &CurrentTask,
994) -> Result<(), Error> {
995    remote_block_device_init(locked, system_task);
996    let entries = match std::fs::read_dir("/block") {
997        Ok(entries) => entries,
998        Err(e) => {
999            log_warn!("Failed to read block directory: {}", e);
1000            return Ok(());
1001        }
1002    };
1003    for entry in entries {
1004        let entry = entry?;
1005        let path_buf = entry.path();
1006        let path = path_buf.to_str().ok_or_else(|| anyhow!("Invalid block device path"))?;
1007        let (client_end, server_end) = fidl::endpoints::create_endpoints();
1008        match fdio::service_connect(
1009            &format!("{}/fuchsia.storage.block.Block", path),
1010            server_end.into(),
1011        ) {
1012            Ok(()) => (),
1013            Err(e) => {
1014                log_warn!("Failed to connect to block device at {}: {}", path, e);
1015                continue;
1016            }
1017        }
1018        system_task.kernel().remote_block_device_registry.create_remote_block_device(
1019            locked,
1020            system_task,
1021            entry.file_name().to_str().unwrap(),
1022            client_end,
1023        )?;
1024    }
1025    Ok(())
1026}
1027
/// Polls every 100ms until `startup_file_path` exists, returning an error if
/// the init task (`init_tid`) exits before the file appears or if the lookup
/// fails with anything other than `ENOENT`.
async fn wait_for_init_file(
    startup_file_path: &str,
    current_task: &CurrentTask,
    init_tid: tid_t,
) -> Result<(), Error> {
    // TODO(https://fxbug.dev/42178400): Use inotify machinery to wait for the file.
    loop {
        fasync::Timer::new(fasync::MonotonicDuration::from_millis(100).after_now()).await;

        // The lookup runs under internal-operation credentials so that this
        // kernel-internal check is not subject to the task's own security
        // restrictions.
        let creds = security::creds_start_internal_operation(current_task);
        if let Some(result) = current_task.override_creds(creds, || {
            let root = current_task.fs().root();
            let mut context = LookupContext::default();

            match current_task.lookup_path(
                current_task.kernel().kthreads.unlocked_for_async().deref_mut(),
                &mut context,
                root,
                startup_file_path.into(),
            ) {
                // File exists: init has signaled readiness.
                Ok(_) => return Some(Ok(())),
                // Not created yet; fall through to the liveness check below.
                Err(error) if error == ENOENT => {}
                // Any other lookup failure is fatal.
                Err(error) => return Some(Err(anyhow::Error::from(error))),
            };

            // If init has exited, the file will never appear; stop polling.
            if current_task.get_task(init_tid).upgrade().is_none() {
                return Some(Err(anyhow!(
                    "Init task terminated before startup_file_path was ready"
                )));
            }

            // `None` means "keep polling".
            None
        }) {
            return result;
        }
    }
}
1065
1066async fn serve_runtime_dir(runtime_dir: ServerEnd<fio::DirectoryMarker>) {
1067    let mut fs = fuchsia_component::server::ServiceFs::new();
1068    match create_job_id_vmo() {
1069        Ok(vmo) => {
1070            fs.dir("elf").add_vmo_file_at("job_id", vmo);
1071        }
1072        Err(e) => log_error!(e:%; "failed to create vmo with job id for debuggers"),
1073    }
1074    match fs.serve_connection(runtime_dir) {
1075        Ok(_) => {
1076            fs.add_fidl_service(|job_requests: TaskProviderRequestStream| {
1077                fuchsia_async::Task::local(async move {
1078                    if let Err(e) = serve_task_provider(job_requests).await {
1079                        log_warn!(e:?; "Error serving TaskProvider");
1080                    }
1081                })
1082                .detach();
1083            });
1084            fs.collect::<()>().await;
1085        }
1086        Err(e) => log_error!("Couldn't serve runtime directory: {e:?}"),
1087    }
1088}
1089
1090fn create_job_id_vmo() -> Result<zx::Vmo, Error> {
1091    let job_id = fuchsia_runtime::job_default().koid().context("reading own job koid")?;
1092    let job_id_str = job_id.raw_koid().to_string();
1093    let job_id_vmo = zx::Vmo::create(job_id_str.len() as u64).context("creating job id vmo")?;
1094    job_id_vmo.write(job_id_str.as_bytes(), 0).context("write job id to vmo")?;
1095    Ok(job_id_vmo)
1096}
1097
1098async fn serve_task_provider(mut job_requests: TaskProviderRequestStream) -> Result<(), Error> {
1099    while let Some(request) = job_requests.next().await {
1100        match request.context("getting next TaskProvider request")? {
1101            TaskProviderRequest::GetJob { responder } => {
1102                responder
1103                    .send(
1104                        fuchsia_runtime::job_default()
1105                            .duplicate_handle(zx::Rights::SAME_RIGHTS)
1106                            .map_err(|s| s.into_raw()),
1107                    )
1108                    .context("sending job for runtime dir")?;
1109            }
1110            unknown => bail!("Unknown TaskProvider method {unknown:?}"),
1111        }
1112    }
1113    Ok(())
1114}
1115
#[cfg(test)]
mod test {
    //! Tests for `wait_for_init_file`, covering its three outcomes: the file
    //! already exists, the file appears after waiting starts, and the init
    //! task exits before the file is created.
    use super::wait_for_init_file;
    use fuchsia_async as fasync;
    use futures::{SinkExt, StreamExt};
    #[allow(deprecated, reason = "pre-existing usage")]
    use starnix_core::testing::create_kernel_task_and_unlocked;
    use starnix_core::vfs::FdNumber;
    use starnix_uapi::CLONE_FS;
    use starnix_uapi::file_mode::{AccessCheck, FileMode};
    use starnix_uapi::open_flags::OpenFlags;
    use starnix_uapi::signals::SIGCHLD;
    use starnix_uapi::vfs::ResolveFlags;

    // The startup file is created before the wait begins, so the very first
    // poll should find it and complete.
    #[fuchsia::test]
    async fn test_init_file_already_exists() {
        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut sender, mut receiver) = futures::channel::mpsc::unbounded();

        let path = "/path";
        current_task
            .open_file_at(
                locked,
                FdNumber::AT_FDCWD,
                path.into(),
                OpenFlags::CREAT,
                FileMode::default(),
                ResolveFlags::empty(),
                AccessCheck::default(),
            )
            .expect("Failed to create file");

        fasync::Task::local(async move {
            wait_for_init_file(path, &current_task, current_task.get_tid())
                .await
                .expect("failed to wait for file");
            sender.send(()).await.expect("failed to send message");
        })
        .detach();

        // Wait for the file creation to have been detected.
        assert!(receiver.next().await.is_some());
    }

    // The startup file is created only after waiting has begun; the waiter
    // must keep polling and succeed once the file appears.
    #[fuchsia::test]
    async fn test_init_file_wait_required() {
        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut sender, mut receiver) = futures::channel::mpsc::unbounded();

        let init_task = current_task.clone_task_for_test(locked, CLONE_FS as u64, Some(SIGCHLD));
        let path = "/path";

        let test_init_tid = current_task.get_tid();
        fasync::Task::local(async move {
            sender.send(()).await.expect("failed to send message");
            wait_for_init_file(path, &init_task, test_init_tid)
                .await
                .expect("failed to wait for file");
            sender.send(()).await.expect("failed to send message");
        })
        .detach();

        // Wait for message that file check has started.
        assert!(receiver.next().await.is_some());

        // Create the file that is being waited on.
        current_task
            .open_file_at(
                locked,
                FdNumber::AT_FDCWD,
                path.into(),
                OpenFlags::CREAT,
                FileMode::default(),
                ResolveFlags::empty(),
                AccessCheck::default(),
            )
            .expect("Failed to create file");

        // Wait for the file creation to be detected.
        assert!(receiver.next().await.is_some());
    }

    // The init task exits before the startup file is ever created; the waiter
    // must detect the exit and return an error rather than poll forever.
    #[fuchsia::test]
    async fn test_init_exits_before_file_exists() {
        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut sender, mut receiver) = futures::channel::mpsc::unbounded();

        let init_task = current_task.clone_task_for_test(locked, CLONE_FS as u64, Some(SIGCHLD));
        const STARTUP_FILE_PATH: &str = "/path";

        let test_init_tid = init_task.get_tid();
        fasync::Task::local(async move {
            sender.send(()).await.expect("failed to send message");
            wait_for_init_file(STARTUP_FILE_PATH, &current_task, test_init_tid)
                .await
                .expect_err("Did not detect init exit");
            sender.send(()).await.expect("failed to send message");
        })
        .detach();

        // Wait for message that file check has started.
        assert!(receiver.next().await.is_some());

        // Drop the `init_task`.
        std::mem::drop(init_task);

        // Wait for the init failure to be detected.
        assert!(receiver.next().await.is_some());
    }
}