// starnix_kernel_runner/container.rs
// Copyright 2022 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

5use crate::{
6    Features, MountAction, expose_root, parse_features, parse_numbered_handles,
7    run_container_features, serve_component_runner, serve_container_controller,
8    serve_graphical_presenter, serve_lutex_controller,
9};
10use anyhow::{Context, Error, anyhow, bail};
11use bootreason::get_or_init_android_bootreason;
12use bstr::{BString, ByteSlice};
13use devicetree::parser::parse_devicetree;
14use devicetree::types::Devicetree;
15use fidl::endpoints::{ControlHandle, RequestStream, ServerEnd};
16use fidl_fuchsia_boot as fboot;
17use fidl_fuchsia_component as fcomponent;
18use fidl_fuchsia_component_runner as frunner;
19use fidl_fuchsia_component_runner::{TaskProviderRequest, TaskProviderRequestStream};
20use fidl_fuchsia_element as felement;
21use fidl_fuchsia_feedback::CrashReporterMarker;
22use fidl_fuchsia_io as fio;
23use fidl_fuchsia_mem as fmem;
24use fidl_fuchsia_memory_attribution as fattribution;
25use fidl_fuchsia_starnix_binder as fbinder;
26use fidl_fuchsia_starnix_container as fstarcontainer;
27use fidl_fuchsia_time_external::AdjustMarker;
28use fuchsia_async as fasync;
29use fuchsia_async::DurationExt;
30use fuchsia_component::client::{connect_to_protocol, connect_to_protocol_sync};
31use fuchsia_component::server::ServiceFs;
32use fuchsia_inspect as inspect;
33use fuchsia_runtime as fruntime;
34use fuchsia_zbi as zbi;
35use futures::channel::oneshot;
36use futures::{FutureExt, StreamExt, TryStreamExt};
37use serde::Deserialize;
38use starnix_container_structured_config::Config as ContainerStructuredConfig;
39use starnix_core::device::remote_block_device::remote_block_device_init;
40use starnix_core::execution::{
41    create_init_process, create_system_task, execute_task_with_prerun_result,
42};
43use starnix_core::fs::fuchsia::new_remotefs_in_root;
44use starnix_core::fs::tmpfs::TmpFs;
45use starnix_core::security;
46use starnix_core::task::container_namespace::ContainerNamespace;
47use starnix_core::task::{
48    CurrentTask, ExitStatus, Kernel, RoleOverrides, SchedulerManager, parse_cmdline,
49};
50use starnix_core::vfs::{FileSystemOptions, FsContext, LookupContext, Namespace, WhatToMount};
51use starnix_logging::{
52    CATEGORY_STARNIX, NAME_CREATE_CONTAINER, log_debug, log_error, log_info, log_warn,
53    trace_duration,
54};
55use starnix_modules::{init_common_devices, register_common_file_systems};
56use starnix_modules_layeredfs::{LayeredFsBuilder, LayeredFsMounts};
57use starnix_modules_magma::get_magma_params;
58use starnix_modules_overlayfs::OverlayStack;
59use starnix_modules_rtc::rtc_device_init;
60use starnix_sync::{Locked, Unlocked};
61use starnix_task_command::TaskCommand;
62use starnix_uapi::errors::{ENOENT, SourceContext};
63use starnix_uapi::open_flags::OpenFlags;
64use starnix_uapi::resource_limits::Resource;
65use starnix_uapi::{errno, tid_t};
66use std::ffi::CString;
67use std::ops::DerefMut;
68use std::sync::Arc;
69use zx::Task as _;
70
71use std::sync::Weak;
72
73use crate::serve_memory_attribution_provider_container;
74use attribution_server::{AttributionServer, AttributionServerHandle};
75use fidl::HandleBased;
76
/// Manages the memory attribution protocol for a Starnix container.
struct ContainerMemoryAttributionManager {
    /// Holds state for the hanging-get attribution protocol.
    /// Created in [ContainerMemoryAttributionManager::new]; observers for individual
    /// connections are minted from it via [ContainerMemoryAttributionManager::new_observer].
    memory_attribution_server: AttributionServerHandle,
}
82
83impl ContainerMemoryAttributionManager {
84    /// Creates a new [ContainerMemoryAttributionManager] from a Starnix kernel and the moniker
85    /// token of the container component.
86    pub fn new(kernel: Weak<Kernel>, component_instance: zx::Event) -> Self {
87        let memory_attribution_server = AttributionServer::new(Box::new(move || {
88            let kernel_ref = match kernel.upgrade() {
89                None => return vec![],
90                Some(k) => k,
91            };
92            attribution_info_for_kernel(kernel_ref.as_ref(), &component_instance)
93        }));
94
95        ContainerMemoryAttributionManager { memory_attribution_server }
96    }
97
98    /// Creates a new observer for the attribution information from this container.
99    pub fn new_observer(
100        &self,
101        control_handle: fattribution::ProviderControlHandle,
102    ) -> attribution_server::Observer {
103        self.memory_attribution_server.new_observer(control_handle)
104    }
105}
106
107/// Generates the attribution information for the Starnix kernel ELF component. The attribution
108/// information for the container is handled by the container component, not the kernel
109/// component itself, even if both are hosted within the same kernel process.
110fn attribution_info_for_kernel(
111    kernel: &Kernel,
112    component_instance: &zx::Event,
113) -> Vec<fattribution::AttributionUpdate> {
114    // Start the server to handle the memory attribution requests for the container, and provide
115    // a handle to get detailed attribution. We start a new task as each incoming connection is
116    // independent.
117    let (client_end, server_end) =
118        fidl::endpoints::create_request_stream::<fattribution::ProviderMarker>();
119    fuchsia_async::Task::spawn(serve_memory_attribution_provider_container(server_end, kernel))
120        .detach();
121
122    let starnix_kernel_id = Some(1);
123    let starnix_kernel_principal = fattribution::NewPrincipal {
124        identifier: starnix_kernel_id,
125        description: Some(fattribution::Description::Part("starnix_kernel".to_string())),
126        principal_type: Some(fattribution::PrincipalType::Part),
127        // This part is created for accounting. It holds the resource used for starnix
128        // kernel operation. It neither has sub-principals, nor publishes attribution,
129        // hence it does not need to be tied to a provider server end.
130        detailed_attribution: None,
131        ..Default::default()
132    };
133
134    let starnix_kernel_attribution = fattribution::UpdatedPrincipal {
135        identifier: starnix_kernel_id, // Recipient.
136        resources: Some(fattribution::Resources::Data(fattribution::Data {
137            resources: vec![fattribution::Resource::ProcessMapped(fattribution::ProcessMapped {
138                process: fuchsia_runtime::process_self().koid().unwrap().raw_koid(),
139                base: 0, // Attribute all the range.
140                len: u64::max_value(),
141                hint_skip_handle_table: false,
142            })],
143        })),
144        ..Default::default()
145    };
146
147    let container_id = Some(2);
148    let new_principal = fattribution::NewPrincipal {
149        identifier: container_id,
150        description: Some(fattribution::Description::Component(
151            component_instance.duplicate_handle(zx::Rights::SAME_RIGHTS).unwrap(),
152        )),
153        principal_type: Some(fattribution::PrincipalType::Runnable),
154        detailed_attribution: Some(client_end),
155        ..Default::default()
156    };
157    let attribution = fattribution::UpdatedPrincipal {
158        identifier: container_id,
159        resources: Some(fattribution::Resources::Data(fattribution::Data {
160            resources: vec![fattribution::Resource::KernelObject(
161                fuchsia_runtime::job_default().koid().unwrap().raw_koid(),
162            )],
163        })),
164        ..Default::default()
165    };
166
167    vec![
168        fattribution::AttributionUpdate::Add(new_principal),
169        fattribution::AttributionUpdate::Add(starnix_kernel_principal),
170        fattribution::AttributionUpdate::Update(attribution),
171        fattribution::AttributionUpdate::Update(starnix_kernel_attribution),
172    ]
173}
174
#[derive(Debug)]
pub struct ContainerStartInfo {
    /// Configuration specified by the component's `program` block.
    pub program: ContainerProgram,

    /// Structured configuration parsed from the component's encoded config data.
    pub config: ContainerStructuredConfig,

    /// The outgoing directory of the container, used to serve protocols on behalf of the container.
    /// For example, the starnix_kernel serves a component runner in the containers' outgoing
    /// directory.
    outgoing_dir: Option<zx::Channel>,

    /// Mapping of top-level namespace entries to an associated channel.
    /// For example, "/svc" to the respective channel.
    pub container_namespace: ContainerNamespace,

    /// The runtime directory of the container, used to provide CF introspection.
    runtime_dir: Option<ServerEnd<fio::DirectoryMarker>>,

    /// An eventpair that debuggers can use to defer the launch of the container.
    break_on_start: Option<zx::EventPair>,

    /// Component moniker token for the container component. This token is used in various protocols
    /// to uniquely identify a component.
    component_instance: Option<zx::Event>,
}
201
/// Error context attached when the container's structured-config data is missing from the
/// component start info; points authors at the CML shard that routes it.
const MISSING_CONFIG_VMO_CONTEXT: &str = concat!(
    "Retrieving container config VMO. ",
    "If this fails, make sure your container CML includes ",
    "//src/starnix/containers/container.shard.cml.",
);
207
208impl ContainerStartInfo {
209    fn new(mut start_info: frunner::ComponentStartInfo) -> Result<Self, Error> {
210        let program = start_info.program.as_ref().context("retrieving program block")?;
211        let program: ContainerProgram =
212            runner::serde::deserialize_program(&program).context("parsing program block")?;
213
214        let encoded_config =
215            start_info.encoded_config.as_ref().context(MISSING_CONFIG_VMO_CONTEXT)?;
216        let config = match encoded_config {
217            fmem::Data::Bytes(b) => ContainerStructuredConfig::from_bytes(b),
218            fmem::Data::Buffer(b) => ContainerStructuredConfig::from_vmo(&b.vmo),
219            other => anyhow::bail!("unknown Data variant {other:?}"),
220        }
221        .context("parsing container structured config")?;
222
223        let ns = start_info.ns.take().context("retrieving container namespace")?;
224        let container_namespace = ContainerNamespace::from(ns);
225
226        let outgoing_dir = start_info.outgoing_dir.take().map(|dir| dir.into_channel());
227        let component_instance = start_info.component_instance;
228
229        Ok(Self {
230            program,
231            config,
232            outgoing_dir,
233            container_namespace,
234            component_instance,
235            break_on_start: start_info.break_on_start,
236            runtime_dir: start_info.runtime_dir,
237        })
238    }
239}
240
#[derive(Debug, Default, Deserialize)]
// Unknown keys in the container's program block are a hard deserialization error.
#[serde(deny_unknown_fields)]
pub struct ContainerProgram {
    /// The name of this container.
    name: String,

    /// The command line for the initial process for this container.
    init: Vec<String>,

    /// The command line for the kernel.
    #[serde(default)]
    kernel_cmdline: String,

    /// The specifications for the file system mounts for this container.
    #[serde(default)]
    mounts: Vec<String>,

    /// The features enabled for this container.
    #[serde(default)]
    pub features: Vec<String>,

    /// The resource limits to apply to this container.
    #[serde(default)]
    rlimits: Vec<String>,

    /// The path that the container will wait until exists before considering itself to have started.
    #[serde(default)]
    startup_file_path: String,

    /// The default seclabel that is applied to components that are instantiated in this container.
    ///
    /// Components can override this by setting the `seclabel` field in their program block.
    #[serde(default)]
    pub default_seclabel: Option<String>,

    /// The default uid that is applied to components that are instantiated in this container.
    ///
    /// Components can override this by setting the `uid` field in their program block.
    /// Defaults to 42 (see `default_uid`).
    #[serde(default = "default_uid")]
    pub default_uid: runner::serde::StoreAsString<u32>,

    /// The default mount options to use when mounting directories from a component's namespace.
    ///
    /// Each string is expected to follow the format: "<namespace_path>:<mount_options>".
    pub default_ns_mount_options: Option<Vec<String>>,

    /// Specifies role names to use for "realtime" tasks based on their process & thread names.
    ///
    /// Zircon's scheduler doesn't support configuring tasks to always preempt non-"realtime"
    /// tasks without specifying a constant bandwidth profile. These profiles specify the period and
    /// expected runtime of a "realtime" task, bounding the amount of work it is allowed to perform
    /// at an elevated "realtime" priority.
    ///
    /// Because constant bandwidth profiles require workload-specific tuning, we can't uniformly
    /// apply a single profile for all "realtime" tasks. Instead, this container configuration
    /// allows us to specify different constant bandwidth profiles for different workloads.
    #[serde(default)]
    task_role_overrides: Vec<TaskSchedulerMapping>,
}
300
/// Specifies a role override for a class of tasks whose process and thread names match provided
/// patterns.
///
/// Deserialized from the `task_role_overrides` entries in the container's program block.
#[derive(Default, Deserialize)]
struct TaskSchedulerMapping {
    /// The role name to use for tasks matching the provided patterns.
    role: String,
    /// A regular expression that will be matched against the process' command.
    process: String,
    /// A regular expression that will be matched against the thread's command.
    thread: String,
}
312
313impl std::fmt::Debug for TaskSchedulerMapping {
314    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
315        write!(f, "process `{}` thread `{}` role `{}`", self.process, self.thread, self.role)
316    }
317}
318
/// Returns the default uid for components that don't specify one in their program block.
// NOTE(review): 42 looks like an arbitrary non-root default — confirm the intended value
// against the container runner's uid conventions.
fn default_uid() -> runner::serde::StoreAsString<u32> {
    runner::serde::StoreAsString(42)
}
322
/// Creates a `CString` from a `&str`.
///
/// # Panics
/// Panics if `str` contains an interior NUL byte, which cannot be represented in a `CString`.
fn to_cstr(str: &str) -> CString {
    // `CString::new` accepts any `Into<Vec<u8>>`, so the intermediate `String` the original
    // allocated via `to_string()` is unnecessary.
    CString::new(str).expect("string must not contain interior NUL bytes")
}
327
/// Everything needed to serve a started container: its start info, the component controller
/// request stream, and the channel on which the init task's result arrives.
#[must_use = "The container must run serve on this config"]
pub struct ContainerServiceConfig {
    /// Start info for the container; its outgoing directory is served by `Container::serve`.
    start_info: ContainerStartInfo,
    /// Request stream for the container component's `ComponentController` protocol.
    request_stream: frunner::ComponentControllerRequestStream,
    /// Receives the exit status (or error) of the container's init task.
    receiver: oneshot::Receiver<Result<ExitStatus, Error>>,
}
334
pub struct Container {
    /// The `Kernel` object that is associated with the container.
    pub kernel: Arc<Kernel>,

    /// Serves the hanging-get memory attribution protocol for this container.
    memory_attribution_manager: ContainerMemoryAttributionManager,

    /// Inspect node holding information about the state of the container.
    _node: inspect::Node,

    /// Until negative trait bound are implemented, using `*mut u8` to prevent transferring
    /// Container across threads.
    _thread_bound: std::marker::PhantomData<*mut u8>,
}
348
impl Container {
    /// Returns the kernel's system task, used to perform operations on behalf of the container.
    pub fn system_task(&self) -> &CurrentTask {
        self.kernel.kthreads.system_task()
    }

    /// Serves the container's outgoing directory: the [ExposedServices] protocols under `svc/`
    /// and the root of the container's filesystem under `fs_root`.
    ///
    /// Incoming connections are handled concurrently; the future resolves when the
    /// connection stream ends. Returns `Ok(())` immediately when `outgoing_dir` is `None`.
    async fn serve_outgoing_directory(
        &self,
        outgoing_dir: Option<zx::Channel>,
    ) -> Result<(), Error> {
        if let Some(outgoing_dir) = outgoing_dir {
            // Add `ComponentRunner` to the exposed services of the container, and then serve the
            // outgoing directory.
            let mut fs = ServiceFs::new_local();
            fs.dir("svc")
                .add_fidl_service(ExposedServices::ComponentRunner)
                .add_fidl_service(ExposedServices::ContainerController)
                .add_fidl_service(ExposedServices::GraphicalPresenter)
                .add_fidl_service(ExposedServices::LutexController);

            // Expose the root of the container's filesystem.
            let (fs_root, fs_root_server_end) = fidl::endpoints::create_proxy();
            fs.add_remote("fs_root", fs_root);
            expose_root(
                self.kernel.kthreads.unlocked_for_async().deref_mut(),
                self.system_task(),
                fs_root_server_end,
            )?;

            fs.serve_connection(outgoing_dir.into()).map_err(|_| errno!(EINVAL))?;

            // Dispatch each incoming connection to the matching service handler, with no
            // limit on concurrency.
            fs.for_each_concurrent(None, |request_stream| async {
                match request_stream {
                    ExposedServices::ComponentRunner(request_stream) => {
                        match serve_component_runner(request_stream, self.system_task()).await {
                            Ok(_) => {}
                            Err(e) => {
                                log_error!("Error serving component runner: {:?}", e);
                            }
                        }
                    }
                    ExposedServices::ContainerController(request_stream) => {
                        serve_container_controller(request_stream, self.system_task())
                            .await
                            .expect("failed to start container.")
                    }
                    ExposedServices::GraphicalPresenter(request_stream) => {
                        serve_graphical_presenter(request_stream, &self.kernel)
                            .await
                            .expect("failed to start GraphicalPresenter.")
                    }
                    ExposedServices::LutexController(request_stream) => {
                        serve_lutex_controller(request_stream, self.system_task())
                            .await
                            .expect("failed to start LutexController.")
                    }
                }
            })
            .await
        }
        Ok(())
    }

    /// Serves the container: runs the outgoing-directory server and the component controller
    /// concurrently, returning the outgoing-directory result once both complete.
    pub async fn serve(&self, service_config: ContainerServiceConfig) -> Result<(), Error> {
        let (r, _) = futures::join!(
            self.serve_outgoing_directory(service_config.start_info.outgoing_dir),
            server_component_controller(
                self.kernel.clone(),
                service_config.request_stream,
                service_config.receiver
            )
        );
        r
    }

    /// Creates a new observer for this container's memory attribution information.
    pub fn new_memory_attribution_observer(
        &self,
        control_handle: fattribution::ProviderControlHandle,
    ) -> attribution_server::Observer {
        self.memory_attribution_manager.new_observer(control_handle)
    }
}
430
/// The services that are exposed in the container component's outgoing directory.
///
/// Each variant wraps the request stream of the corresponding FIDL protocol; dispatch
/// happens in `Container::serve_outgoing_directory`.
enum ExposedServices {
    /// `fuchsia.component.runner/ComponentRunner` connections.
    ComponentRunner(frunner::ComponentRunnerRequestStream),
    /// `fuchsia.starnix.container/Controller` connections.
    ContainerController(fstarcontainer::ControllerRequestStream),
    /// `fuchsia.element/GraphicalPresenter` connections.
    GraphicalPresenter(felement::GraphicalPresenterRequestStream),
    /// `fuchsia.starnix.binder/LutexController` connections.
    LutexController(fbinder::LutexControllerRequestStream),
}
438
/// The result of running the container's init task: its exit status, or the error that
/// prevented it from completing.
type TaskResult = Result<ExitStatus, Error>;

/// Serves the container component's `ComponentController` protocol and drives kernel
/// shutdown when the container should stop.
///
/// Listens concurrently for controller requests (`Stop` / `Kill`) and for completion of the
/// init task (via `task_complete`). Any event on either stream triggers kernel shutdown.
async fn server_component_controller(
    kernel: Arc<Kernel>,
    request_stream: frunner::ComponentControllerRequestStream,
    task_complete: oneshot::Receiver<TaskResult>,
) {
    // Publish the controller's control handle on the kernel.
    *kernel.container_control_handle.lock() = Some(request_stream.control_handle());

    // Tags identifying which of the two merged streams an event came from.
    enum Event<T, U> {
        Controller(T),
        Completion(U),
    }

    // Merge the controller requests and the init-task completion into a single stream.
    let mut stream = futures::stream::select(
        request_stream.map(Event::Controller),
        task_complete.into_stream().map(Event::Completion),
    );

    while let Some(event) = stream.next().await {
        match event {
            Event::Controller(Ok(frunner::ComponentControllerRequest::Stop { .. })) => {
                log_info!("Stopping the container.");
            }
            Event::Controller(Ok(frunner::ComponentControllerRequest::Kill { control_handle })) => {
                log_info!("Killing the container's job.");
                // Report InstanceDied as the channel epitaph before killing our own job.
                control_handle.shutdown_with_epitaph(zx::Status::from_raw(
                    fcomponent::Error::InstanceDied.into_primitive() as i32,
                ));
                fruntime::job_default().kill().expect("Failed to kill job");
            }
            Event::Controller(Ok(frunner::ComponentControllerRequest::_UnknownMethod {
                ordinal,
                method_type,
                ..
            })) => {
                log_error!(ordinal, method_type:?; "Unknown component controller request received.");
            }
            Event::Controller(Err(e)) => {
                log_warn!(e:?; "Container component controller channel encountered an error.");
            }
            Event::Completion(result) => {
                log_info!(result:?; "init process exited.");
            }
        }

        // We treat any event in the stream as an invitation to shut down.
        if !kernel.is_shutting_down() {
            kernel.shut_down();
        }
    }

    log_debug!("done listening for container-terminating events");

    // In case the stream ended without an event, shut down the kernel here.
    if !kernel.is_shutting_down() {
        kernel.shut_down();
    }
}
498
/// Waits for the first `Start` request on the `ComponentRunner` stream and creates the
/// container it describes.
///
/// Returns the created [Container] together with the [ContainerServiceConfig] needed to
/// serve it. Fails if the stream ends before a `Start` request arrives or if container
/// creation fails.
pub async fn create_component_from_stream(
    mut request_stream: frunner::ComponentRunnerRequestStream,
    kernel_extra_features: Vec<String>,
) -> Result<(Container, ContainerServiceConfig), Error> {
    if let Some(event) = request_stream.try_next().await? {
        match event {
            frunner::ComponentRunnerRequest::Start { start_info, controller, .. } => {
                let request_stream = controller.into_stream();
                let mut start_info = ContainerStartInfo::new(start_info)?;
                // The sender half is handed to the container so it can report the init
                // task's result; the receiver is served alongside the controller stream.
                let (sender, receiver) = oneshot::channel::<TaskResult>();
                let container = create_container(&mut start_info, &kernel_extra_features, sender)
                    .await
                    .with_source_context(|| {
                        format!("creating container \"{}\"", start_info.program.name)
                    })?;
                let service_config =
                    ContainerServiceConfig { start_info, request_stream, receiver };
                return Ok((container, service_config));
            }
            frunner::ComponentRunnerRequest::_UnknownMethod { ordinal, .. } => {
                log_warn!("Unknown ComponentRunner request: {ordinal}");
            }
        }
    }
    bail!("did not receive Start request");
}
525
526async fn get_bootargs(device_tree: &Devicetree) -> Result<String, Error> {
527    device_tree
528        .root_node
529        .find("chosen")
530        .and_then(|n| {
531            n.get_property("bootargs").map(|p| {
532                let end =
533                    if p.value.last() == Some(&0) { p.value.len() - 1 } else { p.value.len() };
534                match std::str::from_utf8(&p.value[..end]) {
535                    Ok(s) => Ok(s.to_owned()),
536                    Err(e) => {
537                        log_warn!("Bootargs are not valid UTF-8: {e}");
538                        Err(anyhow!("Bootargs are not valid UTF-8"))
539                    }
540                }
541            })
542        })
543        .context("Couldn't find bootargs")?
544}
545
546async fn get_bootitems() -> Result<std::vec::Vec<u8>, Error> {
547    let items =
548        connect_to_protocol::<fboot::ItemsMarker>().context("Failed to connect to boot items")?;
549
550    let items_response = items
551        .get2(zbi::ZbiType::DeviceTree.into_raw(), None)
552        .await
553        .context("FIDL: Failed to get devicetree item")?
554        .map_err(|e| anyhow!("Failed to get devicetree item {:?}", e))?;
555
556    let Some(item) = items_response.last() else {
557        return Err(anyhow!("Failed to get items"));
558    };
559
560    let devicetree_vmo = &item.payload;
561    let bytes = devicetree_vmo
562        .read_to_vec(0, item.length as u64)
563        .context("Failed to read devicetree vmo")?;
564
565    Ok(bytes)
566}
567
568async fn create_container(
569    start_info: &mut ContainerStartInfo,
570    kernel_extra_features: &[String],
571    task_complete: oneshot::Sender<TaskResult>,
572) -> Result<Container, Error> {
573    trace_duration!(CATEGORY_STARNIX, NAME_CREATE_CONTAINER);
574    const DEFAULT_INIT: &str = "/container/init";
575
576    let pkg_channel = start_info.container_namespace.get_namespace_channel("/pkg").unwrap();
577    let pkg_dir_proxy = fio::DirectorySynchronousProxy::new(pkg_channel);
578
579    let device_tree: Option<Devicetree> = match get_bootitems().await {
580        Ok(items) => match parse_devicetree(&items) {
581            Ok(device_tree) => Some(device_tree),
582            Err(e) => {
583                log_warn!("Failed to parse devicetree: {e:?}");
584                None
585            }
586        },
587        Err(e) => {
588            log_warn!("Failed to get boot items for devicetree: {e:?}");
589            None
590        }
591    };
592    let mut features = parse_features(&start_info, kernel_extra_features)?;
593
594    log_debug!("Creating container with {:#?}", features);
595    let mut kernel_cmdline = BString::from(start_info.program.kernel_cmdline.as_bytes());
596    let mut android_provided_bootreason = None;
597
598    if features.android_serialno {
599        if let Some(device_tree) = &device_tree {
600            match get_bootargs(device_tree).await {
601                Ok(args) => {
602                    for item in parse_cmdline(&args) {
603                        if item.starts_with("androidboot.force_normal_boot") {
604                            // TODO(https://fxbug.dev/424152964): Support force_normal_boot.
605                            continue;
606                        }
607                        if item.starts_with("androidboot.bootreason") && features.android_bootreason
608                        {
609                            // androidboot.bootreason is sourced from the Fuchsia reboot reason.
610                            // It is still useful to log it from userspace to learn what the
611                            // possible values are.
612                            log_info!("Original devicetree bootarg {:?}", item);
613                            if let Some((_, v)) = item.split_once('=') {
614                                android_provided_bootreason = Some(v.to_string());
615                            }
616                            continue;
617                        }
618                        kernel_cmdline.extend(b" ");
619                        kernel_cmdline.extend(item.bytes());
620                    }
621                }
622                Err(err) => log_warn!("could not get bootargs: {err:?}"),
623            }
624        } else {
625            log_warn!("No devicetree available to get bootargs for android.serialno");
626        }
627    }
628    if features.android_bootreason {
629        kernel_cmdline.extend(b" androidboot.bootreason=");
630
631        let tmp_channel = start_info.container_namespace.get_namespace_channel("/tmp_lifecycle");
632        let tmp_proxy = match tmp_channel {
633            Ok(channel) => {
634                Some(fio::DirectoryProxy::new(fidl::AsyncChannel::from_channel(channel)))
635            }
636            _ => None,
637        };
638
639        match get_or_init_android_bootreason(tmp_proxy, android_provided_bootreason).await {
640            Ok(reason) => {
641                kernel_cmdline.extend(reason.bytes());
642            }
643            Err(err) => {
644                log_warn!("could not get android bootreason: {err:?}. falling back to 'unknown'");
645                kernel_cmdline.extend(b"unknown");
646            }
647        }
648    }
649    if let Some(supported_vendors) = &features.magma_supported_vendors {
650        kernel_cmdline.extend(b" ");
651        let params = get_magma_params(supported_vendors);
652        kernel_cmdline.extend(&*params);
653    }
654
655    // Check whether we actually have access to a role manager by trying to set our own
656    // thread's role.
657    let mut task_mappings = RoleOverrides::new();
658    for m in &start_info.program.task_role_overrides {
659        task_mappings.add(m.process.clone(), m.thread.clone(), m.role.clone());
660    }
661    let task_mappings = task_mappings.build().context("adding custom task role")?;
662    let scheduler_manager = SchedulerManager::new(task_mappings);
663
664    let crash_reporter = connect_to_protocol::<CrashReporterMarker>().unwrap();
665
666    let node = inspect::component::inspector().root().create_child("container");
667    let kernel_node = node.create_child("kernel");
668    kernel_node.record_int("created_at", zx::MonotonicInstant::get().into_nanos());
669    features.record_inspect(&kernel_node);
670
671    let security_state = security::kernel_init_security(
672        features.selinux.enabled,
673        features.selinux.options.clone(),
674        features.selinux.exceptions.clone(),
675        &kernel_node,
676    );
677
678    // `config.enable_utc_time_adjustment` is set through config capability
679    // `fuchsia.time.config.WritableUTCTime`.
680    let time_adjustment_proxy = if features.enable_utc_time_adjustment {
681        connect_to_protocol_sync::<AdjustMarker>()
682            .map_err(|e| log_error!("could not connect to fuchsia.time.external/Adjust: {:?}", e))
683            .ok()
684    } else {
685        // See the comment above. UTC adjustment is a per-product setting.
686        log_info!("UTC adjustment is forbidden.");
687        None
688    };
689
690    log_info!("final kernel cmdline: {kernel_cmdline:?}");
691    kernel_node.record_string("cmdline", kernel_cmdline.to_str_lossy());
692
693    let kernel = Kernel::new(
694        kernel_cmdline,
695        features.kernel.clone(),
696        std::mem::take(&mut features.system_limits),
697        start_info.container_namespace.try_clone()?,
698        scheduler_manager,
699        Some(crash_reporter),
700        kernel_node,
701        security_state,
702        time_adjustment_proxy,
703        device_tree,
704    )
705    .with_source_context(|| format!("creating Kernel: {}", &start_info.program.name))?;
706    let (fs_context, feature_mounts) = create_fs_context(
707        kernel.kthreads.unlocked_for_async().deref_mut(),
708        &kernel,
709        &features,
710        start_info,
711        &pkg_dir_proxy,
712    )
713    .source_context("creating FsContext")?;
714    let init_pid = kernel.pids.write().allocate_pid();
715    // Lots of software assumes that the pid for the init process is 1.
716    debug_assert_eq!(init_pid, 1);
717
718    let system_task = create_system_task(
719        kernel.kthreads.unlocked_for_async().deref_mut(),
720        &kernel,
721        Arc::clone(&fs_context),
722    )
723    .source_context("create system task")?;
724    // The system task gives pid 2. This value is less critical than giving
725    // pid 1 to init, but this value matches what is supposed to happen.
726    debug_assert_eq!(system_task.tid, 2);
727
728    feature_mounts(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
729        .source_context("mounting feature filesystems")?;
730
731    kernel.kthreads.init(system_task).source_context("initializing kthreads")?;
732    let system_task = kernel.kthreads.system_task();
733
734    kernel.syslog.init(&system_task).source_context("initializing syslog")?;
735
736    kernel.hrtimer_manager.init(system_task).source_context("initializing HrTimer manager")?;
737
738    log_info!("Initializing suspend resume manager.");
739    if let Err(e) = kernel.suspend_resume_manager.init(&system_task) {
740        log_warn!("Suspend/Resume manager initialization failed: ({e:?})");
741    }
742
743    // Real Time clock is present in all configuration.
744    log_info!("Initializing RTC device.");
745    rtc_device_init(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
746        .context("in starnix_kernel_runner, while initializing RTC")?;
747
748    // Register common devices and add them in sysfs and devtmpfs.
749    log_info!("Registering devices and filesystems.");
750    init_common_devices(kernel.kthreads.unlocked_for_async().deref_mut(), &kernel)?;
751    register_common_file_systems(kernel.kthreads.unlocked_for_async().deref_mut(), &kernel);
752
753    log_info!("Mounting filesystems.");
754    mount_filesystems(
755        kernel.kthreads.unlocked_for_async().deref_mut(),
756        &system_task,
757        start_info,
758        &pkg_dir_proxy,
759    )
760    .source_context("mounting filesystems")?;
761
762    // Run all common features that were specified in the .cml.
763    {
764        log_info!("Running container features.");
765        run_container_features(
766            kernel.kthreads.unlocked_for_async().deref_mut(),
767            &system_task,
768            &features,
769        )?;
770    }
771
772    log_info!("Initializing remote block devices.");
773    init_remote_block_devices(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
774        .source_context("initalizing remote block devices")?;
775
776    // If there is an init binary path, run it, optionally waiting for the
777    // startup_file_path to be created. The task struct is still used
778    // to initialize the system up until this point, regardless of whether
779    // or not there is an actual init to be run.
780    let argv = if start_info.program.init.is_empty() {
781        vec![DEFAULT_INIT.to_string()]
782    } else {
783        start_info.program.init.clone()
784    }
785    .iter()
786    .map(|s| to_cstr(s))
787    .collect::<Vec<_>>();
788
789    log_info!("Opening start_info file.");
790    let executable = system_task
791        .open_file(
792            kernel.kthreads.unlocked_for_async().deref_mut(),
793            argv[0].as_bytes().into(),
794            OpenFlags::RDONLY,
795        )
796        .with_source_context(|| format!("opening init: {:?}", &argv[0]))?;
797
798    let initial_name = if start_info.program.init.is_empty() {
799        TaskCommand::default()
800    } else {
801        TaskCommand::new(start_info.program.init[0].as_bytes())
802    };
803
804    let rlimits = parse_rlimits(&start_info.program.rlimits)?;
805
806    // Serve the runtime directory.
807    log_info!("Starting runtime directory.");
808    if let Some(runtime_dir) = start_info.runtime_dir.take() {
809        kernel.kthreads.spawn_future(
810            move || async move { serve_runtime_dir(runtime_dir).await },
811            "serve_runtime_dir",
812        );
813    }
814
815    // At this point the runtime environment has been prepared but nothing is actually running yet.
816    // Pause here if a debugger needs time to attach to the job.
817    if let Some(break_on_start) = start_info.break_on_start.take() {
818        log_info!("Waiting for signal from debugger before spawning init process...");
819        if let Err(e) =
820            fuchsia_async::OnSignals::new(break_on_start, zx::Signals::EVENTPAIR_PEER_CLOSED).await
821        {
822            log_warn!(e:%; "Received break_on_start eventpair but couldn't wait for PEER_CLOSED.");
823        }
824    }
825
826    log_info!("Creating init process.");
827    let init_task = create_init_process(
828        kernel.kthreads.unlocked_for_async().deref_mut(),
829        &kernel,
830        init_pid,
831        initial_name,
832        Arc::clone(&fs_context),
833        &rlimits,
834    )
835    .with_source_context(|| format!("creating init task: {:?}", &start_info.program.init))?;
836
837    execute_task_with_prerun_result(
838        kernel.kthreads.unlocked_for_async().deref_mut(),
839        init_task,
840        move |locked, init_task| {
841            parse_numbered_handles(locked, init_task, None, &init_task.live().files).expect("");
842            init_task.exec(locked, executable, argv[0].clone(), argv.clone(), vec![])
843        },
844        move |result| {
845            log_info!("Finished running init process: {:?}", result);
846            let _ = task_complete.send(result);
847        },
848        None,
849    )?;
850
851    if !start_info.program.startup_file_path.is_empty() {
852        wait_for_init_file(&start_info.program.startup_file_path, &system_task, init_pid).await?;
853    };
854
855    let memory_attribution_manager = ContainerMemoryAttributionManager::new(
856        Arc::downgrade(&kernel),
857        start_info.component_instance.take().ok_or_else(|| Error::msg("No component instance"))?,
858    );
859
860    Ok(Container {
861        kernel,
862        memory_attribution_manager,
863        _node: node,
864        _thread_bound: Default::default(),
865    })
866}
867
868fn create_fs_context(
869    locked: &mut Locked<Unlocked>,
870    kernel: &Kernel,
871    features: &Features,
872    start_info: &ContainerStartInfo,
873    pkg_dir_proxy: &fio::DirectorySynchronousProxy,
874) -> Result<(Arc<FsContext>, LayeredFsMounts), Error> {
875    // The mounts are applied in the order listed. Mounting will fail if the designated mount
876    // point doesn't exist in a previous mount. The root must be first so other mounts can be
877    // applied on top of it.
878    let mut mounts_iter =
879        start_info.program.mounts.iter().chain(start_info.config.additional_mounts.iter());
880    let root = MountAction::new_for_root(
881        locked,
882        kernel,
883        pkg_dir_proxy,
884        mounts_iter.next().ok_or_else(|| anyhow!("Mounts list is empty"))?,
885    )?;
886    if root.path != "/" {
887        anyhow::bail!("First mount in mounts list is not the root");
888    }
889
890    let mut builder = LayeredFsBuilder::new(root.fs);
891    if features.container {
892        // /container/component will be a tmpfs where component using the starnix kernel will have their
893        // package mounted.
894        let component_tmpfs_options = FileSystemOptions {
895            params: kernel
896                .features
897                .ns_mount_options("#component_tmpfs")
898                .context("#component_tmpfs options")?,
899            ..Default::default()
900        };
901        let component_tmpfs = TmpFs::new_fs_with_options(locked, kernel, component_tmpfs_options)?;
902
903        // /container will mount the container pkg
904        let container_remotefs_options = FileSystemOptions {
905            source: "data".into(),
906            params: kernel.features.ns_mount_options("#container").context("#container options")?,
907            ..Default::default()
908        };
909        let container_remotefs = new_remotefs_in_root(
910            locked,
911            kernel,
912            pkg_dir_proxy,
913            container_remotefs_options,
914            fio::PERM_READABLE | fio::PERM_EXECUTABLE,
915        )?;
916
917        builder.add("/container", container_remotefs);
918        builder.add("/container/component", component_tmpfs);
919    }
920    if features.custom_artifacts {
921        let mount_options = FileSystemOptions {
922            params: kernel
923                .features
924                .ns_mount_options("#custom_artifacts")
925                .context("#custom_artifacts options")?,
926            ..Default::default()
927        };
928        let fs = TmpFs::new_fs_with_options(locked, kernel, mount_options)?;
929        builder.add("/custom_artifacts", fs);
930    }
931    if features.test_data {
932        let mount_options = FileSystemOptions {
933            params: kernel.features.ns_mount_options("#test_data").context("#test_data options")?,
934            ..Default::default()
935        };
936        let fs = TmpFs::new_fs_with_options(locked, kernel, mount_options)?;
937        builder.add("/test_data", fs);
938    }
939
940    let (mut root_fs, feature_mounts) = builder.build(locked, kernel);
941    if features.rootfs_rw {
942        root_fs = OverlayStack::wrap_fs_in_writable_layer(locked, kernel, root_fs)?;
943    }
944
945    Ok((FsContext::new(Namespace::new_with_flags(root_fs, root.flags)), feature_mounts))
946}
947
948fn parse_rlimits(rlimits: &[String]) -> Result<Vec<(Resource, u64)>, Error> {
949    let mut res = Vec::new();
950
951    for rlimit in rlimits {
952        let (key, value) =
953            rlimit.split_once('=').ok_or_else(|| anyhow!("Invalid rlimit: {rlimit}"))?;
954        let value = value.parse::<u64>()?;
955        let kv = match key {
956            "RLIMIT_NOFILE" => (Resource::NOFILE, value),
957            "RLIMIT_RTPRIO" => (Resource::RTPRIO, value),
958            _ => bail!("Unknown rlimit: {key}"),
959        };
960        res.push(kv);
961    }
962
963    Ok(res)
964}
965
966fn mount_filesystems(
967    locked: &mut Locked<Unlocked>,
968    system_task: &CurrentTask,
969    start_info: &ContainerStartInfo,
970    pkg_dir_proxy: &fio::DirectorySynchronousProxy,
971) -> Result<(), Error> {
972    // Skip the first mount, that was used to create the root filesystem.
973    let mut mounts_iter =
974        start_info.program.mounts.iter().chain(start_info.config.additional_mounts.iter());
975    let _ = mounts_iter.next();
976    for mount_spec in mounts_iter {
977        let action = MountAction::from_spec(locked, system_task, pkg_dir_proxy, mount_spec)
978            .with_source_context(|| format!("creating filesystem from spec: {}", &mount_spec))?;
979        let mount_point = system_task
980            .lookup_path_from_root(locked, action.path.as_ref())
981            .with_source_context(|| format!("lookup path from root: {}", action.path))?;
982        mount_point.mount(WhatToMount::Fs(action.fs), action.flags)?;
983    }
984    Ok(())
985}
986
987fn init_remote_block_devices(
988    locked: &mut Locked<Unlocked>,
989    system_task: &CurrentTask,
990) -> Result<(), Error> {
991    remote_block_device_init(locked, system_task);
992    let entries = match std::fs::read_dir("/block") {
993        Ok(entries) => entries,
994        Err(e) => {
995            log_warn!("Failed to read block directory: {}", e);
996            return Ok(());
997        }
998    };
999    for entry in entries {
1000        let entry = entry?;
1001        let path_buf = entry.path();
1002        let path = path_buf.to_str().ok_or_else(|| anyhow!("Invalid block device path"))?;
1003        let (client_end, server_end) = fidl::endpoints::create_endpoints();
1004        match fdio::service_connect(
1005            &format!("{}/fuchsia.storage.block.Block", path),
1006            server_end.into(),
1007        ) {
1008            Ok(()) => (),
1009            Err(e) => {
1010                log_warn!("Failed to connect to block device at {}: {}", path, e);
1011                continue;
1012            }
1013        }
1014        system_task.kernel().remote_block_device_registry.create_remote_block_device(
1015            locked,
1016            system_task,
1017            entry.file_name().to_str().unwrap(),
1018            client_end,
1019        )?;
1020    }
1021    Ok(())
1022}
1023
/// Polls until `startup_file_path` exists in `current_task`'s namespace, or
/// until the task identified by `init_tid` has exited — whichever comes first.
///
/// Returns `Ok(())` once the file is visible; returns an error if init
/// terminated before creating the file, or if the path lookup fails with
/// anything other than `ENOENT`.
async fn wait_for_init_file(
    startup_file_path: &str,
    current_task: &CurrentTask,
    init_tid: tid_t,
) -> Result<(), Error> {
    // TODO(https://fxbug.dev/42178400): Use inotify machinery to wait for the file.
    loop {
        // Poll at a fixed 100ms cadence.
        fasync::Timer::new(fasync::MonotonicDuration::from_millis(100).after_now()).await;

        // Run the lookup under internal-operation credentials so the check is
        // not subject to the calling task's own security context.
        let creds = security::creds_start_internal_operation(current_task);
        if let Some(result) = current_task.override_creds(creds, || {
            let root = current_task.fs().root();
            let mut context = LookupContext::default();

            match current_task.lookup_path(
                current_task.kernel().kthreads.unlocked_for_async().deref_mut(),
                &mut context,
                root,
                startup_file_path.into(),
            ) {
                // The file exists: we're done.
                Ok(_) => return Some(Ok(())),
                // Not created yet: fall through to the liveness check below.
                Err(error) if error == ENOENT => {}
                // Any other lookup failure is fatal.
                Err(error) => return Some(Err(anyhow::Error::from(error))),
            };

            // The file is absent; if init is already gone it will never appear.
            // Checked *after* the lookup so a file created just before init
            // exits is still observed.
            if current_task.get_task(init_tid).upgrade().is_none() {
                return Some(Err(anyhow!(
                    "Init task terminated before startup_file_path was ready"
                )));
            }

            // Keep polling.
            None
        }) {
            return result;
        }
    }
}
1061
1062async fn serve_runtime_dir(runtime_dir: ServerEnd<fio::DirectoryMarker>) {
1063    let mut fs = fuchsia_component::server::ServiceFs::new();
1064    match create_job_id_vmo() {
1065        Ok(vmo) => {
1066            fs.dir("elf").add_vmo_file_at("job_id", vmo);
1067        }
1068        Err(e) => log_error!(e:%; "failed to create vmo with job id for debuggers"),
1069    }
1070    match fs.serve_connection(runtime_dir) {
1071        Ok(_) => {
1072            fs.add_fidl_service(|job_requests: TaskProviderRequestStream| {
1073                fuchsia_async::Task::local(async move {
1074                    if let Err(e) = serve_task_provider(job_requests).await {
1075                        log_warn!(e:?; "Error serving TaskProvider");
1076                    }
1077                })
1078                .detach();
1079            });
1080            fs.collect::<()>().await;
1081        }
1082        Err(e) => log_error!("Couldn't serve runtime directory: {e:?}"),
1083    }
1084}
1085
1086fn create_job_id_vmo() -> Result<zx::Vmo, Error> {
1087    let job_id = fuchsia_runtime::job_default().koid().context("reading own job koid")?;
1088    let job_id_str = job_id.raw_koid().to_string();
1089    let job_id_vmo = zx::Vmo::create(job_id_str.len() as u64).context("creating job id vmo")?;
1090    job_id_vmo.write(job_id_str.as_bytes(), 0).context("write job id to vmo")?;
1091    Ok(job_id_vmo)
1092}
1093
1094async fn serve_task_provider(mut job_requests: TaskProviderRequestStream) -> Result<(), Error> {
1095    while let Some(request) = job_requests.next().await {
1096        match request.context("getting next TaskProvider request")? {
1097            TaskProviderRequest::GetJob { responder } => {
1098                responder
1099                    .send(
1100                        fuchsia_runtime::job_default()
1101                            .duplicate_handle(zx::Rights::SAME_RIGHTS)
1102                            .map_err(|s| s.into_raw()),
1103                    )
1104                    .context("sending job for runtime dir")?;
1105            }
1106            unknown => bail!("Unknown TaskProvider method {unknown:?}"),
1107        }
1108    }
1109    Ok(())
1110}
1111
#[cfg(test)]
mod test {
    use super::wait_for_init_file;
    use fuchsia_async as fasync;
    use futures::{SinkExt, StreamExt};
    #[allow(deprecated, reason = "pre-existing usage")]
    use starnix_core::testing::create_kernel_task_and_unlocked;
    use starnix_core::vfs::FdNumber;
    use starnix_uapi::CLONE_FS;
    use starnix_uapi::file_mode::{AccessCheck, FileMode};
    use starnix_uapi::open_flags::OpenFlags;
    use starnix_uapi::signals::SIGCHLD;
    use starnix_uapi::vfs::ResolveFlags;

    // wait_for_init_file should resolve immediately (within one poll) when the
    // startup file already exists before the wait begins.
    #[fuchsia::test]
    async fn test_init_file_already_exists() {
        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        // Channel used by the spawned waiter to signal completion back here.
        let (mut sender, mut receiver) = futures::channel::mpsc::unbounded();

        // Create the file up front, before the waiter starts.
        let path = "/path";
        current_task
            .open_file_at(
                locked,
                FdNumber::AT_FDCWD,
                path.into(),
                OpenFlags::CREAT,
                FileMode::default(),
                ResolveFlags::empty(),
                AccessCheck::default(),
            )
            .expect("Failed to create file");

        // The waiter uses the current task's own tid as the "init" tid, so the
        // liveness check cannot trip; only the file check matters here.
        fasync::Task::local(async move {
            wait_for_init_file(path, &current_task, current_task.get_tid())
                .await
                .expect("failed to wait for file");
            sender.send(()).await.expect("failed to send message");
        })
        .detach();

        // Wait for the file creation to have been detected.
        assert!(receiver.next().await.is_some());
    }

    // wait_for_init_file should keep polling until the file appears, then
    // resolve successfully once it is created.
    #[fuchsia::test]
    async fn test_init_file_wait_required() {
        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut sender, mut receiver) = futures::channel::mpsc::unbounded();

        // Clone a second task sharing the fs (CLONE_FS) to run the wait, so the
        // original task can create the file concurrently.
        let init_task = current_task.clone_task_for_test(locked, CLONE_FS as u64, Some(SIGCHLD));
        let path = "/path";

        let test_init_tid = current_task.get_tid();
        fasync::Task::local(async move {
            // First send: tells the test body the waiter is about to start.
            sender.send(()).await.expect("failed to send message");
            wait_for_init_file(path, &init_task, test_init_tid)
                .await
                .expect("failed to wait for file");
            // Second send: the waiter observed the file.
            sender.send(()).await.expect("failed to send message");
        })
        .detach();

        // Wait for message that file check has started.
        assert!(receiver.next().await.is_some());

        // Create the file that is being waited on.
        current_task
            .open_file_at(
                locked,
                FdNumber::AT_FDCWD,
                path.into(),
                OpenFlags::CREAT,
                FileMode::default(),
                ResolveFlags::empty(),
                AccessCheck::default(),
            )
            .expect("Failed to create file");

        // Wait for the file creation to be detected.
        assert!(receiver.next().await.is_some());
    }

    // wait_for_init_file should fail (not hang) when the watched init task
    // exits before the startup file is ever created.
    #[fuchsia::test]
    async fn test_init_exits_before_file_exists() {
        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut sender, mut receiver) = futures::channel::mpsc::unbounded();

        // The cloned task plays the role of "init"; dropping it later simulates
        // init exiting. The file at STARTUP_FILE_PATH is never created.
        let init_task = current_task.clone_task_for_test(locked, CLONE_FS as u64, Some(SIGCHLD));
        const STARTUP_FILE_PATH: &str = "/path";

        let test_init_tid = init_task.get_tid();
        fasync::Task::local(async move {
            sender.send(()).await.expect("failed to send message");
            // Expect an error: init terminates before the file appears.
            wait_for_init_file(STARTUP_FILE_PATH, &current_task, test_init_tid)
                .await
                .expect_err("Did not detect init exit");
            sender.send(()).await.expect("failed to send message");
        })
        .detach();

        // Wait for message that file check has started.
        assert!(receiver.next().await.is_some());

        // Drop the `init_task`.
        std::mem::drop(init_task);

        // Wait for the init failure to be detected.
        assert!(receiver.next().await.is_some());
    }
}