starnix_core/execution/
crash_reporter.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::task::{CurrentTask, ExitStatus};
6use fidl_fuchsia_feedback::{
7    Annotation, CrashReport, CrashReporterProxy, MAX_ANNOTATION_VALUE_LENGTH,
8    MAX_CRASH_SIGNATURE_LENGTH, NativeCrashReport, SpecificCrashReport,
9};
10use fuchsia_inspect::{Inspector, Node};
11use futures::FutureExt;
12use starnix_logging::{
13    CATEGORY_STARNIX, CoreDumpInfo, CoreDumpList, TraceScope, log_error, log_info, log_warn,
14    trace_instant,
15};
16use starnix_sync::Mutex;
17use std::collections::{HashMap, VecDeque};
18use std::sync::Arc;
19use zx::{self as zx, AsHandleRef};
20
21/// The maximum number of crashes we allow to happen for a process within the last
22/// CrashReporter.crash_loop_age_out before we consider it to be crash looping. 8 within 8 minutes
23/// was chosen as a balance between "definitely a crash loop" and "still saves system resources."
24pub const CRASH_LOOP_LIMIT: usize = 8;
25
26/// While throttled, we should still occasionally file a report with a higher "weight" that can
27/// represent the rest of the crashes.
28const REPORT_EVERY_X_WHILE_THROTTLED: u32 = 10;
29
30pub struct CrashReporter {
31    /// Diagnostics information about crashed tasks.
32    core_dumps: CoreDumpList,
33
34    /// Diagnostics information. A mapping from process name -> number of crashes for that process
35    /// that weren't uploaded because of process throttling.
36    throttled_core_dumps: Arc<Mutex<HashMap<String, i64>>>,
37
38    /// Tracks when crashes occurred for each process name.
39    crashes_per_process: Arc<Mutex<HashMap<String, CrashInfo>>>,
40
41    /// Connection to the feedback stack for reporting crashes.
42    proxy: Option<CrashReporterProxy>,
43
44    /// The period before a crash is no longer considered for detecting crash loops.
45    crash_loop_age_out: zx::MonotonicDuration,
46
47    /// Whether excessive crash reports should be throttled.
48    enable_throttling: bool,
49}
50
51pub struct PendingCrashReport {
52    /// The current task's argv.
53    argv: Vec<String>,
54
55    /// The crashed process name.
56    argv0: String,
57
58    /// How many crashes this report represents. For example, a value of 10 would indicate that
59    /// this report will represent 9 other throttled crashes for this process.
60    weight: u32,
61}
62
63impl CrashReporter {
64    pub fn new(
65        inspect_node: &Node,
66        proxy: Option<CrashReporterProxy>,
67        crash_loop_age_out: zx::MonotonicDuration,
68        enable_throttling: bool,
69    ) -> Self {
70        let crash_reporter = Self {
71            core_dumps: CoreDumpList::new(inspect_node.create_child("coredumps")),
72            throttled_core_dumps: Arc::new(Mutex::new(Default::default())),
73            crashes_per_process: Arc::new(Mutex::new(Default::default())),
74            proxy,
75            crash_loop_age_out,
76            enable_throttling,
77        };
78
79        crash_reporter.record_throttling_in_inspect(inspect_node);
80        crash_reporter
81    }
82
83    /// Returns a PendingCrashReport if the crash report should be reported. Otherwise, returns
84    /// None.
85    pub fn begin_crash_report(&self, current_task: &CurrentTask) -> Option<PendingCrashReport> {
86        let argv = current_task
87            .read_argv(MAX_ANNOTATION_VALUE_LENGTH as usize)
88            .unwrap_or_else(|_| vec!["<unknown>".into()])
89            .into_iter()
90            .map(|a| a.to_string())
91            .collect::<Vec<_>>();
92        let argv0 = argv.get(0).map(AsRef::as_ref).unwrap_or_else(|| "<unknown>");
93
94        // Get the filename.
95        let argv0 = argv0.rsplit_once("/").unwrap_or(("", &argv0)).1.to_string();
96
97        self.report_guard(argv, argv0, zx::MonotonicInstant::get())
98    }
99
100    /// Callers should first check whether the crash should be reported via begin_crash_report.
101    pub fn handle_core_dump(
102        &self,
103        current_task: &CurrentTask,
104        exit_status: &ExitStatus,
105        pending_crash_report: PendingCrashReport,
106    ) {
107        trace_instant!(CATEGORY_STARNIX, "RecordCoreDump", TraceScope::Process);
108
109        let argv = pending_crash_report.argv;
110        let argv0 = pending_crash_report.argv0;
111        let process_koid = current_task
112            .thread_group()
113            .process
114            .get_koid()
115            .expect("handles for processes with crashing threads are still valid");
116        let thread_koid = current_task
117            .thread
118            .read()
119            .as_ref()
120            .expect("coredumps occur in tasks with associated threads")
121            .get_koid()
122            .expect("handles for crashing threads are still valid");
123        let linux_pid = current_task.thread_group().leader as i64;
124        let thread_name = current_task.command().to_string();
125        let signal = match exit_status {
126            ExitStatus::CoreDump(s) => s.signal,
127            other => {
128                log_error!(
129                    "only core dump exit statuses should be handled as core dumps, got {other:?}"
130                );
131                return;
132            }
133        };
134
135        // TODO(https://fxbug.dev/356912301) use boot time
136        let uptime = zx::MonotonicInstant::get() - current_task.thread_group().start_time;
137
138        let dump_info = CoreDumpInfo {
139            process_koid,
140            thread_koid,
141            linux_pid,
142            uptime: uptime.into_nanos(),
143            argv: argv.clone(),
144            thread_name: thread_name.clone(),
145            signal: signal.to_string(),
146        };
147        self.core_dumps.record_core_dump(dump_info);
148
149        let mut argv_joined = argv.join(" ");
150        truncate_with_ellipsis(&mut argv_joined, MAX_ANNOTATION_VALUE_LENGTH as usize);
151
152        let mut env_joined = current_task
153            .read_env(MAX_ANNOTATION_VALUE_LENGTH as usize)
154            .unwrap_or_else(|_| vec![])
155            .into_iter()
156            .map(|a| a.to_string())
157            .collect::<Vec<_>>()
158            .join(" ");
159        truncate_with_ellipsis(&mut env_joined, MAX_ANNOTATION_VALUE_LENGTH as usize);
160
161        let signal_str = signal.to_string();
162
163        // Truncate program name to fit in crash signature with a space and signal string added.
164        let max_signature_prefix_len = MAX_CRASH_SIGNATURE_LENGTH as usize - (signal_str.len() + 1);
165        let mut crash_signature = argv0.clone();
166        truncate_with_ellipsis(&mut crash_signature, max_signature_prefix_len);
167        crash_signature.push(' ');
168        crash_signature.push_str(&signal_str);
169
170        let crash_report = CrashReport {
171            crash_signature: Some(crash_signature),
172            program_name: Some(argv0.clone()),
173            program_uptime: Some(uptime.into_nanos()),
174            specific_report: Some(SpecificCrashReport::Native(NativeCrashReport {
175                process_koid: Some(process_koid.raw_koid()),
176                process_name: Some(argv0),
177                thread_koid: Some(thread_koid.raw_koid()),
178                thread_name: Some(thread_name),
179                ..Default::default()
180            })),
181            annotations: Some(vec![
182                // Note that this pid will be different from the Zircon process koid that's visible
183                // to the rest of Fuchsia. We want to include both so that this can be correlated
184                // against debugging artifacts produced by Android code.
185                Annotation { key: "linux.pid".to_string(), value: linux_pid.to_string() },
186                Annotation { key: "linux.argv".to_string(), value: argv_joined },
187                Annotation { key: "linux.env".to_string(), value: env_joined },
188                Annotation { key: "linux.signal".to_string(), value: signal_str },
189            ]),
190            is_fatal: Some(true),
191            weight: Some(pending_crash_report.weight),
192            ..Default::default()
193        };
194
195        if let Some(reporter) = &self.proxy {
196            let reporter = reporter.clone();
197            // Do the actual report in the background since they can take a while to file.
198            current_task.kernel().kthreads.spawn_future(async move || {
199                match reporter.file_report(crash_report).await {
200                    Ok(Ok(_)) => (),
201                    Ok(Err(filing_error)) => {
202                        log_error!(filing_error:?; "Couldn't file crash report.");
203                    }
204                    Err(fidl_error) => log_warn!(
205                        fidl_error:?;
206                        "Couldn't file crash report due to error on underlying channel."
207                    ),
208                };
209            });
210        } else {
211            log_info!(crash_report:?; "no crash reporter available for crash");
212        }
213    }
214
215    /// Locally records that a crash for `process_name` occurred at `runtime` and returns a guard
216    /// for an in-flight report if few enough overall are in-flight, as well as the weight that
217    /// should be assigned to the crash report.
218    ///
219    /// Note: runtime is the total time the device has been on according to the monotonic clock, not
220    /// the amount of time the process was running.
221    fn report_guard(
222        &self,
223        argv: Vec<String>,
224        argv0: String,
225        runtime: zx::MonotonicInstant,
226    ) -> Option<PendingCrashReport> {
227        if !self.enable_throttling {
228            return Some(PendingCrashReport { argv, argv0, weight: 1 });
229        }
230
231        // Locally record that the crash occurred.
232        let mut crashes_per_process = self.crashes_per_process.lock();
233        let crash_info = crashes_per_process.entry(argv0.clone()).or_default();
234        crash_info.crash_runtimes.push_back(runtime);
235
236        crash_info.prune_crash_runtimes(runtime, self.crash_loop_age_out);
237
238        // Even if we're not throttled, we still need to have a weight of 1 so incrementing this
239        // here will let us later use it as the weight.
240        crash_info.num_crashes_while_throttled += 1;
241
242        // Check if this particular process has been filing too many reports.
243        if crash_info.is_throttled_at(runtime, self.crash_loop_age_out)
244            && (crash_info.num_crashes_while_throttled < REPORT_EVERY_X_WHILE_THROTTLED)
245        {
246            log_info!(
247                "Process '{argv0}' is throttled due to suspected crash loop, will fold report into later crash"
248            );
249            *self.throttled_core_dumps.lock().entry(argv0).or_default() += 1;
250            return None;
251        }
252
253        let weight = crash_info.num_crashes_while_throttled;
254        crash_info.num_crashes_while_throttled = 0;
255
256        Some(PendingCrashReport { argv, argv0, weight })
257    }
258
259    fn record_throttling_in_inspect(&self, inspect_node: &Node) {
260        let throttled_core_dumps = self.throttled_core_dumps.clone();
261        let crashes_per_process = self.crashes_per_process.clone();
262        let crash_loop_age_out = self.crash_loop_age_out;
263
264        inspect_node.record_lazy_child("coredumps_throttled", move || {
265            let throttled_core_dumps = throttled_core_dumps.clone();
266            let crashes_per_process = crashes_per_process.clone();
267
268            async move {
269                let inspector = Inspector::default();
270                let mut crashes_per_process = crashes_per_process.lock();
271                let runtime = zx::MonotonicInstant::get();
272
273                for (process, count) in throttled_core_dumps.lock().iter() {
274                    let Some(crash_info) = crashes_per_process.get_mut(process) else {
275                        continue;
276                    };
277
278                    crash_info.prune_crash_runtimes(runtime, crash_loop_age_out);
279
280                    let process_node = inspector.root().create_child(process);
281                    process_node.record_bool(
282                        "currently_throttled",
283                        crash_info.is_throttled_at(runtime, crash_loop_age_out),
284                    );
285                    process_node.record_int("total_throttled_crashes", *count);
286                    if let Some(end) = crash_info.throttling_end(crash_loop_age_out) {
287                        process_node.record_int("throttling_runtime_end_millis", end.into_millis());
288                    }
289
290                    inspector.root().record(process_node);
291                }
292                Ok(inspector)
293            }
294            .boxed()
295        });
296    }
297}
298
299#[derive(Default)]
300struct CrashInfo {
301    /// How many crashes have occurred while throttled. Resets to 0 if the throttling ends or if a
302    /// representative report is uploaded every REPORT_EVERY_X_WHILE_THROTTLED.
303    num_crashes_while_throttled: u32,
304
305    /// When the crashes occurred. Crashes that occurred more than CrashReporter.crash_loop_age_out
306    /// ago may be removed.
307    crash_runtimes: VecDeque<zx::MonotonicInstant>,
308}
309
310impl CrashInfo {
311    /// Whether the process is throttled at a given instant.
312    fn is_throttled_at(
313        &self,
314        runtime: zx::MonotonicInstant,
315        crash_loop_age_out: zx::MonotonicDuration,
316    ) -> bool {
317        self.crash_runtimes.iter().filter(|&&x| (runtime - x) < crash_loop_age_out).count()
318            > CRASH_LOOP_LIMIT
319    }
320
321    /// When a process will no longer be throttled, if it currently is throttled.
322    fn throttling_end(
323        &self,
324        crash_loop_age_out: zx::MonotonicDuration,
325    ) -> Option<zx::MonotonicDuration> {
326        let throttling_end = self.crash_runtimes.iter().nth_back(CRASH_LOOP_LIMIT - 1)?;
327        Some(crash_loop_age_out + zx::Duration::from_nanos(throttling_end.into_nanos()))
328    }
329
330    // Only keeps entries that are within `crash_loop_age_out`.
331    fn prune_crash_runtimes(
332        &mut self,
333        runtime: zx::MonotonicInstant,
334        crash_loop_age_out: zx::MonotonicDuration,
335    ) {
336        self.crash_runtimes.retain(|&x| (runtime - x) < crash_loop_age_out);
337    }
338}
339
340fn truncate_with_ellipsis(s: &mut String, max_len: usize) {
341    if s.len() <= max_len {
342        return;
343    }
344
345    // 3 bytes for ellipsis.
346    let max_content_len = max_len - 3;
347
348    // String::truncate panics if the new max length is in the middle of a character, so we need to
349    // find an appropriate byte boundary.
350    let mut new_len = 0;
351    let mut iter = s.char_indices();
352    while let Some((offset, _)) = iter.next() {
353        if offset > max_content_len {
354            break;
355        }
356        new_len = offset;
357    }
358
359    s.truncate(new_len);
360    s.push_str("...");
361}
362
363#[cfg(test)]
364mod tests {
365    use super::*;
366    use crate::testing::spawn_kernel_and_run;
367
368    const CRASH_LOOP_AGE_OUT: zx::MonotonicDuration = zx::Duration::from_minutes(8);
369
370    #[test]
371    fn truncate_noop_on_max_length_string() {
372        let mut s = String::from("1234567890");
373        let before = s.clone();
374        truncate_with_ellipsis(&mut s, 10);
375        assert_eq!(s, before);
376    }
377
378    #[test]
379    fn truncate_adds_ellipsis() {
380        let mut s = String::from("1234567890");
381        truncate_with_ellipsis(&mut s, 9);
382        assert_eq!(s.len(), 9);
383        assert_eq!(s, "123456...", "truncate must add ellipsis and still fit under max len");
384    }
385
386    #[test]
387    fn truncate_is_sensible_in_middle_of_multibyte_chars() {
388        let mut s = String::from("æææææææææ");
389        // æ is 2 bytes, so any odd byte length should be in the middle of a character. Truncate
390        // adds 3 bytes for the ellipsis so we actually need an even max length to hit the middle
391        // of a character.
392        truncate_with_ellipsis(&mut s, 8);
393        assert_eq!(s.len(), 7, "may end up shorter than provided max length w/ multi-byte chars");
394        assert_eq!(s, "ææ...", "truncate must remove whole characters and add ellipsis");
395    }
396
397    #[test]
398    fn not_throttled() {
399        let crash_reporter = CrashReporter::new(
400            &fuchsia_inspect::Node::default(),
401            /*proxy=*/ None,
402            CRASH_LOOP_AGE_OUT,
403            /*enable_throttling=*/ true,
404        );
405
406        assert!(
407            crash_reporter
408                .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
409                .is_some()
410        );
411    }
412
413    #[test]
414    fn throttled() {
415        let crash_reporter = CrashReporter::new(
416            &fuchsia_inspect::Node::default(),
417            /*proxy=*/ None,
418            CRASH_LOOP_AGE_OUT,
419            /*enable_throttling=*/ true,
420        );
421
422        for _ in 0..CRASH_LOOP_LIMIT {
423            assert!(
424                crash_reporter
425                    .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
426                    .is_some()
427            );
428        }
429        assert!(
430            crash_reporter
431                .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
432                .is_none()
433        );
434    }
435
436    #[test]
437    fn throttling_ages_out() {
438        let crash_reporter = CrashReporter::new(
439            &fuchsia_inspect::Node::default(),
440            /*proxy=*/ None,
441            CRASH_LOOP_AGE_OUT,
442            /*enable_throttling=*/ true,
443        );
444
445        for _ in 0..CRASH_LOOP_LIMIT {
446            assert!(
447                crash_reporter
448                    .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
449                    .is_some()
450            );
451        }
452        assert!(
453            crash_reporter
454                .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
455                .is_none()
456        );
457        assert!(
458            crash_reporter
459                .report_guard(
460                    vec![],
461                    "test-process".to_string(),
462                    zx::Instant::from_nanos(CRASH_LOOP_AGE_OUT.into_nanos())
463                )
464                .is_some()
465        );
466    }
467
468    #[test]
469    fn reports_some_crashes_while_throttled() {
470        const RUNTIME: zx::MonotonicInstant = zx::Instant::from_nanos(0);
471        let crash_reporter = CrashReporter::new(
472            &fuchsia_inspect::Node::default(),
473            /*proxy=*/ None,
474            CRASH_LOOP_AGE_OUT,
475            /*enable_throttling=*/ true,
476        );
477
478        for _ in 0..CRASH_LOOP_LIMIT {
479            assert!(
480                crash_reporter.report_guard(vec![], "test-process".to_string(), RUNTIME).is_some()
481            );
482        }
483
484        for _ in 0..REPORT_EVERY_X_WHILE_THROTTLED - 1 {
485            assert!(
486                crash_reporter.report_guard(vec![], "test-process".to_string(), RUNTIME).is_none()
487            );
488        }
489
490        assert_eq!(
491            crash_reporter
492                .report_guard(vec![], "test-process".to_string(), RUNTIME)
493                .unwrap()
494                .weight,
495            REPORT_EVERY_X_WHILE_THROTTLED
496        );
497    }
498
499    #[test]
500    fn is_throttled_filters() {
501        let mut crash_info: CrashInfo = Default::default();
502
503        crash_info.crash_runtimes.push_back(zx::MonotonicInstant::from_nanos(0));
504        for _ in 0..CRASH_LOOP_LIMIT {
505            crash_info.crash_runtimes.push_back(zx::MonotonicInstant::from_nanos(50));
506        }
507
508        assert!(
509            crash_info.is_throttled_at(zx::MonotonicInstant::from_nanos(0), CRASH_LOOP_AGE_OUT)
510        );
511        assert!(!crash_info.is_throttled_at(
512            zx::MonotonicInstant::from_nanos(CRASH_LOOP_AGE_OUT.into_nanos()),
513            CRASH_LOOP_AGE_OUT
514        ));
515    }
516
517    #[fuchsia::test]
518    async fn begin_crash_report_throttling_ends() {
519        spawn_kernel_and_run(async |_, current_task| {
520            let crash_reporter = CrashReporter::new(
521                &fuchsia_inspect::Node::default(),
522                /*proxy=*/ None,
523                zx::Duration::from_millis(200),
524                /*enable_throttling=*/ true,
525            );
526
527            for _ in 0..CRASH_LOOP_LIMIT {
528                assert!(crash_reporter.begin_crash_report(current_task).is_some());
529            }
530            assert!(crash_reporter.begin_crash_report(current_task).is_none());
531
532            std::thread::sleep(std::time::Duration::from_millis(250));
533            assert!(crash_reporter.begin_crash_report(current_task).is_some());
534        })
535        .await;
536    }
537}