1use crate::task::{CurrentTask, ExitStatus};
6use fidl_fuchsia_feedback::{
7 Annotation, CrashReport, CrashReporterProxy, MAX_ANNOTATION_VALUE_LENGTH,
8 MAX_CRASH_SIGNATURE_LENGTH, NativeCrashReport, SpecificCrashReport,
9};
10use fuchsia_inspect::{Inspector, Node};
11use futures::FutureExt;
12use starnix_logging::{
13 CATEGORY_STARNIX, CoreDumpInfo, CoreDumpList, TraceScope, log_error, log_info, log_warn,
14 trace_instant,
15};
16use starnix_sync::Mutex;
17use std::collections::{HashMap, VecDeque};
18use std::sync::Arc;
19use zx::{self as zx, AsHandleRef};
20
21pub const CRASH_LOOP_LIMIT: usize = 8;
25
26const REPORT_EVERY_X_WHILE_THROTTLED: u32 = 10;
29
30pub struct CrashReporter {
31 core_dumps: CoreDumpList,
33
34 throttled_core_dumps: Arc<Mutex<HashMap<String, i64>>>,
37
38 crashes_per_process: Arc<Mutex<HashMap<String, CrashInfo>>>,
40
41 proxy: Option<CrashReporterProxy>,
43
44 crash_loop_age_out: zx::MonotonicDuration,
46
47 enable_throttling: bool,
49}
50
51pub struct PendingCrashReport {
52 argv: Vec<String>,
54
55 argv0: String,
57
58 weight: u32,
61}
62
63impl CrashReporter {
64 pub fn new(
65 inspect_node: &Node,
66 proxy: Option<CrashReporterProxy>,
67 crash_loop_age_out: zx::MonotonicDuration,
68 enable_throttling: bool,
69 ) -> Self {
70 let crash_reporter = Self {
71 core_dumps: CoreDumpList::new(inspect_node.create_child("coredumps")),
72 throttled_core_dumps: Arc::new(Mutex::new(Default::default())),
73 crashes_per_process: Arc::new(Mutex::new(Default::default())),
74 proxy,
75 crash_loop_age_out,
76 enable_throttling,
77 };
78
79 crash_reporter.record_throttling_in_inspect(inspect_node);
80 crash_reporter
81 }
82
83 pub fn begin_crash_report(&self, current_task: &CurrentTask) -> Option<PendingCrashReport> {
86 let argv = current_task
87 .read_argv(MAX_ANNOTATION_VALUE_LENGTH as usize)
88 .unwrap_or_else(|_| vec!["<unknown>".into()])
89 .into_iter()
90 .map(|a| a.to_string())
91 .collect::<Vec<_>>();
92 let argv0 = argv.get(0).map(AsRef::as_ref).unwrap_or_else(|| "<unknown>");
93
94 let argv0 = argv0.rsplit_once("/").unwrap_or(("", &argv0)).1.to_string();
96
97 self.report_guard(argv, argv0, zx::MonotonicInstant::get())
98 }
99
100 pub fn handle_core_dump(
102 &self,
103 current_task: &CurrentTask,
104 exit_status: &ExitStatus,
105 pending_crash_report: PendingCrashReport,
106 ) {
107 trace_instant!(CATEGORY_STARNIX, "RecordCoreDump", TraceScope::Process);
108
109 let argv = pending_crash_report.argv;
110 let argv0 = pending_crash_report.argv0;
111 let process_koid = current_task
112 .thread_group()
113 .process
114 .get_koid()
115 .expect("handles for processes with crashing threads are still valid");
116 let thread_koid = current_task
117 .thread
118 .read()
119 .as_ref()
120 .expect("coredumps occur in tasks with associated threads")
121 .get_koid()
122 .expect("handles for crashing threads are still valid");
123 let linux_pid = current_task.thread_group().leader as i64;
124 let thread_name = current_task.command().to_string();
125 let signal = match exit_status {
126 ExitStatus::CoreDump(s) => s.signal,
127 other => {
128 log_error!(
129 "only core dump exit statuses should be handled as core dumps, got {other:?}"
130 );
131 return;
132 }
133 };
134
135 let uptime = zx::MonotonicInstant::get() - current_task.thread_group().start_time;
137
138 let dump_info = CoreDumpInfo {
139 process_koid,
140 thread_koid,
141 linux_pid,
142 uptime: uptime.into_nanos(),
143 argv: argv.clone(),
144 thread_name: thread_name.clone(),
145 signal: signal.to_string(),
146 };
147 self.core_dumps.record_core_dump(dump_info);
148
149 let mut argv_joined = argv.join(" ");
150 truncate_with_ellipsis(&mut argv_joined, MAX_ANNOTATION_VALUE_LENGTH as usize);
151
152 let mut env_joined = current_task
153 .read_env(MAX_ANNOTATION_VALUE_LENGTH as usize)
154 .unwrap_or_else(|_| vec![])
155 .into_iter()
156 .map(|a| a.to_string())
157 .collect::<Vec<_>>()
158 .join(" ");
159 truncate_with_ellipsis(&mut env_joined, MAX_ANNOTATION_VALUE_LENGTH as usize);
160
161 let signal_str = signal.to_string();
162
163 let max_signature_prefix_len = MAX_CRASH_SIGNATURE_LENGTH as usize - (signal_str.len() + 1);
165 let mut crash_signature = argv0.clone();
166 truncate_with_ellipsis(&mut crash_signature, max_signature_prefix_len);
167 crash_signature.push(' ');
168 crash_signature.push_str(&signal_str);
169
170 let crash_report = CrashReport {
171 crash_signature: Some(crash_signature),
172 program_name: Some(argv0.clone()),
173 program_uptime: Some(uptime.into_nanos()),
174 specific_report: Some(SpecificCrashReport::Native(NativeCrashReport {
175 process_koid: Some(process_koid.raw_koid()),
176 process_name: Some(argv0),
177 thread_koid: Some(thread_koid.raw_koid()),
178 thread_name: Some(thread_name),
179 ..Default::default()
180 })),
181 annotations: Some(vec![
182 Annotation { key: "linux.pid".to_string(), value: linux_pid.to_string() },
186 Annotation { key: "linux.argv".to_string(), value: argv_joined },
187 Annotation { key: "linux.env".to_string(), value: env_joined },
188 Annotation { key: "linux.signal".to_string(), value: signal_str },
189 ]),
190 is_fatal: Some(true),
191 weight: Some(pending_crash_report.weight),
192 ..Default::default()
193 };
194
195 if let Some(reporter) = &self.proxy {
196 let reporter = reporter.clone();
197 current_task.kernel().kthreads.spawn_future(async move || {
199 match reporter.file_report(crash_report).await {
200 Ok(Ok(_)) => (),
201 Ok(Err(filing_error)) => {
202 log_error!(filing_error:?; "Couldn't file crash report.");
203 }
204 Err(fidl_error) => log_warn!(
205 fidl_error:?;
206 "Couldn't file crash report due to error on underlying channel."
207 ),
208 };
209 });
210 } else {
211 log_info!(crash_report:?; "no crash reporter available for crash");
212 }
213 }
214
215 fn report_guard(
222 &self,
223 argv: Vec<String>,
224 argv0: String,
225 runtime: zx::MonotonicInstant,
226 ) -> Option<PendingCrashReport> {
227 if !self.enable_throttling {
228 return Some(PendingCrashReport { argv, argv0, weight: 1 });
229 }
230
231 let mut crashes_per_process = self.crashes_per_process.lock();
233 let crash_info = crashes_per_process.entry(argv0.clone()).or_default();
234 crash_info.crash_runtimes.push_back(runtime);
235
236 crash_info.prune_crash_runtimes(runtime, self.crash_loop_age_out);
237
238 crash_info.num_crashes_while_throttled += 1;
241
242 if crash_info.is_throttled_at(runtime, self.crash_loop_age_out)
244 && (crash_info.num_crashes_while_throttled < REPORT_EVERY_X_WHILE_THROTTLED)
245 {
246 log_info!(
247 "Process '{argv0}' is throttled due to suspected crash loop, will fold report into later crash"
248 );
249 *self.throttled_core_dumps.lock().entry(argv0).or_default() += 1;
250 return None;
251 }
252
253 let weight = crash_info.num_crashes_while_throttled;
254 crash_info.num_crashes_while_throttled = 0;
255
256 Some(PendingCrashReport { argv, argv0, weight })
257 }
258
259 fn record_throttling_in_inspect(&self, inspect_node: &Node) {
260 let throttled_core_dumps = self.throttled_core_dumps.clone();
261 let crashes_per_process = self.crashes_per_process.clone();
262 let crash_loop_age_out = self.crash_loop_age_out;
263
264 inspect_node.record_lazy_child("coredumps_throttled", move || {
265 let throttled_core_dumps = throttled_core_dumps.clone();
266 let crashes_per_process = crashes_per_process.clone();
267
268 async move {
269 let inspector = Inspector::default();
270 let mut crashes_per_process = crashes_per_process.lock();
271 let runtime = zx::MonotonicInstant::get();
272
273 for (process, count) in throttled_core_dumps.lock().iter() {
274 let Some(crash_info) = crashes_per_process.get_mut(process) else {
275 continue;
276 };
277
278 crash_info.prune_crash_runtimes(runtime, crash_loop_age_out);
279
280 let process_node = inspector.root().create_child(process);
281 process_node.record_bool(
282 "currently_throttled",
283 crash_info.is_throttled_at(runtime, crash_loop_age_out),
284 );
285 process_node.record_int("total_throttled_crashes", *count);
286 if let Some(end) = crash_info.throttling_end(crash_loop_age_out) {
287 process_node.record_int("throttling_runtime_end_millis", end.into_millis());
288 }
289
290 inspector.root().record(process_node);
291 }
292 Ok(inspector)
293 }
294 .boxed()
295 });
296 }
297}
298
299#[derive(Default)]
300struct CrashInfo {
301 num_crashes_while_throttled: u32,
304
305 crash_runtimes: VecDeque<zx::MonotonicInstant>,
308}
309
310impl CrashInfo {
311 fn is_throttled_at(
313 &self,
314 runtime: zx::MonotonicInstant,
315 crash_loop_age_out: zx::MonotonicDuration,
316 ) -> bool {
317 self.crash_runtimes.iter().filter(|&&x| (runtime - x) < crash_loop_age_out).count()
318 > CRASH_LOOP_LIMIT
319 }
320
321 fn throttling_end(
323 &self,
324 crash_loop_age_out: zx::MonotonicDuration,
325 ) -> Option<zx::MonotonicDuration> {
326 let throttling_end = self.crash_runtimes.iter().nth_back(CRASH_LOOP_LIMIT - 1)?;
327 Some(crash_loop_age_out + zx::Duration::from_nanos(throttling_end.into_nanos()))
328 }
329
330 fn prune_crash_runtimes(
332 &mut self,
333 runtime: zx::MonotonicInstant,
334 crash_loop_age_out: zx::MonotonicDuration,
335 ) {
336 self.crash_runtimes.retain(|&x| (runtime - x) < crash_loop_age_out);
337 }
338}
339
340fn truncate_with_ellipsis(s: &mut String, max_len: usize) {
341 if s.len() <= max_len {
342 return;
343 }
344
345 let max_content_len = max_len - 3;
347
348 let mut new_len = 0;
351 let mut iter = s.char_indices();
352 while let Some((offset, _)) = iter.next() {
353 if offset > max_content_len {
354 break;
355 }
356 new_len = offset;
357 }
358
359 s.truncate(new_len);
360 s.push_str("...");
361}
362
363#[cfg(test)]
364mod tests {
365 use super::*;
366 use crate::testing::spawn_kernel_and_run;
367
368 const CRASH_LOOP_AGE_OUT: zx::MonotonicDuration = zx::Duration::from_minutes(8);
369
370 #[test]
371 fn truncate_noop_on_max_length_string() {
372 let mut s = String::from("1234567890");
373 let before = s.clone();
374 truncate_with_ellipsis(&mut s, 10);
375 assert_eq!(s, before);
376 }
377
378 #[test]
379 fn truncate_adds_ellipsis() {
380 let mut s = String::from("1234567890");
381 truncate_with_ellipsis(&mut s, 9);
382 assert_eq!(s.len(), 9);
383 assert_eq!(s, "123456...", "truncate must add ellipsis and still fit under max len");
384 }
385
386 #[test]
387 fn truncate_is_sensible_in_middle_of_multibyte_chars() {
388 let mut s = String::from("æææææææææ");
389 truncate_with_ellipsis(&mut s, 8);
393 assert_eq!(s.len(), 7, "may end up shorter than provided max length w/ multi-byte chars");
394 assert_eq!(s, "ææ...", "truncate must remove whole characters and add ellipsis");
395 }
396
397 #[test]
398 fn not_throttled() {
399 let crash_reporter = CrashReporter::new(
400 &fuchsia_inspect::Node::default(),
401 None,
402 CRASH_LOOP_AGE_OUT,
403 true,
404 );
405
406 assert!(
407 crash_reporter
408 .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
409 .is_some()
410 );
411 }
412
413 #[test]
414 fn throttled() {
415 let crash_reporter = CrashReporter::new(
416 &fuchsia_inspect::Node::default(),
417 None,
418 CRASH_LOOP_AGE_OUT,
419 true,
420 );
421
422 for _ in 0..CRASH_LOOP_LIMIT {
423 assert!(
424 crash_reporter
425 .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
426 .is_some()
427 );
428 }
429 assert!(
430 crash_reporter
431 .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
432 .is_none()
433 );
434 }
435
436 #[test]
437 fn throttling_ages_out() {
438 let crash_reporter = CrashReporter::new(
439 &fuchsia_inspect::Node::default(),
440 None,
441 CRASH_LOOP_AGE_OUT,
442 true,
443 );
444
445 for _ in 0..CRASH_LOOP_LIMIT {
446 assert!(
447 crash_reporter
448 .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
449 .is_some()
450 );
451 }
452 assert!(
453 crash_reporter
454 .report_guard(vec![], "test-process".to_string(), zx::Instant::from_nanos(0))
455 .is_none()
456 );
457 assert!(
458 crash_reporter
459 .report_guard(
460 vec![],
461 "test-process".to_string(),
462 zx::Instant::from_nanos(CRASH_LOOP_AGE_OUT.into_nanos())
463 )
464 .is_some()
465 );
466 }
467
468 #[test]
469 fn reports_some_crashes_while_throttled() {
470 const RUNTIME: zx::MonotonicInstant = zx::Instant::from_nanos(0);
471 let crash_reporter = CrashReporter::new(
472 &fuchsia_inspect::Node::default(),
473 None,
474 CRASH_LOOP_AGE_OUT,
475 true,
476 );
477
478 for _ in 0..CRASH_LOOP_LIMIT {
479 assert!(
480 crash_reporter.report_guard(vec![], "test-process".to_string(), RUNTIME).is_some()
481 );
482 }
483
484 for _ in 0..REPORT_EVERY_X_WHILE_THROTTLED - 1 {
485 assert!(
486 crash_reporter.report_guard(vec![], "test-process".to_string(), RUNTIME).is_none()
487 );
488 }
489
490 assert_eq!(
491 crash_reporter
492 .report_guard(vec![], "test-process".to_string(), RUNTIME)
493 .unwrap()
494 .weight,
495 REPORT_EVERY_X_WHILE_THROTTLED
496 );
497 }
498
499 #[test]
500 fn is_throttled_filters() {
501 let mut crash_info: CrashInfo = Default::default();
502
503 crash_info.crash_runtimes.push_back(zx::MonotonicInstant::from_nanos(0));
504 for _ in 0..CRASH_LOOP_LIMIT {
505 crash_info.crash_runtimes.push_back(zx::MonotonicInstant::from_nanos(50));
506 }
507
508 assert!(
509 crash_info.is_throttled_at(zx::MonotonicInstant::from_nanos(0), CRASH_LOOP_AGE_OUT)
510 );
511 assert!(!crash_info.is_throttled_at(
512 zx::MonotonicInstant::from_nanos(CRASH_LOOP_AGE_OUT.into_nanos()),
513 CRASH_LOOP_AGE_OUT
514 ));
515 }
516
517 #[fuchsia::test]
518 async fn begin_crash_report_throttling_ends() {
519 spawn_kernel_and_run(async |_, current_task| {
520 let crash_reporter = CrashReporter::new(
521 &fuchsia_inspect::Node::default(),
522 None,
523 zx::Duration::from_millis(200),
524 true,
525 );
526
527 for _ in 0..CRASH_LOOP_LIMIT {
528 assert!(crash_reporter.begin_crash_report(current_task).is_some());
529 }
530 assert!(crash_reporter.begin_crash_report(current_task).is_none());
531
532 std::thread::sleep(std::time::Duration::from_millis(250));
533 assert!(crash_reporter.begin_crash_report(current_task).is_some());
534 })
535 .await;
536 }
537}