detect/
lib.rs

1// Copyright 2020 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! `triage-detect` is responsible for auto-triggering crash reports in Fuchsia.
6
7mod delay_tracker;
8mod diagnostics;
9mod snapshot;
10mod test_invoker;
11mod triage_shim;
12
13use anyhow::{bail, Context, Error};
14use argh::FromArgs;
15use delay_tracker::DelayTracker;
16use fidl_fuchsia_diagnostics_test::DetectControllerRequestStream;
17use fidl_fuchsia_feedback::MAX_CRASH_SIGNATURE_LENGTH;
18use fuchsia_async as fasync;
19use fuchsia_async::Task;
20use fuchsia_component::server::{ServiceFs, ServiceObj};
21use fuchsia_inspect::health::Reporter;
22use fuchsia_inspect::{self as inspect, NumericProperty};
23use fuchsia_inspect_contrib::nodes::BoundedListNode;
24use fuchsia_inspect_derive::{Inspect, WithInspect};
25use fuchsia_triage::{inspect_logger, SnapshotTrigger};
26use futures::lock::Mutex;
27use futures::{join, StreamExt};
28use glob::glob;
29use injectable_time::MonotonicInstant;
30use log::{error, info, warn};
31use serde_derive::Deserialize;
32use snapshot::SnapshotRequest;
33use std::collections::HashMap;
34use std::sync::Arc;
35use triage_detect_config::Config as ComponentConfig;
36
37const MINIMUM_CHECK_TIME_NANOS: i64 = 60 * 1_000_000_000;
38const CONFIG_GLOB: &str = "/config/data/*.triage";
39const PROGRAM_CONFIG_PATH: &str = "/config/data/config.json";
40const SIGNATURE_PREFIX: &str = "fuchsia-detect-";
41// Minimum time between sending any particular signature (except in integration test mode)
42const MINIMUM_SIGNATURE_INTERVAL_NANOS: i64 = 3600 * 1_000_000_000;
43// The max number of warnings to store (for example, "Snapshot trigger was not boolean"...)
44const WARNING_LIST_MAX_SIZE: usize = 100;
45
46/// The name of the subcommand and the logs-tag.
47pub const PROGRAM_NAME: &str = "detect";
48
49/// Command line args
50#[derive(FromArgs, Debug, PartialEq)]
51#[argh(subcommand, name = "detect")]
52pub struct CommandLine {}
53
54/// Mode::IntegrationTest is for integration testing. It's unrelated to test_invoker, which is
55/// used for performance testing. test_invoker will typically be used in Production mode.
56#[derive(PartialEq, Debug, Clone, Copy)]
57pub(crate) enum Mode {
58    IntegrationTest,
59    Production,
60}
61
62#[derive(Deserialize, Default, Debug)]
63struct ProgramConfig {
64    enable_filing: Option<bool>,
65}
66
67fn load_program_config() -> Result<ProgramConfig, Error> {
68    let config_text = match std::fs::read_to_string(PROGRAM_CONFIG_PATH) {
69        Ok(text) => text,
70        Err(_) => {
71            info!("Program config file not found; using default config.");
72            "{}".to_string()
73        }
74    };
75    match serde_json5::from_str::<ProgramConfig>(&config_text) {
76        Ok(config) => Ok(config),
77        Err(e) => bail!("Program config format error: {}", e),
78    }
79}
80
81fn load_configuration_files() -> Result<HashMap<String, String>, Error> {
82    fn file_stem(file_path: &std::path::PathBuf) -> Result<String, Error> {
83        if let Some(s) = file_path.file_stem() {
84            if let Some(s) = s.to_str() {
85                return Ok(s.to_owned());
86            }
87        }
88        bail!("Bad path {:?} - can't find file_stem", file_path)
89    }
90
91    let mut file_contents = HashMap::new();
92    for file_path in glob(CONFIG_GLOB)? {
93        let file_path = file_path?;
94        let stem = file_stem(&file_path)?;
95        let config_text = std::fs::read_to_string(file_path)?;
96        file_contents.insert(stem, config_text);
97    }
98    Ok(file_contents)
99}
100
101/// appropriate_check_interval determines the interval to check diagnostics, or signals error.
102///
103/// If the command line can't be evaluated to an integer, an error is returned.
104/// If the integer is below minimum and mode isn't IntegrationTest, an error is returned.
105/// If a valid integer is determined, it is returned as a zx::MonotonicDuration.
106fn appropriate_check_interval(
107    expression: &str,
108    mode: Mode,
109) -> Result<zx::MonotonicDuration, Error> {
110    let check_every = triage_shim::evaluate_int_math(expression)
111        .or_else(|e| bail!("Check_every argument must be Minutes(n), Hours(n), etc. but: {}", e))?;
112    if check_every < MINIMUM_CHECK_TIME_NANOS && mode != Mode::IntegrationTest {
113        bail!(
114            "Minimum time to check is {} seconds; {} nanos is too small",
115            MINIMUM_CHECK_TIME_NANOS / 1_000_000_000,
116            check_every
117        );
118    }
119    info!(
120        "Checking every {} seconds from command line '{:?}'",
121        check_every / 1_000_000_000,
122        expression
123    );
124    Ok(zx::MonotonicDuration::from_nanos(check_every))
125}
126
127fn build_signature(snapshot: SnapshotTrigger, mode: Mode) -> String {
128    // Character and length restrictions are documented in
129    // https://fuchsia.dev/reference/fidl/fuchsia.feedback#CrashReport
130    let sanitized: String = snapshot
131        .signature
132        .chars()
133        .map(|char| match char {
134            c if char.is_ascii_lowercase() => c,
135            c if char.is_ascii_uppercase() => c.to_ascii_lowercase(),
136            _ => '-',
137        })
138        .collect();
139    if sanitized != snapshot.signature {
140        let message = format!("Signature {} was sanitized to {}", snapshot.signature, sanitized);
141        if mode == Mode::IntegrationTest {
142            warn!("{}", message);
143        } else {
144            error!("{}", message);
145        }
146    }
147    let mut signature = format!("{}{}", SIGNATURE_PREFIX, sanitized);
148    if signature.len() > fidl_fuchsia_feedback::MAX_CRASH_SIGNATURE_LENGTH as usize {
149        let new_signature =
150            signature.chars().take(MAX_CRASH_SIGNATURE_LENGTH as usize).collect::<String>();
151        let message = format!("Signature '{}' truncated to '{}'", signature, new_signature);
152        if mode == Mode::IntegrationTest {
153            warn!("{}", message);
154        } else {
155            error!("{}", message);
156        }
157        signature = new_signature;
158    }
159    signature
160}
161
162#[derive(Inspect, Default)]
163struct Stats {
164    scan_count: inspect::UintProperty,
165    scan_test_count: inspect::UintProperty,
166    missed_deadlines: inspect::UintProperty,
167    triage_warnings: inspect::UintProperty,
168    issues_detected: inspect::UintProperty,
169    issues_throttled: inspect::UintProperty,
170    issues_send_count: inspect::UintProperty,
171    issues_send_errors: inspect::UintProperty,
172    inspect_node: fuchsia_inspect::Node,
173}
174
175// Detect serves a test-invoker protocol for performance testing.
176enum IncomingService {
177    DetectController(DetectControllerRequestStream),
178}
179
180pub async fn main() -> Result<(), Error> {
181    let inspector = inspect::component::inspector();
182    let _inspect_server_task =
183        inspect_runtime::publish(inspector, inspect_runtime::PublishOptions::default());
184
185    let component_config = ComponentConfig::take_from_startup_handle();
186    inspector
187        .root()
188        .record_child("config", |config_node| component_config.record_inspect(config_node));
189
190    inspect_logger::set_log_list_node(BoundedListNode::new(
191        inspector.root().create_child("triage_warnings"),
192        WARNING_LIST_MAX_SIZE,
193    ));
194
195    let stats = Stats::default().with_inspect(inspector.root(), "stats")?;
196    let mode = match component_config.test_only {
197        true => Mode::IntegrationTest,
198        false => Mode::Production,
199    };
200    let check_every = appropriate_check_interval(&component_config.check_every, mode)
201        .context("Invalid command line arg for check time")?;
202    let program_config = load_program_config().context("Error loading program config")?;
203    info!("Test mode: {:?}, program config: {:?}", mode, program_config);
204    let configuration = load_configuration_files().context("Error reading configuration files")?;
205
206    let triage_engine = triage_shim::TriageLib::new(configuration)
207        .context("Failed to parse Detect configuration files")?;
208    info!("Loaded and parsed .triage files");
209
210    let selectors = triage_engine.selectors();
211    let diagnostic_source = diagnostics::DiagnosticFetcher::create(selectors)?;
212    let snapshot_service =
213        snapshot::CrashReportHandlerBuilder::new(MonotonicInstant::new()).build().await?;
214    let system_time = MonotonicInstant::new();
215    let delay_tracker = DelayTracker::new(Arc::new(system_time), mode);
216    let detection_runner = DetectionRunner {
217        stats,
218        mode,
219        diagnostic_source,
220        snapshot_service,
221        delay_tracker,
222        program_config,
223        triage_engine,
224    };
225    // The test_invoker system may ask us to do stuff. We shouldn't do any main-loop stuff while
226    // the test stuff is going on.
227    let detection_runner = Arc::new(Mutex::new(detection_runner));
228    let mut service_fs: ServiceFs<ServiceObj<'_, IncomingService>> = ServiceFs::new();
229    service_fs.dir("svc").add_fidl_service(IncomingService::DetectController);
230    service_fs.take_and_serve_directory_handle()?;
231    const MAX_CONCURRENT: usize = 10_000;
232    let runner_clone = detection_runner.clone();
233    let test_invoker_task = Task::local(service_fs.for_each_concurrent(
234        MAX_CONCURRENT,
235        move |IncomingService::DetectController(stream)| {
236            test_invoker::run_test_service(stream, runner_clone.clone())
237        },
238    ));
239    inspect::component::health().set_ok();
240
241    let main_loop_fut = main_loop(detection_runner, check_every);
242
243    join!(main_loop_fut, test_invoker_task);
244
245    Ok(())
246}
247
248async fn main_loop(
249    detection_runner: Arc<Mutex<impl RunsDetection>>,
250    check_every: zx::MonotonicDuration,
251) {
252    // Start the first scan as soon as the program starts, via the "missed deadline" logic below.
253    let mut next_check_time = fasync::MonotonicInstant::INFINITE_PAST;
254    loop {
255        if next_check_time < fasync::MonotonicInstant::now() {
256            // We missed a deadline, so don't wait at all; start the check. But first
257            // schedule the next check time at now() + check_every.
258            if next_check_time != fasync::MonotonicInstant::INFINITE_PAST {
259                (*detection_runner.lock().await).stats().missed_deadlines.add(1);
260                warn!(
261                    "Missed diagnostic check deadline {:?} by {:?} nanos",
262                    next_check_time,
263                    fasync::MonotonicInstant::now() - next_check_time
264                );
265            }
266            next_check_time = fasync::MonotonicInstant::now() + check_every;
267        } else {
268            // Wait until time for the next check.
269            fasync::Timer::new(next_check_time).await;
270            // Now it should be approximately next_check_time o'clock. To avoid drift from
271            // delays, calculate the next check time by adding check_every to the current
272            // next_check_time.
273            next_check_time += check_every;
274        }
275        detection_runner.lock().await.run_detection(DetectionOpts::default()).await;
276    }
277}
278
279pub(crate) trait RunsDetection {
280    //async fn run_detection(&mut self, opts: DetectionOpts);
281    fn run_detection(
282        &mut self,
283        opts: DetectionOpts,
284    ) -> impl std::future::Future<Output = ()> + std::marker::Send;
285
286    fn stats(&self) -> &Stats;
287}
288
289// If we're in test mode, don't update the stats and don't actually file snapshots.
290struct DetectionRunner {
291    stats: Stats,
292    mode: Mode,
293    diagnostic_source: diagnostics::DiagnosticFetcher,
294    snapshot_service: snapshot::CrashReportHandler,
295    delay_tracker: DelayTracker,
296    program_config: ProgramConfig,
297    triage_engine: triage_shim::TriageLib,
298}
299
300#[derive(Default)]
301pub(crate) struct DetectionOpts {
302    cpu_test: bool,
303}
304
305impl RunsDetection for DetectionRunner {
306    fn stats(&self) -> &Stats {
307        &self.stats
308    }
309
310    //async fn run_detection(&mut self, opts: DetectionOpts) {
311    async fn run_detection(&mut self, opts: DetectionOpts) {
312        if opts.cpu_test {
313            self.stats().scan_test_count.add(1);
314        } else {
315            self.stats().scan_count.add(1);
316        }
317        let diagnostics = self.diagnostic_source.get_diagnostics().await;
318        let diagnostics = match diagnostics {
319            Ok(diagnostics) => diagnostics,
320            Err(e) => {
321                // This happens when the integration tester runs out of Inspect data.
322                if self.mode != Mode::IntegrationTest {
323                    error!("Fetching diagnostics failed: {}", e);
324                }
325                return;
326            }
327        };
328
329        let (snapshot_requests, warnings) = self.triage_engine.evaluate(diagnostics);
330
331        if opts.cpu_test {
332            return;
333        }
334        self.stats().triage_warnings.add(warnings.len() as u64);
335        for snapshot in snapshot_requests {
336            self.handle_snapshot(snapshot);
337        }
338    }
339}
340
341impl DetectionRunner {
342    fn handle_snapshot(&mut self, snapshot: SnapshotTrigger) {
343        self.stats().issues_detected.add(1);
344        if self.delay_tracker.ok_to_send(&snapshot) {
345            let signature = build_signature(snapshot, self.mode);
346            if self.program_config.enable_filing == Some(true) {
347                self.stats.issues_send_count.add(1);
348                if let Err(e) =
349                    self.snapshot_service.request_snapshot(SnapshotRequest::new(signature))
350                {
351                    self.stats.issues_send_errors.add(1);
352                    error!("Snapshot request failed: {}", e);
353                }
354            } else {
355                warn!("Detect would have filed {}", signature);
356            }
357        } else {
358            self.stats().issues_throttled.add(1);
359        }
360    }
361}
362
363#[cfg(test)]
364mod test {
365    use super::*;
366
367    #[fuchsia::test]
368    fn verify_appropriate_check_interval() -> Result<(), Error> {
369        let error_a = "a".to_string();
370        let error_empty = "".to_string();
371        let raw_1 = "1".to_string();
372        let raw_1_result = zx::MonotonicDuration::from_nanos(1);
373        let short_time = format!("Nanos({})", MINIMUM_CHECK_TIME_NANOS - 1);
374        let short_time_result = zx::MonotonicDuration::from_nanos(MINIMUM_CHECK_TIME_NANOS - 1);
375        let minimum_time = format!("Nanos({})", MINIMUM_CHECK_TIME_NANOS);
376        let minimum_time_result = zx::MonotonicDuration::from_nanos(MINIMUM_CHECK_TIME_NANOS);
377        let long_time = format!("Nanos({})", MINIMUM_CHECK_TIME_NANOS + 1);
378        let long_time_result = zx::MonotonicDuration::from_nanos(MINIMUM_CHECK_TIME_NANOS + 1);
379
380        assert!(appropriate_check_interval(&error_a, Mode::IntegrationTest).is_err());
381        assert!(appropriate_check_interval(&error_empty, Mode::IntegrationTest).is_err());
382        assert_eq!(appropriate_check_interval(&raw_1, Mode::IntegrationTest)?, raw_1_result);
383        assert_eq!(
384            appropriate_check_interval(&short_time, Mode::IntegrationTest)?,
385            short_time_result
386        );
387        assert_eq!(
388            appropriate_check_interval(&minimum_time, Mode::IntegrationTest)?,
389            minimum_time_result
390        );
391        assert_eq!(
392            appropriate_check_interval(&long_time, Mode::IntegrationTest)?,
393            long_time_result
394        );
395
396        assert!(appropriate_check_interval(&error_a, Mode::Production).is_err());
397        assert!(appropriate_check_interval(&error_empty, Mode::Production).is_err());
398        assert!(appropriate_check_interval(&raw_1, Mode::Production).is_err());
399        assert!(appropriate_check_interval(&short_time, Mode::Production).is_err());
400        assert_eq!(
401            appropriate_check_interval(&minimum_time, Mode::Production)?,
402            minimum_time_result
403        );
404        assert_eq!(
405            appropriate_check_interval(&long_time, Mode::IntegrationTest)?,
406            long_time_result
407        );
408        assert_eq!(appropriate_check_interval(&long_time, Mode::Production)?, long_time_result);
409        Ok(())
410    }
411
412    #[fuchsia::test]
413    fn verify_build_signature() {
414        fn sig(signature: &str) -> fuchsia_triage::SnapshotTrigger {
415            fuchsia_triage::SnapshotTrigger { interval: 0, signature: signature.to_string() }
416        }
417
418        let long_sig_input = "A very very long string that just keeps going and going and won't \
419          quit no matter how long it goes and I don't know when it will ever stop";
420        let desired_long_output = "fuchsia-detect-a-very-very-long-string-that-just-keeps-going-and-going-and-won-t-quit-no-matter-how-long-it-goes-and-i-don-t-kno";
421        let long_sig_output = build_signature(sig(long_sig_input), Mode::Production);
422
423        assert_eq!(long_sig_output, desired_long_output.to_string());
424        assert_eq!(long_sig_output.len(), MAX_CRASH_SIGNATURE_LENGTH as usize);
425        assert_eq!(
426            build_signature(sig("Test lowercase"), Mode::Production),
427            "fuchsia-detect-test-lowercase".to_string()
428        );
429    }
430}