stack_migration/
main.rs

// Copyright 2023 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5mod rollback;
6
7use std::pin::{Pin, pin};
8use std::time::Duration;
9
10use cobalt_client::traits::AsEventCode as _;
11use fuchsia_async::Task;
12use fuchsia_component::server::{ServiceFs, ServiceFsDir};
13use fuchsia_inspect::Property as _;
14use futures::channel::mpsc;
15use futures::{FutureExt as _, Stream, StreamExt as _};
16use log::{error, info, warn};
17use networking_metrics_registry::networking_metrics_registry as metrics_registry;
18use {
19    fidl_fuchsia_metrics as fmetrics, fidl_fuchsia_net_stackmigrationdeprecated as fnet_migration,
20    fidl_fuchsia_power_internal as fpower,
21};
22
/// The Netstack version to use when neither a user setting nor an automated
/// setting has been persisted.
const DEFAULT_NETSTACK: NetstackVersion = NetstackVersion::Netstack3;

/// The number of times a device is allowed to fail to migrate to Netstack3,
/// before abandoning migration attempts during the `CURRENT_EPOCH`.
const MAX_ROLLBACKS_PER_EPOCH: u8 = 3;

/// The "epoch" is an arbitrary interval used to limit the number of attempts a
/// device will make to migrate to Netstack3. Once the epoch increases, the
/// device will resume attempts to migrate to Netstack3.
///
/// Pragmatically the intention is to increment this number every time a Fuchsia
/// Release is cut that contains a known fix to a Netstack3 bug.
const CURRENT_EPOCH: u32 = 1;

/// The number of failed healthchecks at which we begin generating crash
/// reports.
const HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS: usize = 2;
40
/// The Netstack implementations this service can select between.
///
/// Serializable because it is stored inside [`Persisted`].
#[derive(Debug, Clone, Copy, serde::Deserialize, serde::Serialize, Eq, PartialEq)]
enum NetstackVersion {
    /// Netstack2; published to Inspect as `2`.
    Netstack2,
    /// Netstack3; published to Inspect as `3`.
    Netstack3,
}
46
47impl NetstackVersion {
48    fn inspect_uint_value(&self) -> u64 {
49        match self {
50            Self::Netstack2 => 2,
51            Self::Netstack3 => 3,
52        }
53    }
54
55    fn optional_inspect_uint_value(o: &Option<Self>) -> u64 {
56        o.as_ref().map(Self::inspect_uint_value).unwrap_or(0)
57    }
58}
59
60impl From<fnet_migration::NetstackVersion> for NetstackVersion {
61    fn from(value: fnet_migration::NetstackVersion) -> Self {
62        match value {
63            fnet_migration::NetstackVersion::Netstack2 => NetstackVersion::Netstack2,
64            fnet_migration::NetstackVersion::Netstack3 => NetstackVersion::Netstack3,
65        }
66    }
67}
68
69impl From<NetstackVersion> for fnet_migration::NetstackVersion {
70    fn from(value: NetstackVersion) -> Self {
71        match value {
72            NetstackVersion::Netstack2 => fnet_migration::NetstackVersion::Netstack2,
73            NetstackVersion::Netstack3 => fnet_migration::NetstackVersion::Netstack3,
74        }
75    }
76}
77
78impl From<NetstackVersion> for Box<fnet_migration::VersionSetting> {
79    fn from(value: NetstackVersion) -> Self {
80        Box::new(fnet_migration::VersionSetting { version: value.into() })
81    }
82}
83
/// The Netstack selection for a boot, including whether Netstack2 was forced
/// because of the persisted rollback state.
#[derive(Debug, PartialEq)]
enum RollbackNetstackVersion {
    /// Netstack2 was selected without any forcing.
    Netstack2,
    // The automated setting requested Netstack3, but the persisted state
    // indicates that the previous boot failed to migrate to Netstack3 (had too
    // many health check failures). Forcibly use Netstack2 for this boot.
    ForceNetstack2,
    // The automated setting requested Netstack3, but we've already failed too
    // many migration attempts in the `CURRENT_EPOCH`. Forcibly use Netstack2
    // until the next epoch.
    ForceNetstack2Indefinitely,
    /// Netstack3 was selected.
    Netstack3,
}
97
98impl RollbackNetstackVersion {
99    // Convert into a `NetstackVersion`, while honoring the forced setting.
100    fn version(&self) -> NetstackVersion {
101        match self {
102            Self::Netstack2 | Self::ForceNetstack2 | Self::ForceNetstack2Indefinitely => {
103                NetstackVersion::Netstack2
104            }
105            Self::Netstack3 => NetstackVersion::Netstack3,
106        }
107    }
108
109    // Convert into a `NetstackVersion`, while ignoring the forced setting.
110    fn version_ignoring_force(&self) -> NetstackVersion {
111        match self {
112            Self::Netstack2 => NetstackVersion::Netstack2,
113            Self::Netstack3 | Self::ForceNetstack2 | Self::ForceNetstack2Indefinitely => {
114                NetstackVersion::Netstack3
115            }
116        }
117    }
118}
119
120impl From<NetstackVersion> for RollbackNetstackVersion {
121    fn from(version: NetstackVersion) -> Self {
122        match version {
123            NetstackVersion::Netstack2 => RollbackNetstackVersion::Netstack2,
124            NetstackVersion::Netstack3 => RollbackNetstackVersion::Netstack3,
125        }
126    }
127}
128
/// The state kept across boots, serialized as JSON.
///
/// Every field is optional; `Persisted::default()` (all `None`) is used when
/// nothing could be loaded.
#[derive(Default, Debug, serde::Deserialize, serde::Serialize)]
#[cfg_attr(test, derive(Eq, PartialEq))]
struct Persisted {
    /// The version requested through `SetAutomatedNetstackVersion`, if any.
    automated: Option<NetstackVersion>,
    /// The version requested through `SetUserNetstackVersion`, if any. Takes
    /// precedence over `automated`.
    user: Option<NetstackVersion>,
    /// The rollback state: either success or a count of failed healthchecks.
    rollback: Option<rollback::Persisted>,
    /// How many rollbacks have been recorded, and for which epoch.
    rollback_history: Option<RollbackHistory>,
}
137
138impl Persisted {
139    fn load<R: std::io::Read>(r: R) -> Self {
140        serde_json::from_reader(std::io::BufReader::new(r)).unwrap_or_else(|e| {
141            error!("error loading persisted config {e:?}, using defaults");
142            Persisted::default()
143        })
144    }
145
146    fn save<W: std::io::Write>(&self, w: W) {
147        serde_json::to_writer(w, self).unwrap_or_else(|e: serde_json::Error| {
148            error!("error persisting configuration {self:?}: {e:?}")
149        })
150    }
151
152    // Determine the desired NetstackVersion based on the persisted values
153    fn desired_netstack_version(&self) -> RollbackNetstackVersion {
154        match self {
155            // Always prefer the user setting, if present.
156            Persisted { user: Some(user), automated: _, rollback: _, rollback_history: _ } => {
157                (*user).into()
158            }
159            // If the automated setting is Netstack2, honor it unconditionally.
160            Persisted {
161                user: None,
162                automated: Some(NetstackVersion::Netstack2),
163                rollback: _,
164                rollback_history: _,
165            } => RollbackNetstackVersion::Netstack2,
166            // If the automated setting is Netstack3, we need to first check the
167            // rollback state.
168            Persisted {
169                user: None,
170                automated: Some(NetstackVersion::Netstack3),
171                rollback,
172                rollback_history,
173            } => {
174                let rollback_count = match rollback_history {
175                    None => 0,
176                    Some(RollbackHistory { epoch, count }) => {
177                        if *epoch == CURRENT_EPOCH {
178                            *count
179                        } else {
180                            // Ignore failed rollbacks in an different epoch.
181                            0
182                        }
183                    }
184                };
185                let healthcheck_failure_count = match rollback {
186                    None => 0,
187                    Some(rollback::Persisted::Success) => 0,
188                    Some(rollback::Persisted::HealthcheckFailures(count)) => *count,
189                };
190
191                if rollback_count >= MAX_ROLLBACKS_PER_EPOCH {
192                    // If we've failed all of our allotted migration attempts
193                    // abandon further migration attempts in this epoch.
194                    RollbackNetstackVersion::ForceNetstack2Indefinitely
195                } else if healthcheck_failure_count >= rollback::MAX_FAILED_HEALTHCHECKS {
196                    // If we've failed all the healthchecks in the current
197                    // migration attempt, temporarily rollback to Netstack2.
198                    RollbackNetstackVersion::ForceNetstack2
199                } else {
200                    // Otherwise, proceed as normal with Netstack3.
201                    RollbackNetstackVersion::Netstack3
202                }
203            }
204            // Use the default version if nothing is set.
205            Persisted { user: None, automated: None, rollback: _, rollback_history: _ } => {
206                DEFAULT_NETSTACK.into()
207            }
208        }
209    }
210}
211
/// A history of rollbacks that have occurred on the device.
///
/// The count is only meaningful for the recorded `epoch`: a history whose
/// epoch differs from `CURRENT_EPOCH` is treated as having no rollbacks.
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub(crate) struct RollbackHistory {
    /// The epoch during which the rollbacks occurred.
    epoch: u32,
    /// The number of rollbacks that took place during `epoch`.
    count: u8,
}
220
221impl Default for RollbackHistory {
222    fn default() -> Self {
223        RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }
224    }
225}
226
/// A request from either of the two FIDL protocols this component serves,
/// merged into a single type so both can be handled from one stream.
enum ServiceRequest {
    /// A request to change the user or automated version setting.
    Control(fnet_migration::ControlRequest),
    /// A request to read the versions currently in effect.
    State(fnet_migration::StateRequest),
}
231
/// The state of the migration service.
///
/// Generic over its persistence backend (`P`), collaborative reboot
/// scheduler (`CR`) and crash reporter (`R`).
struct Migration<P, CR, R> {
    /// The Netstack selection computed from the persisted state when this
    /// value was constructed. It is not updated when settings change.
    current_boot: RollbackNetstackVersion,
    /// In-memory copy of the persisted settings and rollback state.
    persisted: Persisted,
    /// Where `persisted` is loaded from and saved to.
    persistence: P,
    /// Tracks the collaborative reboot requested when the desired version
    /// differs from `current_boot`.
    collaborative_reboot: CollaborativeReboot<CR>,
    /// Used to file crash reports (see `CrashReportReason`).
    crash_reporter: R,
}
239
/// An abstraction over the storage that holds the serialized [`Persisted`]
/// state.
trait PersistenceProvider {
    /// The sink returned by `open_writer`.
    type Writer: std::io::Write;
    /// The source returned by `open_reader`.
    type Reader: std::io::Read;

    /// Opens a writer that the persisted state is serialized into.
    fn open_writer(&mut self) -> std::io::Result<Self::Writer>;
    /// Opens a reader that the persisted state is loaded from.
    fn open_reader(&self) -> std::io::Result<Self::Reader>;
}
247
248fn try_persist<P: PersistenceProvider>(provider: &mut P, data: &Persisted) {
249    let w = match provider.open_writer() {
250        Ok(w) => w,
251        Err(e) => {
252            error!("failed to open writer to persist settings: {e:?}");
253            return;
254        }
255    };
256    data.save(w);
257}
258
/// A `PersistenceProvider` backed by the file at `PERSISTED_FILE_PATH`.
struct DataPersistenceProvider {}
260
/// Path of the JSON file that holds the persisted migration state.
const PERSISTED_FILE_PATH: &str = "/data/config.json";
262
263impl PersistenceProvider for DataPersistenceProvider {
264    type Writer = std::fs::File;
265    type Reader = std::fs::File;
266
267    fn open_writer(&mut self) -> std::io::Result<Self::Writer> {
268        std::fs::File::create(PERSISTED_FILE_PATH)
269    }
270
271    fn open_reader(&self) -> std::io::Result<Self::Reader> {
272        std::fs::File::open(PERSISTED_FILE_PATH)
273    }
274}
275
/// Tracks at most one outstanding collaborative reboot request issued
/// through `scheduler`.
struct CollaborativeReboot<CR> {
    /// Used to issue the reboot request.
    scheduler: CR,
    /// `Some(<cancellation_token>)` if there's an outstanding collaborative
    /// reboot scheduled.
    scheduled_req: Option<zx::EventPair>,
}
282
283impl<CR: CollaborativeRebootScheduler> CollaborativeReboot<CR> {
284    /// Schedules a collaborative reboot.
285    ///
286    /// No-Op if there's already a reboot scheduled.
287    async fn schedule(&mut self) {
288        let Self { scheduler, scheduled_req } = self;
289        if scheduled_req.is_some() {
290            // We already have an outstanding request.
291            return;
292        }
293
294        info!("Scheduling collaborative reboot");
295        let (mine, theirs) = zx::EventPair::create();
296        *scheduled_req = Some(mine);
297        scheduler
298            .schedule(fpower::CollaborativeRebootReason::NetstackMigration, Some(theirs))
299            .await;
300    }
301
302    /// Cancels the currently scheduled collaborative Reboot.
303    ///
304    /// No-Op if there's none scheduled.
305    fn cancel(&mut self) {
306        if let Some(cancel) = self.scheduled_req.take() {
307            info!("Canceling collaborative reboot request. It's no longer necessary.");
308            // Dropping the eventpair cancels the request.
309            std::mem::drop(cancel);
310        }
311    }
312}
313
/// An abstraction over the `fpower::CollaborativeRebootScheduler` FIDL API.
trait CollaborativeRebootScheduler {
    /// Requests a collaborative reboot for `reason`.
    ///
    /// `cancel`, when provided, is one end of an event pair whose other end
    /// is retained by the caller; the caller drops its end to cancel the
    /// request. Returns nothing, so failures are not reported to the caller.
    async fn schedule(
        &mut self,
        reason: fpower::CollaborativeRebootReason,
        cancel: Option<zx::EventPair>,
    );
}
322
323/// An implementation of `CollaborativeRebootScheduler` that connects to the
324/// API over FIDL.
325struct Scheduler {}
326
327impl CollaborativeRebootScheduler for Scheduler {
328    async fn schedule(
329        &mut self,
330        reason: fpower::CollaborativeRebootReason,
331        cancel: Option<zx::EventPair>,
332    ) {
333        let proxy = match fuchsia_component::client::connect_to_protocol::<
334            fpower::CollaborativeRebootSchedulerMarker,
335        >() {
336            Ok(proxy) => proxy,
337            Err(e) => {
338                error!("Failed to connect to collaborative reboot scheduler: {e:?}");
339                return;
340            }
341        };
342        match proxy.schedule_reboot(reason, cancel).await {
343            Ok(()) => {}
344            Err(e) => error!("Failed to schedule collaborative reboot: {e:?}"),
345        }
346    }
347}
348
/// The reason to file a crash report.
#[derive(Debug, Clone, PartialEq, Eq)]
enum CrashReportReason {
    /// The number of failed healthchecks reached
    /// `HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS`.
    FailedHealthcheck,
    /// The device rolled back from Netstack3.
    RolledBack,
}
355
/// An abstraction over the `fidl_fuchsia_feedback::CrashReporter` FIDL API.
trait CrashReporter {
    /// The amount of time after startup to wait before filing a rollback
    /// induced crash report.
    const ROLLBACK_CRASH_REPORT_DELAY: Duration;

    /// Files a crash report for `reason`. Returns nothing, so failures are
    /// not reported to the caller.
    fn file_report(&mut self, reason: CrashReportReason);
}
364
365/// An implementation of `CrashReporter` that connects to the API over FIDL.
366struct Reporter {}
367
368impl CrashReporter for Reporter {
369    const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::from_secs(180);
370
371    fn file_report(&mut self, reason: CrashReportReason) {
372        const PROGRAM_NAME: &str = "netstack_migration";
373        const FAILED_HEALTHCHECK_SIGNATURE: &str = "fuchsia-netstack3-healthcheck-failed";
374        const ROLLED_BACK_SIGNATURE: &str = "fuchsia-netstack3-rolled-back";
375
376        info!("Generating a crash report for reason: {reason:?}");
377
378        let proxy = match fuchsia_component::client::connect_to_protocol::<
379            fidl_fuchsia_feedback::CrashReporterMarker,
380        >() {
381            Ok(proxy) => proxy,
382            Err(e) => {
383                error!("Failed to connect to CrashReport proxy: {e:?}");
384                return;
385            }
386        };
387
388        let signature = match reason {
389            CrashReportReason::FailedHealthcheck => FAILED_HEALTHCHECK_SIGNATURE,
390            CrashReportReason::RolledBack => ROLLED_BACK_SIGNATURE,
391        };
392        let report = fidl_fuchsia_feedback::CrashReport {
393            program_name: Some(PROGRAM_NAME.to_string()),
394            program_uptime: Some(zx::MonotonicInstant::get().into_nanos()),
395            crash_signature: Some(signature.to_string()),
396            is_fatal: Some(false),
397            ..Default::default()
398        };
399
400        // File the crash report in a background task. It may take a while and
401        // we don't want to block the migration worker from serving other
402        // operations.
403        Task::spawn(proxy.file_report(report).map(|result| match result {
404            Ok(Ok(status)) => info!("Successfully filed crash report. Status={status:?}"),
405            Ok(Err(e)) => error!("Failed to file crash report: {e:?}"),
406            Err(e) => error!("Failed to request crash report: {e:?}"),
407        }))
408        .detach();
409    }
410}
411
412impl<P: PersistenceProvider, CR: CollaborativeRebootScheduler, R: CrashReporter>
413    Migration<P, CR, R>
414{
415    fn new(mut persistence: P, cr_scheduler: CR, crash_reporter: R) -> Self {
416        let mut persisted = persistence.open_reader().map(Persisted::load).unwrap_or_else(|e| {
417            warn!("could not open persistence reader: {e:?}. using defaults");
418            Persisted::default()
419        });
420
421        // Routine to update the rollback history on startup based on persisted
422        // values from the previous boot.
423        //
424        // This performs two important operations:
425        //   1) reset the failure count if the epoch has advanced, and
426        //   2) increment the failure count if the previous boot failed to
427        //      migrate.
428        let mut new = match persisted.rollback_history {
429            None => RollbackHistory::default(),
430            Some(RollbackHistory { epoch, count }) if epoch != CURRENT_EPOCH => {
431                if count > 0 {
432                    info!("Reseting Rollback History for new epoch");
433                }
434                RollbackHistory::default()
435            }
436            Some(history) => history,
437        };
438        if let Some(rollback::Persisted::HealthcheckFailures(failures)) = persisted.rollback {
439            if failures >= rollback::MAX_FAILED_HEALTHCHECKS {
440                new.count = new.count.saturating_add(1);
441            }
442        }
443
444        if Some(new) != persisted.rollback_history {
445            persisted.rollback_history = Some(new);
446            try_persist(&mut persistence, &persisted);
447        }
448
449        info!("Netstack Migration starting with persisted state: {persisted:?}");
450
451        let current_boot = persisted.desired_netstack_version();
452
453        match current_boot {
454            RollbackNetstackVersion::ForceNetstack2 => {
455                warn!(
456                    "Previous boot failed to migrate to Netstack3. \
457                    Ignoring automated setting and forcibly using Netstack2 \
458                    for the current boot."
459                );
460            }
461            RollbackNetstackVersion::ForceNetstack2Indefinitely => {
462                warn!(
463                    "Previous boot failed to migrate to Netstack3 and the \
464                    rollback limit has been exceeded. Ignoring automated \
465                    setting and forcibly using Netstack2 until the next epoch."
466                );
467            }
468            RollbackNetstackVersion::Netstack2 | RollbackNetstackVersion::Netstack3 => {}
469        }
470
471        Self {
472            current_boot,
473            persisted,
474            persistence,
475            collaborative_reboot: CollaborativeReboot {
476                scheduler: cr_scheduler,
477                scheduled_req: None,
478            },
479            crash_reporter,
480        }
481    }
482
483    fn persist(&mut self) {
484        let Self {
485            current_boot: _,
486            persisted,
487            persistence,
488            collaborative_reboot: _,
489            crash_reporter: _,
490        } = self;
491        try_persist(persistence, persisted);
492    }
493
494    fn map_version_setting(
495        version: Option<Box<fnet_migration::VersionSetting>>,
496    ) -> Option<NetstackVersion> {
497        version.map(|v| {
498            let fnet_migration::VersionSetting { version } = &*v;
499            (*version).into()
500        })
501    }
502
503    async fn update_collaborative_reboot(&mut self) {
504        let Self {
505            current_boot,
506            persisted,
507            persistence: _,
508            collaborative_reboot,
509            crash_reporter: _,
510        } = self;
511        if persisted.desired_netstack_version().version() != current_boot.version() {
512            // When the current boot differs from our desired version, schedule
513            // a reboot (if there's not already one).
514            collaborative_reboot.schedule().await
515        } else {
516            // When the current_boot matches our desired version, we no longer
517            // need reboot. Cancel the outstanding request (if any)
518            collaborative_reboot.cancel()
519        }
520    }
521
522    async fn update_rollback_state(&mut self, new_state: rollback::Persisted) {
523        if self.persisted.rollback != Some(new_state) {
524            self.persisted.rollback = Some(new_state);
525            self.update_collaborative_reboot().await;
526            self.persist();
527
528            if let rollback::Persisted::HealthcheckFailures(f) = new_state {
529                if f >= HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS {
530                    self.crash_reporter.file_report(CrashReportReason::FailedHealthcheck);
531                }
532            }
533        }
534    }
535
536    async fn handle_control_request(
537        &mut self,
538        req: fnet_migration::ControlRequest,
539    ) -> Result<(), fidl::Error> {
540        match req {
541            fnet_migration::ControlRequest::SetAutomatedNetstackVersion { version, responder } => {
542                let version = Self::map_version_setting(version);
543                let Self {
544                    current_boot: _,
545                    persisted: Persisted { automated, user: _, rollback: _, rollback_history: _ },
546                    persistence: _,
547                    collaborative_reboot: _,
548                    crash_reporter: _,
549                } = self;
550                if version != *automated {
551                    info!("automated netstack version switched to {version:?}");
552                    *automated = version;
553                    self.persist();
554                    self.update_collaborative_reboot().await;
555                }
556                responder.send()
557            }
558            fnet_migration::ControlRequest::SetUserNetstackVersion { version, responder } => {
559                let version = Self::map_version_setting(version);
560                let Self {
561                    current_boot: _,
562                    persisted: Persisted { automated: _, user, rollback: _, rollback_history: _ },
563                    persistence: _,
564                    collaborative_reboot: _,
565                    crash_reporter: _,
566                } = self;
567                if version != *user {
568                    info!("user netstack version switched to {version:?}");
569                    *user = version;
570                    self.persist();
571                    self.update_collaborative_reboot().await;
572                }
573                responder.send()
574            }
575        }
576    }
577
578    fn handle_state_request(&self, req: fnet_migration::StateRequest) -> Result<(), fidl::Error> {
579        let Migration {
580            current_boot,
581            persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
582            persistence: _,
583            collaborative_reboot: _,
584            crash_reporter: _,
585        } = self;
586        match req {
587            fnet_migration::StateRequest::GetNetstackVersion { responder } => {
588                responder.send(&fnet_migration::InEffectVersion {
589                    current_boot: current_boot.version().into(),
590                    user: (*user).map(Into::into),
591                    automated: (*automated).map(Into::into),
592                })
593            }
594        }
595    }
596
597    async fn handle_request(&mut self, req: ServiceRequest) -> Result<(), fidl::Error> {
598        match req {
599            ServiceRequest::Control(r) => self.handle_control_request(r).await,
600            ServiceRequest::State(r) => self.handle_state_request(r),
601        }
602    }
603}
604
/// The Inspect properties whose values can change after startup.
struct InspectNodes {
    /// The automated setting: `2` or `3` for the selected version, `0` when
    /// unset.
    automated_setting: fuchsia_inspect::UintProperty,
    /// The user setting, encoded like `automated_setting`.
    user_setting: fuchsia_inspect::UintProperty,
    /// The `Debug` formatting of the persisted rollback state.
    rollback_state: fuchsia_inspect::StringProperty,
}
610
611impl InspectNodes {
612    fn new<P, CR, F>(inspector: &fuchsia_inspect::Inspector, m: &Migration<P, CR, F>) -> Self {
613        let root = inspector.root();
614        let Migration {
615            current_boot,
616            persisted: Persisted { automated, user, rollback, rollback_history },
617            ..
618        } = m;
619        let automated_setting = root.create_uint(
620            "automated_setting",
621            NetstackVersion::optional_inspect_uint_value(automated),
622        );
623        let user_setting =
624            root.create_uint("user_setting", NetstackVersion::optional_inspect_uint_value(user));
625
626        let rollback_state = root.create_string("rollback_state", format!("{rollback:?}"));
627
628        // Record immutable state once, instead of keeping track of a property node.
629        root.record_uint("current_boot", current_boot.version().inspect_uint_value());
630        let forced_netstack2 = match current_boot {
631            RollbackNetstackVersion::Netstack2 | RollbackNetstackVersion::Netstack3 => false,
632            RollbackNetstackVersion::ForceNetstack2
633            | RollbackNetstackVersion::ForceNetstack2Indefinitely => true,
634        };
635        root.record_bool("forced_netstack2", forced_netstack2);
636        root.record_uint("current_epoch", CURRENT_EPOCH.into());
637        root.record_string("rollback_history", format!("{rollback_history:?}"));
638
639        Self { automated_setting, user_setting, rollback_state }
640    }
641
642    fn update<P, CR, F>(&self, m: &Migration<P, CR, F>) {
643        let Migration {
644            persisted: Persisted { automated, user, rollback, rollback_history: _ },
645            ..
646        } = m;
647        let Self { automated_setting, user_setting, rollback_state } = self;
648        automated_setting.set(NetstackVersion::optional_inspect_uint_value(automated));
649        user_setting.set(NetstackVersion::optional_inspect_uint_value(user));
650        rollback_state.set(&format!("{rollback:?}"));
651    }
652}
653
/// Wraps communication with metrics (cobalt) server.
struct MetricsLogger {
    /// The connection used to log metrics. `None` if the logger could not be
    /// created, in which case metrics are silently not logged.
    logger: Option<fmetrics::MetricEventLoggerProxy>,
}
658
659impl MetricsLogger {
660    async fn new() -> Self {
661        let (logger, server_end) =
662            fidl::endpoints::create_proxy::<fmetrics::MetricEventLoggerMarker>();
663
664        let factory = match fuchsia_component::client::connect_to_protocol::<
665            fmetrics::MetricEventLoggerFactoryMarker,
666        >() {
667            Ok(f) => f,
668            Err(e) => {
669                warn!("can't connect to logger factory {e:?}");
670                return Self { logger: None };
671            }
672        };
673
674        match factory
675            .create_metric_event_logger(
676                &fmetrics::ProjectSpec {
677                    customer_id: Some(metrics_registry::CUSTOMER_ID),
678                    project_id: Some(metrics_registry::PROJECT_ID),
679                    ..Default::default()
680                },
681                server_end,
682            )
683            .await
684        {
685            Ok(Ok(())) => Self { logger: Some(logger) },
686            Ok(Err(e)) => {
687                warn!("can't create event logger {e:?}");
688                Self { logger: None }
689            }
690            Err(e) => {
691                warn!("error connecting to metric event logger {e:?}");
692                Self { logger: None }
693            }
694        }
695    }
696
697    /// Logs metrics from `migration` to the metrics server.
698    async fn log_metrics<P, CR, F>(&self, migration: &Migration<P, CR, F>) {
699        let logger = if let Some(logger) = self.logger.as_ref() {
700            logger
701        } else {
702            // Silently don't log metrics if we didn't manage to create a
703            // logger, warnings are emitted upon creation.
704            return;
705        };
706
707        let current_boot = match migration.current_boot {
708            RollbackNetstackVersion::Netstack2
709            | RollbackNetstackVersion::ForceNetstack2
710            | RollbackNetstackVersion::ForceNetstack2Indefinitely => {
711                metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack2
712            }
713            RollbackNetstackVersion::Netstack3 => {
714                metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack3
715            }
716        }
717        .as_event_code();
718        let user = match migration.persisted.user {
719            None => metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::NoSelection,
720            Some(NetstackVersion::Netstack2) => metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack2,
721            Some(NetstackVersion::Netstack3) => metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack3,
722        }
723        .as_event_code();
724        let automated = match migration.persisted.automated {
725            None => metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::NoSelection,
726            Some(NetstackVersion::Netstack2) => metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack2,
727            Some(NetstackVersion::Netstack3) => metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack3,
728        }.as_event_code();
729        let rollback_state = compute_state_metric(migration).as_event_code();
730        for (metric_id, event_code) in [
731            (metrics_registry::STACK_MIGRATION_CURRENT_BOOT_METRIC_ID, current_boot),
732            (metrics_registry::STACK_MIGRATION_USER_SETTING_METRIC_ID, user),
733            (metrics_registry::STACK_MIGRATION_AUTOMATED_SETTING_METRIC_ID, automated),
734            (metrics_registry::STACK_MIGRATION_STATE_METRIC_ID, rollback_state),
735        ] {
736            let occurrence_count = 1;
737            logger
738                .log_occurrence(metric_id, occurrence_count, &[event_code][..])
739                .await
740                .map(|r| {
741                    r.unwrap_or_else(|e| warn!("error reported logging metric {metric_id} {e:?}"))
742                })
743                .unwrap_or_else(|fidl_error| {
744                    warn!("error logging metric {metric_id} {fidl_error:?}")
745                });
746        }
747    }
748}
749
750fn compute_state_metric<P, CR, F>(
751    migration: &Migration<P, CR, F>,
752) -> metrics_registry::StackMigrationStateMetricDimensionMigrationState {
753    use metrics_registry::StackMigrationStateMetricDimensionMigrationState as state_metric;
754    let Migration {
755        current_boot,
756        persisted: Persisted { automated, user: _, rollback, rollback_history: _ },
757        persistence: _,
758        collaborative_reboot: _,
759        crash_reporter: _,
760    } = migration;
761
762    match (current_boot, automated, rollback) {
763        (RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack2) | None, _) => {
764            state_metric::NotStarted
765        }
766        (RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack3), _) => {
767            state_metric::Scheduled
768        }
769        (RollbackNetstackVersion::ForceNetstack2Indefinitely, _, _) => state_metric::Abandoned,
770        (RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack3), _) => {
771            state_metric::RolledBack
772        }
773        // NB: If a device observes the automated setting switch to Netstack2
774        // while it's current boot is `ForceNetstack2` it won't have any need
775        // to schedule a reboot, and it will perpetually stay in
776        // `ForceNetstack`. Rather that perpetually reporting
777        // `state_metric::RolledBack`, it's more useful from a diagnostics
778        // perspective to consider this device `state_metric::NotStarted`.
779        (RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack2) | None, _) => {
780            state_metric::NotStarted
781        }
782        (RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack2) | None, _) => {
783            state_metric::Canceled
784        }
785        (RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3), None) => {
786            state_metric::InProgress
787        }
788        (
789            RollbackNetstackVersion::Netstack3,
790            Some(NetstackVersion::Netstack3),
791            Some(rollback::Persisted::HealthcheckFailures(f)),
792        ) => {
793            if *f >= rollback::MAX_FAILED_HEALTHCHECKS {
794                state_metric::Failed
795            } else {
796                state_metric::InProgress
797            }
798        }
799        (
800            RollbackNetstackVersion::Netstack3,
801            Some(NetstackVersion::Netstack3),
802            Some(rollback::Persisted::Success),
803        ) => state_metric::Success,
804    }
805}
806
807#[fuchsia::main]
808pub async fn main() {
809    info!("running netstack migration service");
810
811    let mut fs = ServiceFs::new();
812    let _: &mut ServiceFsDir<'_, _> = fs
813        .dir("svc")
814        .add_fidl_service(|rs: fnet_migration::ControlRequestStream| {
815            rs.map(|req| req.map(ServiceRequest::Control)).left_stream()
816        })
817        .add_fidl_service(|rs: fnet_migration::StateRequestStream| {
818            rs.map(|req| req.map(ServiceRequest::State)).right_stream()
819        });
820    let _: &mut ServiceFs<_> =
821        fs.take_and_serve_directory_handle().expect("failed to take out directory handle");
822
823    let mut migration = Migration::new(DataPersistenceProvider {}, Scheduler {}, Reporter {});
824    main_inner(
825        &mut migration,
826        fs.fuse().flatten_unordered(None),
827        rollback::FidlHttpFetcher::new(),
828        rollback::new_healthcheck_stream(),
829    )
830    .await
831}
832
833async fn main_inner<
834    P: PersistenceProvider,
835    CR: CollaborativeRebootScheduler,
836    R: CrashReporter,
837    H: rollback::HttpFetcher + Send + 'static,
838    T: Stream<Item = ()> + Send + 'static,
839    SR: Stream<Item = Result<ServiceRequest, fidl::Error>>,
840>(
841    migration: &mut Migration<P, CR, R>,
842    service_request_stream: SR,
843    http_fetcher: H,
844    healthcheck_tick: T,
845) {
846    let inspector = fuchsia_inspect::component::inspector();
847    let _inspect_server =
848        inspect_runtime::publish(inspector, inspect_runtime::PublishOptions::default())
849            .expect("failed to serve inspector");
850    let inspect_nodes = InspectNodes::new(inspector, &migration);
851
852    let metrics_logger = MetricsLogger::new().await;
853
854    let (desired_version_sender, desired_version_receiver) = mpsc::unbounded();
855    let (rollback_state_sender, rollback_state_receiver) = mpsc::unbounded();
856
857    // NB: Check if we failed to migrate to NS3 in the previous boot.
858    //
859    // It's important to perform this check before the rollback state is reset
860    // below.
861    let current_boot_is_rollback = match migration.persisted.rollback {
862        Some(rollback::Persisted::Success) | None => false,
863        Some(rollback::Persisted::HealthcheckFailures(f)) => f >= rollback::MAX_FAILED_HEALTHCHECKS,
864    };
865
866    let rollback_state =
867        rollback::State::new(migration.persisted.rollback, migration.current_boot.version());
868
869    // Update rollback persistence immediately in case the device reboots before
870    // the rollback module has time to send an asynchronous update. This is
871    // required for correctness if Netstack3 is crashing on startup, or in the
872    // following case:
873    //
874    // 1. Device fails to migrate to Netstack3 and persists
875    //    HealthcheckFailures(MAX_FAILED_HEALTHCHECKS), which will force
876    //    Netstack2 on subsequent boots.
877    // 2. Device reboots into Netstack2, sees that it should be running
878    //    Netstack3, and schedules a reboot without clearing the failures.
879    // 3. Device reboots back into Netstack3 and sees that it should schedule
880    //    a reboot because the persisted failures are above the limit.
881    migration.update_rollback_state(rollback_state.persisted()).await;
882
883    Task::spawn(async move {
884        rollback::run(
885            rollback_state,
886            http_fetcher,
887            desired_version_receiver,
888            rollback_state_sender,
889            pin!(healthcheck_tick),
890        )
891        .await
892    })
893    .detach();
894
895    enum Action {
896        ServiceRequest(Result<ServiceRequest, fidl::Error>),
897        LogMetrics,
898        UpdateRollbackState(rollback::Persisted),
899        FileRollbackCrashReport,
900    }
901
902    let metrics_logging_interval = fuchsia_async::MonotonicDuration::from_hours(1);
903    let mut stream: futures::stream::SelectAll<Pin<Box<dyn Stream<Item = Action>>>> =
904        futures::stream::SelectAll::new();
905
906    // Always log metrics once on startup then periodically log new values so
907    // the aggregation window always contains one sample of the current
908    // settings.
909    stream.push(Box::pin(Box::new(
910        futures::stream::once(futures::future::ready(()))
911            .chain(fuchsia_async::Interval::new(metrics_logging_interval))
912            .map(|()| Action::LogMetrics),
913    )));
914    stream.push(Box::pin(Box::new(Box::pin(service_request_stream.map(Action::ServiceRequest)))));
915    stream.push(Box::pin(rollback_state_receiver.map(|state| Action::UpdateRollbackState(state))));
916
917    // If the current boot is a rollback, schedule a crash report to be filed
918    // soon. Don't file it right away, as during early boot there is not much
919    // useful diagnostics data available.
920    if current_boot_is_rollback {
921        stream.push(Box::pin(
922            futures::stream::once(fuchsia_async::Timer::new(R::ROLLBACK_CRASH_REPORT_DELAY))
923                .map(|()| Action::FileRollbackCrashReport),
924        ));
925    }
926
927    while let Some(action) = stream.next().await {
928        match action {
929            Action::ServiceRequest(req) => {
930                let result = match req {
931                    Ok(req) => migration.handle_request(req).await,
932                    Err(e) => Err(e),
933                };
934                // Always update inspector state after handling a request.
935                inspect_nodes.update(&migration);
936
937                // Send the desired netstack version to the rollback mechanism,
938                // but ignore the "forced" setting. The "forced" setting comes
939                // from the rollback mechanism, and sending that signal back
940                // into it would cause a the mechanism to incorrectly detect
941                // a cancelation.
942                match desired_version_sender.unbounded_send(
943                    migration.persisted.desired_netstack_version().version_ignoring_force(),
944                ) {
945                    Ok(()) => (),
946                    Err(e) => {
947                        error!("error sending update to rollback module: {:?}", e);
948                    }
949                }
950
951                match result {
952                    Ok(()) => (),
953                    Err(e) => {
954                        if !e.is_closed() {
955                            error!("error processing FIDL request {:?}", e)
956                        }
957                    }
958                }
959            }
960            Action::LogMetrics => {
961                metrics_logger.log_metrics(&migration).await;
962            }
963            Action::UpdateRollbackState(new_state) => {
964                migration.update_rollback_state(new_state).await;
965                // Always update inspector state when the rollback state
966                // changes.
967                inspect_nodes.update(&migration);
968            }
969            Action::FileRollbackCrashReport => {
970                migration.crash_reporter.file_report(CrashReportReason::RolledBack);
971            }
972        }
973    }
974}
975
976#[cfg(test)]
977mod tests {
978    use super::*;
979    use assert_matches::assert_matches;
980    use async_utils::event::{Event, EventWait};
981    use diagnostics_assertions::assert_data_tree;
982    use fidl::Peered as _;
983    use fidl_fuchsia_net_http as fnet_http;
984    use fuchsia_async::TimeoutExt;
985    use futures::FutureExt;
986    use std::cell::RefCell;
987    use std::rc::Rc;
988    use std::time::Duration;
989    use test_case::test_case;
990
991    #[derive(Default, Clone)]
992    struct InMemory {
993        file: Rc<RefCell<Option<Vec<u8>>>>,
994    }
995
996    impl InMemory {
997        fn with_persisted(p: Persisted) -> Self {
998            let mut s = Self::default();
999            p.save(s.open_writer().unwrap());
1000            s
1001        }
1002    }
1003
1004    impl PersistenceProvider for InMemory {
1005        type Writer = Self;
1006        type Reader = std::io::Cursor<Vec<u8>>;
1007
1008        fn open_writer(&mut self) -> std::io::Result<Self::Writer> {
1009            *self.file.borrow_mut() = Some(Vec::new());
1010            Ok(self.clone())
1011        }
1012
1013        fn open_reader(&self) -> std::io::Result<Self::Reader> {
1014            self.file
1015                .borrow()
1016                .clone()
1017                .map(std::io::Cursor::new)
1018                .ok_or_else(|| std::io::ErrorKind::NotFound.into())
1019        }
1020    }
1021
1022    impl std::io::Write for InMemory {
1023        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
1024            let r = self.file.borrow_mut().as_mut().expect("no file open").write(buf);
1025            r
1026        }
1027
1028        fn flush(&mut self) -> std::io::Result<()> {
1029            Ok(())
1030        }
1031    }
1032
1033    struct NoCollaborativeReboot;
1034
1035    impl CollaborativeRebootScheduler for NoCollaborativeReboot {
1036        async fn schedule(
1037            &mut self,
1038            _reason: fpower::CollaborativeRebootReason,
1039            _cancel: Option<zx::EventPair>,
1040        ) {
1041            panic!("unexpectedly attempted to schedule a collaborative reboot");
1042        }
1043    }
1044
1045    #[derive(Default)]
1046    struct FakeCollaborativeReboot {
1047        req: Option<zx::EventPair>,
1048    }
1049
1050    impl CollaborativeRebootScheduler for FakeCollaborativeReboot {
1051        async fn schedule(
1052            &mut self,
1053            reason: fpower::CollaborativeRebootReason,
1054            cancel: Option<zx::EventPair>,
1055        ) {
1056            assert_eq!(reason, fpower::CollaborativeRebootReason::NetstackMigration);
1057            let cancel = cancel.expect("cancellation signal must be provided");
1058            assert_eq!(self.req.replace(cancel), None, "attempted to schedule multiple reboots");
1059        }
1060    }
1061
1062    struct NoCrashReports;
1063
1064    impl CrashReporter for NoCrashReports {
1065        const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::ZERO;
1066        fn file_report(&mut self, reason: CrashReportReason) {
1067            panic!("unexpectedly attemped to file a crash report: {reason:?}")
1068        }
1069    }
1070
1071    #[derive(Default)]
1072    struct FakeCrashReporter {
1073        reports: Vec<CrashReportReason>,
1074    }
1075
1076    impl CrashReporter for FakeCrashReporter {
1077        const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::from_millis(1);
1078        fn file_report(&mut self, reason: CrashReportReason) {
1079            self.reports.push(reason)
1080        }
1081    }
1082
1083    /// Signals an `EventWait` once the `target` crash reports have been filed.
1084    struct AwaitCrashReports {
1085        target: Vec<CrashReportReason>,
1086        reports: Vec<CrashReportReason>,
1087        event: Event,
1088    }
1089
1090    impl AwaitCrashReports {
1091        fn new(target: Vec<CrashReportReason>) -> (Self, EventWait) {
1092            let event = Event::new();
1093            let wait = event.wait();
1094            (Self { target, reports: Vec::new(), event }, wait)
1095        }
1096    }
1097
1098    impl CrashReporter for AwaitCrashReports {
1099        const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::from_millis(1);
1100        fn file_report(&mut self, reason: CrashReportReason) {
1101            self.reports.push(reason);
1102            if self.reports == self.target {
1103                assert!(self.event.signal());
1104            }
1105        }
1106    }
1107
1108    fn serve_migration<
1109        P: PersistenceProvider,
1110        CR: CollaborativeRebootScheduler,
1111        R: CrashReporter,
1112    >(
1113        migration: Migration<P, CR, R>,
1114    ) -> (
1115        impl futures::Future<Output = Migration<P, CR, R>>,
1116        fnet_migration::ControlProxy,
1117        fnet_migration::StateProxy,
1118    ) {
1119        let (control, control_server) =
1120            fidl::endpoints::create_proxy_and_stream::<fnet_migration::ControlMarker>();
1121        let (state, state_server) =
1122            fidl::endpoints::create_proxy_and_stream::<fnet_migration::StateMarker>();
1123
1124        let fut = {
1125            let control =
1126                control_server.map(|req| ServiceRequest::Control(req.expect("control error")));
1127            let state = state_server.map(|req| ServiceRequest::State(req.expect("state error")));
1128            futures::stream::select(control, state).fold(migration, |mut migration, req| async {
1129                migration.handle_request(req).await.expect("handling request");
1130                migration
1131            })
1132        };
1133        (fut, control, state)
1134    }
1135
1136    #[test_case(Persisted{
1137        user: Some(NetstackVersion::Netstack2),
1138        automated: None,
1139        rollback: None,
1140        rollback_history: None,
1141    }; "user_netstack2")]
1142    #[test_case(Persisted{
1143        user: Some(NetstackVersion::Netstack3),
1144        automated: None,
1145        rollback: None,
1146        rollback_history: None,
1147    }; "user_netstack3")]
1148    #[test_case(Persisted{
1149        user: None,
1150        automated: None,
1151        rollback: None,
1152        rollback_history: None,
1153    }; "none")]
1154    #[test_case(Persisted{
1155        user: None,
1156        automated: Some(NetstackVersion::Netstack2),
1157        rollback: None,
1158        rollback_history: None,
1159    }; "automated_netstack2")]
1160    #[test_case(Persisted{
1161        user: None,
1162        automated: Some(NetstackVersion::Netstack3),
1163        rollback: None,
1164        rollback_history: None,
1165    }; "automated_netstack3")]
1166    #[test_case(Persisted{
1167        user: None,
1168        automated: None,
1169        rollback: Some(rollback::Persisted::Success),
1170        rollback_history: None,
1171    }; "rollback_success")]
1172    #[test_case(Persisted{
1173        user: None,
1174        automated: None,
1175        rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
1176        rollback_history: Some(RollbackHistory {epoch: CURRENT_EPOCH, count: 3}),
1177    }; "rollback_history")]
1178    #[test_case(Persisted{
1179        user: Some(NetstackVersion::Netstack2),
1180        automated: Some(NetstackVersion::Netstack3),
1181        rollback: Some(rollback::Persisted::HealthcheckFailures(5)),
1182        rollback_history: Some(RollbackHistory {epoch: CURRENT_EPOCH, count: 3}),
1183    }; "all")]
1184    #[fuchsia::test(add_test_attr = false)]
1185    fn persist_save_load(v: Persisted) {
1186        let mut m = InMemory::default();
1187        v.save(m.open_writer().unwrap());
1188        assert_eq!(Persisted::load(m.open_reader().unwrap()), v);
1189    }
1190
1191    #[fuchsia::test]
1192    fn uses_defaults_if_no_persistence() {
1193        let m = Migration::new(InMemory::default(), NoCollaborativeReboot, NoCrashReports);
1194        let Migration {
1195            current_boot,
1196            persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1197            persistence: _,
1198            collaborative_reboot: _,
1199            crash_reporter: _,
1200        } = m;
1201        assert_eq!(current_boot.version(), DEFAULT_NETSTACK);
1202        assert_eq!(user, None);
1203        assert_eq!(automated, None);
1204    }
1205
1206    #[test_case(
1207        None, Some(NetstackVersion::Netstack3), None, None, NetstackVersion::Netstack3;
1208        "automated_ns3")]
1209    #[test_case(
1210        None, Some(NetstackVersion::Netstack2), None, None, NetstackVersion::Netstack2;
1211        "automated_ns2")]
1212    #[test_case(
1213        Some(NetstackVersion::Netstack3),
1214        Some(NetstackVersion::Netstack2),
1215        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS)),
1216        Some(RollbackHistory{epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH}),
1217        NetstackVersion::Netstack3;
1218        "user_ns3_override")]
1219    #[test_case(
1220        Some(NetstackVersion::Netstack2),
1221        Some(NetstackVersion::Netstack3),
1222        Some(rollback::Persisted::Success),
1223        None,
1224        NetstackVersion::Netstack2;
1225        "user_ns2_override")]
1226    #[test_case(
1227        Some(NetstackVersion::Netstack2),
1228        None,
1229        None,
1230        None,
1231        NetstackVersion::Netstack2; "user_ns2")]
1232    #[test_case(
1233        Some(NetstackVersion::Netstack3),
1234        None,
1235        None,
1236        None,
1237        NetstackVersion::Netstack3; "user_ns3")]
1238    #[test_case(
1239        None,
1240        Some(NetstackVersion::Netstack3),
1241        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS)),
1242        None,
1243        NetstackVersion::Netstack2; "rollback_to_ns2")]
1244    #[test_case(
1245        None,
1246        Some(NetstackVersion::Netstack3),
1247        Some(rollback::Persisted::HealthcheckFailures(0)),
1248        Some(RollbackHistory{epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH}),
1249        NetstackVersion::Netstack2; "indefinitely_rolled_back_to_ns2")]
1250    #[test_case(
1251        None,
1252        Some(NetstackVersion::Netstack3),
1253        Some(rollback::Persisted::HealthcheckFailures(0)),
1254        Some(RollbackHistory{epoch: CURRENT_EPOCH - 1, count: MAX_ROLLBACKS_PER_EPOCH}),
1255        NetstackVersion::Netstack3; "ignore_rollback_history_from_old_epoch")]
1256    #[test_case(None, None, None, None, DEFAULT_NETSTACK; "default")]
1257    #[fuchsia::test]
1258    async fn get_netstack_version(
1259        p_user: Option<NetstackVersion>,
1260        p_automated: Option<NetstackVersion>,
1261        p_rollback: Option<rollback::Persisted>,
1262        p_history: Option<RollbackHistory>,
1263        expect: NetstackVersion,
1264    ) {
1265        let m = Migration::new(
1266            InMemory::with_persisted(Persisted {
1267                user: p_user,
1268                automated: p_automated,
1269                rollback: p_rollback,
1270                rollback_history: p_history,
1271            }),
1272            NoCollaborativeReboot,
1273            NoCrashReports,
1274        );
1275        let Migration {
1276            current_boot,
1277            persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1278            persistence: _,
1279            collaborative_reboot: _,
1280            crash_reporter: _,
1281        } = &m;
1282        assert_eq!(current_boot.version(), expect);
1283        assert_eq!(*user, p_user);
1284        assert_eq!(*automated, p_automated);
1285
1286        let (serve, _, state) = serve_migration(m);
1287        let fut = async move {
1288            let fnet_migration::InEffectVersion { current_boot, user, automated } =
1289                state.get_netstack_version().await.expect("get netstack version");
1290            let expect = expect.into();
1291            let p_user = p_user.map(Into::into);
1292            let p_automated = p_automated.map(Into::into);
1293            assert_eq!(current_boot, expect);
1294            assert_eq!(user, p_user);
1295            assert_eq!(automated, p_automated);
1296        };
1297        let (_, ()): (Migration<_, _, _>, _) = futures::future::join(serve, fut).await;
1298    }
1299
1300    #[derive(Debug, Copy, Clone)]
1301    enum SetMechanism {
1302        User,
1303        Automated,
1304    }
1305
1306    #[test_case(SetMechanism::User, NetstackVersion::Netstack2; "set_user_ns2")]
1307    #[test_case(SetMechanism::User, NetstackVersion::Netstack3; "set_user_ns3")]
1308    #[test_case(SetMechanism::Automated, NetstackVersion::Netstack2; "set_automated_ns2")]
1309    #[test_case(SetMechanism::Automated, NetstackVersion::Netstack3; "set_automated_ns3")]
1310    #[fuchsia::test]
1311    async fn set_netstack_version(mechanism: SetMechanism, set_version: NetstackVersion) {
1312        let m = Migration::new(
1313            InMemory::with_persisted(Default::default()),
1314            FakeCollaborativeReboot::default(),
1315            NoCrashReports,
1316        );
1317        let (serve, control, _) = serve_migration(m);
1318        let fut = async move {
1319            let setting = fnet_migration::VersionSetting { version: set_version.into() };
1320            match mechanism {
1321                SetMechanism::User => control
1322                    .set_user_netstack_version(Some(&setting))
1323                    .await
1324                    .expect("set user netstack version"),
1325                SetMechanism::Automated => control
1326                    .set_automated_netstack_version(Some(&setting))
1327                    .await
1328                    .expect("set automated netstack version"),
1329            }
1330        };
1331        let (migration, ()) = futures::future::join(serve, fut).await;
1332
1333        let validate_versions = |m: &Migration<_, _, _>, current| {
1334            let Migration {
1335                current_boot,
1336                persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1337                persistence: _,
1338                collaborative_reboot: _,
1339                crash_reporter: _,
1340            } = m;
1341            assert_eq!(current_boot.version(), current);
1342            match mechanism {
1343                SetMechanism::User => {
1344                    assert_eq!(*user, Some(set_version));
1345                    assert_eq!(*automated, None);
1346                }
1347                SetMechanism::Automated => {
1348                    assert_eq!(*user, None);
1349                    assert_eq!(*automated, Some(set_version));
1350                }
1351            }
1352        };
1353
1354        validate_versions(&migration, DEFAULT_NETSTACK);
1355        // There should only be a collaborative reboot request if the new
1356        // version differs from the default version.
1357        let cr_req = &migration.collaborative_reboot.scheduler.req;
1358        if set_version != DEFAULT_NETSTACK {
1359            assert_eq!(Ok(false), cr_req.as_ref().expect("there should be a request").is_closed());
1360        } else {
1361            assert_eq!(cr_req, &None);
1362        }
1363
1364        // Check that the setting was properly persisted.
1365        let migration = Migration::new(
1366            migration.persistence,
1367            migration.collaborative_reboot.scheduler,
1368            migration.crash_reporter,
1369        );
1370        validate_versions(&migration, set_version);
1371    }
1372
1373    #[fuchsia::test]
1374    async fn update_rollback_state() {
1375        let mut migration = Migration::new(
1376            InMemory::with_persisted(Persisted {
1377                automated: Some(NetstackVersion::Netstack3),
1378                user: None,
1379                rollback: None,
1380                rollback_history: None,
1381            }),
1382            FakeCollaborativeReboot::default(),
1383            FakeCrashReporter::default(),
1384        );
1385
1386        assert_eq!(migration.current_boot.version(), NetstackVersion::Netstack3);
1387        assert!(migration.collaborative_reboot.scheduler.req.is_none());
1388
1389        // The first update shouldn't schedule a reboot because we haven't
1390        // passed the healthcheck threshold yet.
1391        migration.update_rollback_state(rollback::Persisted::HealthcheckFailures(1)).await;
1392        assert_matches!(
1393            migration.persisted.rollback,
1394            Some(rollback::Persisted::HealthcheckFailures(1))
1395        );
1396        assert!(migration.collaborative_reboot.scheduler.req.is_none());
1397
1398        // This second update should schedule a reboot because we've passed
1399        // the healthcheck limit and want to roll back to Netstack2.
1400        migration
1401            .update_rollback_state(rollback::Persisted::HealthcheckFailures(
1402                rollback::MAX_FAILED_HEALTHCHECKS,
1403            ))
1404            .await;
1405        assert_matches!(
1406            migration.persisted.rollback,
1407            Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS))
1408        );
1409        assert_eq!(
1410            migration
1411                .collaborative_reboot
1412                .scheduler
1413                .req
1414                .as_ref()
1415                .expect("reboot was not scheduled")
1416                .is_closed()
1417                .unwrap(),
1418            false
1419        );
1420
1421        // It should have also filed a crash report
1422        assert_eq!(migration.crash_reporter.reports, vec![CrashReportReason::FailedHealthcheck]);
1423
1424        // This emulates seeing a healthcheck success before rebooting, in which
1425        // case we should see the reboot get canceled.
1426        migration.update_rollback_state(rollback::Persisted::Success).await;
1427        assert_matches!(migration.persisted.rollback, Some(rollback::Persisted::Success));
1428        assert!(
1429            migration.collaborative_reboot.scheduler.req.as_ref().unwrap().is_closed().unwrap()
1430        );
1431
1432        // Ensure that the changes were persisted successfully.
1433        let migration = Migration::new(
1434            migration.persistence,
1435            migration.collaborative_reboot.scheduler,
1436            migration.crash_reporter,
1437        );
1438        assert_matches!(migration.persisted.rollback, Some(rollback::Persisted::Success));
1439    }
1440
1441    #[test_case(SetMechanism::User, Some(NetstackVersion::Netstack2), false)]
1442    #[test_case(SetMechanism::User, Some(NetstackVersion::Netstack3), true)]
1443    #[test_case(SetMechanism::User, None, false)]
1444    #[test_case(SetMechanism::Automated, Some(NetstackVersion::Netstack2), false)]
1445    #[test_case(SetMechanism::Automated, Some(NetstackVersion::Netstack3), true)]
1446    #[test_case(SetMechanism::Automated, None, true)]
1447    #[fuchsia::test]
1448    async fn cancel_collaborative_reboot(
1449        mechanism: SetMechanism,
1450        version: Option<NetstackVersion>,
1451        expect_canceled: bool,
1452    ) {
1453        let migration = Migration::new(
1454            InMemory::with_persisted(Persisted {
1455                user: None,
1456                automated: None,
1457                rollback: None,
1458                rollback_history: None,
1459            }),
1460            FakeCollaborativeReboot::default(),
1461            NoCrashReports,
1462        );
1463
1464        // Start of by updating the automated setting to Netstack2; this ensures
1465        // there is a pending request to cancel.
1466        let (serve, control, _) = serve_migration(migration);
1467        let fut = async move {
1468            control
1469                .set_automated_netstack_version(Some(&fnet_migration::VersionSetting {
1470                    version: fnet_migration::NetstackVersion::Netstack2,
1471                }))
1472                .await
1473                .expect("set automated netstack version");
1474        };
1475        let (migration, ()) = futures::future::join(serve, fut).await;
1476        let cancel = migration
1477            .collaborative_reboot
1478            .scheduler
1479            .req
1480            .as_ref()
1481            .expect("there should be a request");
1482        assert_eq!(Ok(false), cancel.is_closed());
1483
1484        // Update the setting based on the test parameters
1485        let (serve, control, _) = serve_migration(migration);
1486        let fut = async move {
1487            let setting = version.map(|v| fnet_migration::VersionSetting { version: v.into() });
1488            match mechanism {
1489                SetMechanism::User => control
1490                    .set_user_netstack_version(setting.as_ref())
1491                    .await
1492                    .expect("set user netstack version"),
1493                SetMechanism::Automated => control
1494                    .set_automated_netstack_version(setting.as_ref())
1495                    .await
1496                    .expect("set automated netstack version"),
1497            }
1498        };
1499        let (migration, ()) = futures::future::join(serve, fut).await;
1500
1501        let cancel = migration
1502            .collaborative_reboot
1503            .scheduler
1504            .req
1505            .as_ref()
1506            .expect("there should be a request");
1507        assert_eq!(Ok(expect_canceled), cancel.is_closed());
1508    }
1509
1510    #[test_case(SetMechanism::User)]
1511    #[test_case(SetMechanism::Automated)]
1512    #[fuchsia::test]
1513    async fn clear_netstack_version(mechanism: SetMechanism) {
1514        const PREVIOUS_VERSION: NetstackVersion = NetstackVersion::Netstack2;
1515        let m = Migration::new(
1516            InMemory::with_persisted(Persisted {
1517                user: Some(PREVIOUS_VERSION),
1518                automated: Some(PREVIOUS_VERSION),
1519                rollback: None,
1520                rollback_history: None,
1521            }),
1522            NoCollaborativeReboot,
1523            NoCrashReports,
1524        );
1525        let (serve, control, _) = serve_migration(m);
1526        let fut = async move {
1527            match mechanism {
1528                SetMechanism::User => control
1529                    .set_user_netstack_version(None)
1530                    .await
1531                    .expect("set user netstack version"),
1532                SetMechanism::Automated => control
1533                    .set_automated_netstack_version(None)
1534                    .await
1535                    .expect("set automated netstack version"),
1536            }
1537        };
1538        let (migration, ()) = futures::future::join(serve, fut).await;
1539
1540        let validate_versions = |m: &Migration<_, _, _>| {
1541            let Migration {
1542                current_boot,
1543                persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1544                persistence: _,
1545                collaborative_reboot: _,
1546                crash_reporter: _,
1547            } = m;
1548            assert_eq!(current_boot.version(), PREVIOUS_VERSION);
1549            match mechanism {
1550                SetMechanism::User => {
1551                    assert_eq!(*user, None);
1552                    assert_eq!(*automated, Some(PREVIOUS_VERSION));
1553                }
1554                SetMechanism::Automated => {
1555                    assert_eq!(*user, Some(PREVIOUS_VERSION));
1556                    assert_eq!(*automated, None);
1557                }
1558            }
1559        };
1560
1561        validate_versions(&migration);
1562        // Check that the setting was properly persisted.
1563        let migration = Migration::new(
1564            migration.persistence,
1565            migration.collaborative_reboot.scheduler,
1566            migration.crash_reporter,
1567        );
1568        validate_versions(&migration);
1569    }
1570
1571    #[fuchsia::test]
1572    async fn inspect() {
1573        let mut m = Migration::new(
1574            InMemory::with_persisted(Persisted {
1575                user: Some(NetstackVersion::Netstack2),
1576                automated: Some(NetstackVersion::Netstack3),
1577                rollback: None,
1578                rollback_history: None,
1579            }),
1580            NoCollaborativeReboot,
1581            NoCrashReports,
1582        );
1583        let inspector = fuchsia_inspect::component::inspector();
1584        let nodes = InspectNodes::new(inspector, &m);
1585        assert_data_tree!(inspector,
1586            root: {
1587                current_boot: 2u64,
1588                user_setting: 2u64,
1589                automated_setting: 3u64,
1590                rollback_state: "None",
1591                forced_netstack2: false,
1592                current_epoch: CURRENT_EPOCH,
1593                rollback_history: format!(
1594                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 0 }})"
1595                ),
1596            }
1597        );
1598
1599        m.persisted = Persisted {
1600            user: None,
1601            automated: Some(NetstackVersion::Netstack2),
1602            rollback: None,
1603            rollback_history: None,
1604        };
1605        nodes.update(&m);
1606        assert_data_tree!(inspector,
1607            root: {
1608                current_boot: 2u64,
1609                user_setting: 0u64,
1610                automated_setting: 2u64,
1611                rollback_state: "None",
1612                forced_netstack2: false,
1613                current_epoch: CURRENT_EPOCH,
1614                rollback_history: format!(
1615                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 0 }})"
1616                ),
1617            }
1618        );
1619    }
1620
1621    #[fuchsia::test]
1622    async fn inspect_rollback() {
1623        let mut m = Migration::new(
1624            InMemory::with_persisted(Persisted {
1625                user: None,
1626                automated: Some(NetstackVersion::Netstack3),
1627                rollback: Some(rollback::Persisted::HealthcheckFailures(
1628                    rollback::MAX_FAILED_HEALTHCHECKS,
1629                )),
1630                rollback_history: None,
1631            }),
1632            NoCollaborativeReboot,
1633            NoCrashReports,
1634        );
1635        let inspector = fuchsia_inspect::component::inspector();
1636        let nodes = InspectNodes::new(inspector, &m);
1637        assert_data_tree!(inspector,
1638            root: {
1639                current_boot: 2u64,
1640                user_setting: 0u64,
1641                automated_setting: 3u64,
1642                rollback_state: "Some(HealthcheckFailures(5))",
1643                forced_netstack2: true,
1644                current_epoch: CURRENT_EPOCH,
1645                rollback_history: format!(
1646                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 1 }})"
1647                ),
1648            }
1649        );
1650
1651        m.persisted.rollback =
1652            Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS + 1));
1653        nodes.update(&m);
1654        assert_data_tree!(inspector,
1655            root: {
1656                current_boot: 2u64,
1657                user_setting: 0u64,
1658                automated_setting: 3u64,
1659                rollback_state: "Some(HealthcheckFailures(6))",
1660                forced_netstack2: true,
1661                current_epoch: CURRENT_EPOCH,
1662                rollback_history: format!(
1663                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 1 }})"
1664                ),
1665            }
1666        );
1667        m.persisted.rollback = Some(rollback::Persisted::Success);
1668        nodes.update(&m);
1669        assert_data_tree!(inspector,
1670            root: {
1671                current_boot: 2u64,
1672                user_setting: 0u64,
1673                automated_setting: 3u64,
1674                rollback_state: "Some(Success)",
1675                forced_netstack2: true,
1676                current_epoch: CURRENT_EPOCH,
1677                rollback_history: format!(
1678                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 1 }})"
1679                ),
1680            }
1681        );
1682    }
1683
1684    #[test_case(CURRENT_EPOCH, 2, true; "ns2_with_failures_in_current_epoch")]
1685    #[test_case(CURRENT_EPOCH - 1, 3, false; "ns3_with_failures_in_old_epoch")]
1686    #[fuchsia::test]
1687    async fn inspect_rollback_history(epoch: u32, current_boot: u64, forced_netstack2: bool) {
1688        let m = Migration::new(
1689            InMemory::with_persisted(Persisted {
1690                user: None,
1691                automated: Some(NetstackVersion::Netstack3),
1692                rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
1693                rollback_history: Some(RollbackHistory {
1694                    epoch: epoch,
1695                    count: MAX_ROLLBACKS_PER_EPOCH,
1696                }),
1697            }),
1698            NoCollaborativeReboot,
1699            NoCrashReports,
1700        );
1701        let inspector = fuchsia_inspect::component::inspector();
1702        let _nodes = InspectNodes::new(inspector, &m);
1703        assert_data_tree!(inspector,
1704            root: {
1705                current_boot: current_boot,
1706                user_setting: 0u64,
1707                automated_setting: 3u64,
1708                rollback_state: "Some(HealthcheckFailures(0))",
1709                forced_netstack2: forced_netstack2,
1710                current_epoch: CURRENT_EPOCH,
1711                rollback_history: format!(
1712                    "Some(RollbackHistory {{ epoch: {}, count: {} }})",
1713                    CURRENT_EPOCH,
1714                    // Check if the history should have been reset.
1715                    if epoch == CURRENT_EPOCH { MAX_ROLLBACKS_PER_EPOCH } else {0},
1716                ),
1717            }
1718        );
1719    }
1720
1721    #[test_case::test_matrix(
1722    [
1723        (RollbackNetstackVersion::Netstack2, metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack2),
1724        (RollbackNetstackVersion::Netstack3, metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack3),
1725    ],
1726    [
1727        (None, metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::NoSelection),
1728        (Some(NetstackVersion::Netstack2), metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack2),
1729        (Some(NetstackVersion::Netstack3), metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack3),
1730    ],
1731    [
1732        (None, metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::NoSelection),
1733        (Some(NetstackVersion::Netstack2), metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack2),
1734        (Some(NetstackVersion::Netstack3), metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack3),
1735    ]
1736    )]
1737    #[fuchsia::test]
1738    async fn metrics_logger(
1739        current_boot: (
1740            RollbackNetstackVersion,
1741            metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion,
1742        ),
1743        user: (
1744            Option<NetstackVersion>,
1745            metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion,
1746        ),
1747        automated: (
1748            Option<NetstackVersion>,
1749            metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion,
1750        ),
1751    ) {
1752        let (current_boot, current_boot_expect) = current_boot;
1753        let (user, user_expect) = user;
1754        let (automated, automated_expect) = automated;
1755        let mut m = Migration::new(
1756            InMemory::with_persisted(Persisted {
1757                user,
1758                automated,
1759                rollback: None,
1760                rollback_history: None,
1761            }),
1762            NoCollaborativeReboot,
1763            NoCrashReports,
1764        );
1765        m.current_boot = current_boot;
1766        let (logger, mut logger_stream) =
1767            fidl::endpoints::create_proxy_and_stream::<fmetrics::MetricEventLoggerMarker>();
1768
1769        let metrics_logger = MetricsLogger { logger: Some(logger) };
1770
1771        let ((), ()) = futures::future::join(metrics_logger.log_metrics(&m), async {
1772            let expect = [
1773                (
1774                    metrics_registry::STACK_MIGRATION_CURRENT_BOOT_METRIC_ID,
1775                    Some(current_boot_expect.as_event_code()),
1776                ),
1777                (
1778                    metrics_registry::STACK_MIGRATION_USER_SETTING_METRIC_ID,
1779                    Some(user_expect.as_event_code()),
1780                ),
1781                (
1782                    metrics_registry::STACK_MIGRATION_AUTOMATED_SETTING_METRIC_ID,
1783                    Some(automated_expect.as_event_code()),
1784                ),
1785                (
1786                    metrics_registry::STACK_MIGRATION_STATE_METRIC_ID,
1787                    // Note: The rollback state doesn't have a flat expectation.
1788                    // Don't assert on its value here, and instead we directly
1789                    // test it in a separate test case.
1790                    None,
1791                ),
1792            ];
1793            for (id, ev) in expect {
1794                let (metric, occurences, codes, responder) = logger_stream
1795                    .next()
1796                    .await
1797                    .unwrap()
1798                    .unwrap()
1799                    .into_log_occurrence()
1800                    .expect("bad request");
1801                assert_eq!(metric, id);
1802                assert_eq!(occurences, 1);
1803                if let Some(ev) = ev {
1804                    assert_eq!(codes, vec![ev]);
1805                }
1806                responder.send(Ok(())).unwrap();
1807            }
1808        })
1809        .await;
1810    }
1811
1812    #[test_case(
1813        RollbackNetstackVersion::Netstack2, None, None =>
1814        metrics_registry::StackMigrationStateMetricDimensionMigrationState::NotStarted;
1815        "not_started_none"
1816    )]
1817    #[test_case(
1818        RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack2), None =>
1819        metrics_registry::StackMigrationStateMetricDimensionMigrationState::NotStarted;
1820        "not_started_ns2"
1821    )]
1822    #[test_case(
1823        RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack3), None =>
1824        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Scheduled;
1825        "scheduled"
1826    )]
1827    #[test_case(
1828        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3), None =>
1829        metrics_registry::StackMigrationStateMetricDimensionMigrationState::InProgress;
1830        "in_progress_none"
1831    )]
1832    #[test_case(
1833        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1834        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS - 1)) =>
1835        metrics_registry::StackMigrationStateMetricDimensionMigrationState::InProgress;
1836        "in_progress_some"
1837    )]
1838    #[test_case(
1839        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1840        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS)) =>
1841        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Failed;
1842        "failed_exact"
1843    )]
1844    #[test_case(
1845        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1846        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS + 1)) =>
1847        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Failed;
1848        "failed_more"
1849    )]
1850    #[test_case(
1851        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1852        Some(rollback::Persisted::Success) =>
1853        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Success;
1854        "success"
1855    )]
1856    #[test_case(
1857        RollbackNetstackVersion::Netstack3, None, None =>
1858        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Canceled;
1859        "canceled_none"
1860    )]
1861    #[test_case(
1862        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack2), None =>
1863        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Canceled;
1864        "canceled_ns2"
1865    )]
1866    #[test_case(
1867        RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack3), None =>
1868        metrics_registry::StackMigrationStateMetricDimensionMigrationState::RolledBack;
1869        "rolled_back"
1870    )]
1871    #[test_case(
1872        RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack2), None =>
1873        metrics_registry::StackMigrationStateMetricDimensionMigrationState::NotStarted;
1874        "rolled_back_becomes_not_started"
1875    )]
1876    #[test_case(
1877        RollbackNetstackVersion::ForceNetstack2Indefinitely, None, None =>
1878        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Abandoned;
1879        "abandoned"
1880    )]
1881    #[fuchsia::test]
1882    fn test_state_metric(
1883        current_boot: RollbackNetstackVersion,
1884        automated: Option<NetstackVersion>,
1885        rollback: Option<rollback::Persisted>,
1886    ) -> metrics_registry::StackMigrationStateMetricDimensionMigrationState {
1887        let mut migration = Migration::new(
1888            InMemory::with_persisted(Persisted {
1889                user: None,
1890                automated,
1891                rollback,
1892                rollback_history: None,
1893            }),
1894            NoCollaborativeReboot,
1895            NoCrashReports,
1896        );
1897        migration.current_boot = current_boot;
1898        compute_state_metric(&migration)
1899    }
1900
1901    /// An in-memory mock-persistence that triggers an event once the target
1902    /// state has been persisted.
1903    #[derive(Clone)]
1904    struct AwaitPersisted {
1905        file: Rc<RefCell<Option<Vec<u8>>>>,
1906        target: Vec<u8>,
1907        event: Event,
1908    }
1909
1910    impl AwaitPersisted {
1911        fn with_persisted(start: Persisted, target: &Persisted) -> (Self, EventWait) {
1912            let event = Event::new();
1913            let wait = event.wait();
1914            let target_bytes = serde_json::to_vec(target).expect("failed to serialize target");
1915            let mut s = Self { file: Default::default(), target: target_bytes, event };
1916            start.save(s.open_writer().unwrap());
1917            (s, wait)
1918        }
1919    }
1920
1921    impl PersistenceProvider for AwaitPersisted {
1922        type Writer = Self;
1923        type Reader = std::io::Cursor<Vec<u8>>;
1924
1925        fn open_writer(&mut self) -> std::io::Result<Self::Writer> {
1926            *self.file.borrow_mut() = Some(Vec::new());
1927            Ok(self.clone())
1928        }
1929
1930        fn open_reader(&self) -> std::io::Result<Self::Reader> {
1931            self.file
1932                .borrow()
1933                .clone()
1934                .map(std::io::Cursor::new)
1935                .ok_or_else(|| std::io::ErrorKind::NotFound.into())
1936        }
1937    }
1938
1939    impl std::io::Write for AwaitPersisted {
1940        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
1941            let r = self.file.borrow_mut().as_mut().expect("no file open").write(buf);
1942            if self.file.borrow().as_ref().expect("no_file_open") == &self.target {
1943                let _: bool = self.event.signal();
1944            }
1945            r
1946        }
1947
1948        fn flush(&mut self) -> std::io::Result<()> {
1949            Ok(())
1950        }
1951    }
1952
1953    #[fuchsia::test]
1954    async fn migrate_to_ns3_success() {
1955        let start = Persisted {
1956            user: None,
1957            automated: Some(NetstackVersion::Netstack3),
1958            rollback: None,
1959            rollback_history: None,
1960        };
1961        let target = Persisted {
1962            user: None,
1963            automated: Some(NetstackVersion::Netstack3),
1964            rollback: Some(rollback::Persisted::Success),
1965            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }),
1966        };
1967
1968        let (persistence, mut wait) = AwaitPersisted::with_persisted(start, &target);
1969        let mut migration = Migration::new(persistence, NoCollaborativeReboot, NoCrashReports);
1970        // No service requests.
1971        let service_request_stream = futures::stream::pending();
1972        // A health check that always succeeds.
1973        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
1974            Ok(fnet_http::Response { error: None, status_code: Some(204), ..Default::default() })
1975        });
1976        let healthcheck_tick = futures::stream::once(futures::future::ready(()));
1977
1978        {
1979            let main_fut = main_inner(
1980                &mut migration,
1981                service_request_stream,
1982                mock_healthcheck,
1983                healthcheck_tick,
1984            )
1985            .fuse();
1986            futures::pin_mut!(main_fut);
1987            futures::select!(
1988                () = main_fut => unreachable!("main fut should never exit"),
1989                () = wait => {}
1990            );
1991        }
1992
1993        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::Success));
1994    }
1995
1996    #[test_case[0; "first_failure"]]
1997    #[test_case[MAX_ROLLBACKS_PER_EPOCH-1; "last_failure"]]
1998    #[fuchsia::test]
1999    async fn migrate_to_ns3_fails(rollbacks_in_history: u8) {
2000        let start = Persisted {
2001            user: None,
2002            automated: Some(NetstackVersion::Netstack3),
2003            rollback: None,
2004            rollback_history: Some(RollbackHistory {
2005                epoch: CURRENT_EPOCH,
2006                count: rollbacks_in_history,
2007            }),
2008        };
2009        let target = Persisted {
2010            user: None,
2011            automated: Some(NetstackVersion::Netstack3),
2012            rollback: Some(rollback::Persisted::HealthcheckFailures(
2013                rollback::MAX_FAILED_HEALTHCHECKS,
2014            )),
2015            rollback_history: Some(RollbackHistory {
2016                epoch: CURRENT_EPOCH,
2017                count: rollbacks_in_history,
2018            }),
2019        };
2020
2021        let (persistence, mut wait) = AwaitPersisted::with_persisted(start, &target);
2022        let mut migration = Migration::new(
2023            persistence,
2024            FakeCollaborativeReboot::default(),
2025            FakeCrashReporter::default(),
2026        );
2027        // No service requests.
2028        let service_request_stream = futures::stream::pending();
2029        // A health check that always fails.
2030        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2031            Ok(fnet_http::Response { error: None, status_code: Some(500), ..Default::default() })
2032        });
2033        // Use a non-zero interval so that the Healthcheck code doesn't hog the scheduler.
2034        let healthcheck_tick = fuchsia_async::Interval::new(Duration::from_millis(1).into());
2035
2036        {
2037            let main_fut = main_inner(
2038                &mut migration,
2039                service_request_stream,
2040                mock_healthcheck,
2041                healthcheck_tick,
2042            )
2043            .fuse();
2044            futures::pin_mut!(main_fut);
2045            futures::select!(
2046                () = main_fut => unreachable!("main fut should never exit"),
2047                () = wait => {}
2048            );
2049        }
2050
2051        let failure_count = assert_matches!(
2052            migration.persisted.rollback,
2053            Some(rollback::Persisted::HealthcheckFailures(f)) => f
2054        );
2055        assert!(
2056            failure_count >= rollback::MAX_FAILED_HEALTHCHECKS,
2057            "failure_count={failure_count}"
2058        );
2059
2060        // Verify a failed migration schedules a collaborative reboot.
2061        let cr_req = &migration.collaborative_reboot.scheduler.req;
2062        assert_eq!(Ok(false), cr_req.as_ref().expect("there should be a request").is_closed());
2063
2064        // Verify crash reports were filed for each failed healthcheck above
2065        // the threshold.
2066        let expected_num_crash_reports =
2067            failure_count - HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS + 1;
2068        assert_eq!(
2069            migration.crash_reporter.reports,
2070            vec![CrashReportReason::FailedHealthcheck; expected_num_crash_reports]
2071        )
2072    }
2073
2074    // Regression test for https://fxbug.dev/395913604.
2075    //
2076    // The original bug would reset the number of failed healthchecks in
2077    // persistence from `rollback::MAX_FAILED_HEALTHCHECKS` to 0, if an inbound
2078    // service request was received.
2079    //
2080    // Verify this is no longer the case by triggering a failed healthcheck,
2081    // pushing the total to `rollback::MAX_FAILED_HEALTHCHECKS`, then sending
2082    // a `fuchsia.net.migration/State.GetNetstackVersion` request.
2083    #[fuchsia::test]
2084    async fn migrate_to_ns3_rollback_regression_test() {
2085        let start = Persisted {
2086            user: None,
2087            automated: Some(NetstackVersion::Netstack3),
2088            rollback: Some(rollback::Persisted::HealthcheckFailures(
2089                rollback::MAX_FAILED_HEALTHCHECKS - 1,
2090            )),
2091            rollback_history: None,
2092        };
2093        let target = Persisted {
2094            user: None,
2095            automated: Some(NetstackVersion::Netstack3),
2096            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2097            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }),
2098        };
2099
2100        let (persistence, wait) = AwaitPersisted::with_persisted(start, &target);
2101        let mut migration = Migration::new(
2102            persistence,
2103            FakeCollaborativeReboot::default(),
2104            FakeCrashReporter::default(),
2105        );
2106        // A health check that always fails.
2107        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2108            Ok(fnet_http::Response { error: None, status_code: Some(500), ..Default::default() })
2109        });
2110        let healthcheck_tick = futures::stream::once(futures::future::ready(()));
2111        // Send "get" requests, to trigger the bug.
2112        let (client, server) =
2113            fidl::endpoints::create_proxy_and_stream::<fnet_migration::StateMarker>();
2114        let service_request_stream = server.map(|r| r.map(ServiceRequest::State));
2115        let client_fut = async move {
2116            // Send multiple get requests, to ensure that at least one would occur after the failed
2117            // healthcheck.
2118            let mut stream = fuchsia_async::Interval::new(Duration::from_millis(1).into());
2119            while let Some(()) = stream.next().await {
2120                let _ =
2121                    client.get_netstack_version().await.expect("failed to get netstack version");
2122            }
2123        }
2124        .fuse();
2125
2126        // If wait were to fire, the bug has occurred. Instead expect a timeout.
2127        // Use 1 second to keep the test runtime short; If CQ has a hiccup and
2128        // pauses execution, we'd see a false negative, which isn't a big deal.
2129        let wait_fut = wait
2130            .map(|()| panic!("unexpectedly observed the persisted healthcheck failures reset to 0"))
2131            .on_timeout(Duration::from_secs(1), || ())
2132            .fuse();
2133
2134        {
2135            let main_fut = main_inner(
2136                &mut migration,
2137                service_request_stream,
2138                mock_healthcheck,
2139                healthcheck_tick,
2140            )
2141            .fuse();
2142            futures::pin_mut!(main_fut);
2143            futures::pin_mut!(client_fut);
2144            futures::pin_mut!(wait_fut);
2145            futures::select!(
2146                () = main_fut => unreachable!("main fut should never exit"),
2147                () = client_fut => unreachable!("client fut should never exit"),
2148                () = wait_fut => {}
2149            );
2150        }
2151    }
2152
2153    #[fuchsia::test]
2154    async fn startup_after_first_rollback() {
2155        let start = Persisted {
2156            user: None,
2157            automated: Some(NetstackVersion::Netstack3),
2158            rollback: Some(rollback::Persisted::HealthcheckFailures(
2159                rollback::MAX_FAILED_HEALTHCHECKS,
2160            )),
2161            rollback_history: None,
2162        };
2163        let target = Persisted {
2164            user: None,
2165            automated: Some(NetstackVersion::Netstack3),
2166            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2167            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 1 }),
2168        };
2169
2170        let (persistence, persistence_wait) = AwaitPersisted::with_persisted(start, &target);
2171        let (crash_reporter, crash_reporter_wait) =
2172            AwaitCrashReports::new(vec![CrashReportReason::RolledBack]);
2173        let mut migration =
2174            Migration::new(persistence, FakeCollaborativeReboot::default(), crash_reporter);
2175        // No service requests.
2176        let service_request_stream = futures::stream::pending();
2177        // No Healthchecks.
2178        let healthcheck_tick = futures::stream::pending();
2179        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2180            panic!("This test isn't exercising healthchecks");
2181        });
2182        {
2183            let main_fut = main_inner(
2184                &mut migration,
2185                service_request_stream,
2186                mock_healthcheck,
2187                healthcheck_tick,
2188            )
2189            .fuse();
2190            futures::pin_mut!(main_fut);
2191            let mut wait = futures::future::join(persistence_wait, crash_reporter_wait);
2192            futures::select!(
2193                () = main_fut => unreachable!("main fut should never exit"),
2194                ((), ()) = wait => {}
2195            );
2196        }
2197
2198        // The number of Healthcheck Failures should have been reset in
2199        // preparation for the next migration attempt.
2200        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(0)));
2201        // The rollback history should have been updated to reflect this
2202        // failure.
2203        assert_eq!(
2204            migration.persisted.rollback_history,
2205            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 1 })
2206        );
2207        // A Collaborative Reboot request should be scheduled, so that we can
2208        // attempt the migration again.
2209        let cr_req = &migration.collaborative_reboot.scheduler.req;
2210        assert_eq!(Ok(false), cr_req.as_ref().expect("there should be a request").is_closed());
2211        // The current_boot should reflect that we've rolled back to NS2
2212        // temporarily.
2213        assert_eq!(migration.current_boot, RollbackNetstackVersion::ForceNetstack2);
2214        // A rollback crash report should have been filed.
2215        assert_eq!(migration.crash_reporter.reports, vec![CrashReportReason::RolledBack],);
2216    }
2217
2218    #[fuchsia::test]
2219    async fn startup_after_final_rollback() {
2220        let start = Persisted {
2221            user: None,
2222            automated: Some(NetstackVersion::Netstack3),
2223            rollback: Some(rollback::Persisted::HealthcheckFailures(
2224                rollback::MAX_FAILED_HEALTHCHECKS,
2225            )),
2226            rollback_history: Some(RollbackHistory {
2227                epoch: CURRENT_EPOCH,
2228                count: MAX_ROLLBACKS_PER_EPOCH - 1,
2229            }),
2230        };
2231        let target = Persisted {
2232            user: None,
2233            automated: Some(NetstackVersion::Netstack3),
2234            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2235            rollback_history: Some(RollbackHistory {
2236                epoch: CURRENT_EPOCH,
2237                count: MAX_ROLLBACKS_PER_EPOCH,
2238            }),
2239        };
2240
2241        let (persistence, persistence_wait) = AwaitPersisted::with_persisted(start, &target);
2242        let (crash_reporter, crash_reporter_wait) =
2243            AwaitCrashReports::new(vec![CrashReportReason::RolledBack]);
2244        let mut migration =
2245            Migration::new(persistence, FakeCollaborativeReboot::default(), crash_reporter);
2246        // No service requests.
2247        let service_request_stream = futures::stream::pending();
2248        // No Healthchecks.
2249        let healthcheck_tick = futures::stream::pending();
2250        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2251            panic!("This test isn't exercising healthchecks");
2252        });
2253        {
2254            let main_fut = main_inner(
2255                &mut migration,
2256                service_request_stream,
2257                mock_healthcheck,
2258                healthcheck_tick,
2259            )
2260            .fuse();
2261            futures::pin_mut!(main_fut);
2262            let mut wait = futures::future::join(persistence_wait, crash_reporter_wait);
2263            futures::select!(
2264                () = main_fut => unreachable!("main fut should never exit"),
2265                ((), ()) = wait => {}
2266            );
2267        }
2268
2269        // The number of Healthcheck Failures should have been reset in
2270        // preparation for the next migration attempt (in a later epoch).
2271        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(0)));
2272        // The rollback history should have been updated to reflect this
2273        // failure.
2274        assert_eq!(
2275            migration.persisted.rollback_history,
2276            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH })
2277        );
2278        // A collaborative Reboot request should NOT be scheduled. We're not
2279        // going to re-attempt the migration until the epoch advances.
2280        assert_eq!(migration.collaborative_reboot.scheduler.req, None);
2281        // The current_boot should reflect that we've rolled back to NS2
2282        // indefinitely.
2283        assert_eq!(migration.current_boot, RollbackNetstackVersion::ForceNetstack2Indefinitely);
2284        // A rollback crash report should have been filed.
2285        assert_eq!(migration.crash_reporter.reports, vec![CrashReportReason::RolledBack]);
2286    }
2287
2288    // This test simulates the device starting up while we were already
2289    // in the `ForceNetstack2Indefinitely` state. E.g. the current boot isn't
2290    // a rollback, rather it's continuing to use Netstack2, as established in
2291    // a previous boot.
2292    #[fuchsia::test]
2293    async fn startup_already_with_max_rollbacks_per_epoch() {
2294        let persistence = InMemory::with_persisted(Persisted {
2295            user: None,
2296            automated: Some(NetstackVersion::Netstack3),
2297            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2298            rollback_history: Some(RollbackHistory {
2299                epoch: CURRENT_EPOCH,
2300                count: MAX_ROLLBACKS_PER_EPOCH,
2301            }),
2302        });
2303        // No crash report should get filed because this boot isn't a rollback.
2304        let crash_reporter = NoCrashReports;
2305        let mut migration =
2306            Migration::new(persistence, FakeCollaborativeReboot::default(), crash_reporter);
2307        // No service requests.
2308        let service_request_stream = futures::stream::pending();
2309        // No Healthchecks.
2310        let healthcheck_tick = futures::stream::pending();
2311        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2312            panic!("This test isn't exercising healthchecks");
2313        });
2314
2315        // Drive the main future for a fixed timeout. It will never exit, but
2316        // we want to give the migration machinery enough time to apply any
2317        // changes it wants to make (it shouldn't make any).
2318        main_inner(&mut migration, service_request_stream, mock_healthcheck, healthcheck_tick)
2319            .map(Err)
2320            .on_timeout(Duration::from_secs(1), || Ok(()))
2321            .await
2322            .expect("main fut should never exit");
2323
2324        // None of the persisted state should have changed.
2325        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(0)));
2326        assert_eq!(
2327            migration.persisted.rollback_history,
2328            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH })
2329        );
2330        // A collaborative Reboot request should NOT be scheduled.
2331        assert_eq!(migration.collaborative_reboot.scheduler.req, None);
2332        // The current_boot should reflect that we've rolled back to NS2
2333        // indefinitely.
2334        assert_eq!(migration.current_boot, RollbackNetstackVersion::ForceNetstack2Indefinitely);
2335    }
2336
2337    #[fuchsia::test]
2338    async fn startup_with_new_epoch() {
2339        let start = Persisted {
2340            user: None,
2341            automated: Some(NetstackVersion::Netstack3),
2342            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2343            rollback_history: Some(RollbackHistory {
2344                epoch: CURRENT_EPOCH - 1,
2345                count: MAX_ROLLBACKS_PER_EPOCH,
2346            }),
2347        };
2348        let target = Persisted {
2349            user: None,
2350            automated: Some(NetstackVersion::Netstack3),
2351            rollback: Some(rollback::Persisted::HealthcheckFailures(1)),
2352            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }),
2353        };
2354
2355        let (persistence, mut wait) = AwaitPersisted::with_persisted(start, &target);
2356        let mut migration =
2357            Migration::new(persistence, FakeCollaborativeReboot::default(), NoCrashReports);
2358        // No service requests.
2359        let service_request_stream = futures::stream::pending();
2360        // No Healthchecks.
2361        let healthcheck_tick = futures::stream::pending();
2362        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2363            panic!("This test isn't exercising healthchecks");
2364        });
2365        {
2366            let main_fut = main_inner(
2367                &mut migration,
2368                service_request_stream,
2369                mock_healthcheck,
2370                healthcheck_tick,
2371            )
2372            .fuse();
2373            futures::pin_mut!(main_fut);
2374            futures::select!(
2375                () = main_fut => unreachable!("main fut should never exit"),
2376                () = wait => {}
2377            );
2378        }
2379
2380        // The number of Healthcheck Failures should have been incremented,
2381        // indicating the rollback mechanism is trying to migrate to NS3.
2382        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(1)));
2383        // The rollback history should have been updated to reflect the new
2384        // epoch.
2385        assert_eq!(
2386            migration.persisted.rollback_history,
2387            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 })
2388        );
2389        // A collaborative Reboot request should NOT be scheduled. We're already
2390        // running Nestack3!
2391        assert_eq!(migration.collaborative_reboot.scheduler.req, None);
2392        // The current_boot should reflect that we're retrying migrations to NS3.
2393        assert_eq!(migration.current_boot, RollbackNetstackVersion::Netstack3);
2394    }
2395
2396    #[fuchsia::test]
2397    async fn noop_rollback_state_update() {
2398        let rollback_state = rollback::Persisted::HealthcheckFailures(
2399            HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS,
2400        );
2401        let mut m = Migration::new(
2402            InMemory::with_persisted(Persisted {
2403                user: None,
2404                automated: Some(NetstackVersion::Netstack3),
2405                rollback: Some(rollback_state),
2406                rollback_history: None,
2407            }),
2408            // Expect not to see a Collaborative Reboot or a Crash Report.
2409            NoCollaborativeReboot,
2410            NoCrashReports,
2411        );
2412        m.update_rollback_state(rollback_state).await;
2413    }
2414}