Skip to main content

stack_migration/
main.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5mod rollback;
6
7use std::pin::{Pin, pin};
8use std::time::Duration;
9
10use cobalt_client::traits::AsEventCode as _;
11use fidl_fuchsia_metrics as fmetrics;
12use fidl_fuchsia_net_stackmigrationdeprecated as fnet_migration;
13use fidl_fuchsia_power_internal as fpower;
14use fuchsia_async::Task;
15use fuchsia_component::server::{ServiceFs, ServiceFsDir};
16use fuchsia_inspect::Property as _;
17use futures::channel::mpsc;
18use futures::{FutureExt as _, Stream, StreamExt as _};
19use log::{error, info, warn};
20use networking_metrics_registry::networking_metrics_registry as metrics_registry;
21
22const DEFAULT_NETSTACK: NetstackVersion = NetstackVersion::Netstack3;
23
/// Upper bound on failed Netstack3 migration attempts within the
/// `CURRENT_EPOCH`; once reached, the device stops attempting to migrate for
/// the remainder of that epoch.
///
/// Given the timing parameters in [rollback], one full rollback cycle lasts
/// roughly 2 days, so a limit of 180 permits migration attempts for about a
/// year. Pragmatically, this effectively disables the "abandon migration"
/// logic.
const MAX_ROLLBACKS_PER_EPOCH: u8 = 180;
32
/// The current "epoch": an arbitrary interval that bounds how many times a
/// device will attempt to migrate to Netstack3. When the epoch is increased,
/// rollbacks recorded under an older epoch are disregarded and the device
/// resumes migration attempts.
///
/// Pragmatically, the intent is to bump this number whenever a Fuchsia Release
/// is cut that contains a known fix to a Netstack3 bug.
const CURRENT_EPOCH: u32 = 3;
40
41/// The number of failed healthchecks at which we begin generating crash
42/// reports.
43///
44/// By setting this to one less than the `rollback::MAX_FAILED_HEALTHCHECKS`, we
45/// ensure each rollback event will have two "failed healthcheck" reports from
46/// the previous boot.
47const HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS: usize =
48    crate::rollback::MAX_FAILED_HEALTHCHECKS - 1;
49
50#[derive(Debug, Clone, Copy, serde::Deserialize, serde::Serialize, Eq, PartialEq)]
51enum NetstackVersion {
52    Netstack2,
53    Netstack3,
54}
55
56impl NetstackVersion {
57    fn inspect_uint_value(&self) -> u64 {
58        match self {
59            Self::Netstack2 => 2,
60            Self::Netstack3 => 3,
61        }
62    }
63
64    fn optional_inspect_uint_value(o: &Option<Self>) -> u64 {
65        o.as_ref().map(Self::inspect_uint_value).unwrap_or(0)
66    }
67}
68
69impl From<fnet_migration::NetstackVersion> for NetstackVersion {
70    fn from(value: fnet_migration::NetstackVersion) -> Self {
71        match value {
72            fnet_migration::NetstackVersion::Netstack2 => NetstackVersion::Netstack2,
73            fnet_migration::NetstackVersion::Netstack3 => NetstackVersion::Netstack3,
74        }
75    }
76}
77
78impl From<NetstackVersion> for fnet_migration::NetstackVersion {
79    fn from(value: NetstackVersion) -> Self {
80        match value {
81            NetstackVersion::Netstack2 => fnet_migration::NetstackVersion::Netstack2,
82            NetstackVersion::Netstack3 => fnet_migration::NetstackVersion::Netstack3,
83        }
84    }
85}
86
87impl From<NetstackVersion> for Box<fnet_migration::VersionSetting> {
88    fn from(value: NetstackVersion) -> Self {
89        Box::new(fnet_migration::VersionSetting { version: value.into() })
90    }
91}
92
/// The netstack version selected for a boot, including whether Netstack2 was
/// forced by the rollback mechanism.
#[derive(PartialEq, Debug)]
enum RollbackNetstackVersion {
    /// Netstack2 was selected without any rollback override.
    Netstack2,
    /// The automated setting requested Netstack3, but the persisted state
    /// indicates that the previous boot failed to migrate to Netstack3 (had
    /// too many health check failures). Forcibly use Netstack2 for this boot.
    ForceNetstack2,
    /// The automated setting requested Netstack3, but we've already failed too
    /// many migration attempts in the `CURRENT_EPOCH`. Forcibly use Netstack2
    /// until the next epoch.
    ForceNetstack2Indefinitely,
    /// Netstack3 was selected.
    Netstack3,
}
106
107impl RollbackNetstackVersion {
108    // Convert into a `NetstackVersion`, while honoring the forced setting.
109    fn version(&self) -> NetstackVersion {
110        match self {
111            Self::Netstack2 | Self::ForceNetstack2 | Self::ForceNetstack2Indefinitely => {
112                NetstackVersion::Netstack2
113            }
114            Self::Netstack3 => NetstackVersion::Netstack3,
115        }
116    }
117
118    // Convert into a `NetstackVersion`, while ignoring the forced setting.
119    fn version_ignoring_force(&self) -> NetstackVersion {
120        match self {
121            Self::Netstack2 => NetstackVersion::Netstack2,
122            Self::Netstack3 | Self::ForceNetstack2 | Self::ForceNetstack2Indefinitely => {
123                NetstackVersion::Netstack3
124            }
125        }
126    }
127}
128
129impl From<NetstackVersion> for RollbackNetstackVersion {
130    fn from(version: NetstackVersion) -> Self {
131        match version {
132            NetstackVersion::Netstack2 => RollbackNetstackVersion::Netstack2,
133            NetstackVersion::Netstack3 => RollbackNetstackVersion::Netstack3,
134        }
135    }
136}
137
138#[derive(Default, Debug, serde::Deserialize, serde::Serialize)]
139#[cfg_attr(test, derive(Eq, PartialEq))]
140struct Persisted {
141    automated: Option<NetstackVersion>,
142    user: Option<NetstackVersion>,
143    rollback: Option<rollback::Persisted>,
144    rollback_history: Option<RollbackHistory>,
145}
146
147impl Persisted {
148    fn load<R: std::io::Read>(r: R) -> Self {
149        serde_json::from_reader(std::io::BufReader::new(r)).unwrap_or_else(|e| {
150            error!("error loading persisted config {e:?}, using defaults");
151            Persisted::default()
152        })
153    }
154
155    fn save<W: std::io::Write>(&self, w: W) {
156        serde_json::to_writer(w, self).unwrap_or_else(|e: serde_json::Error| {
157            error!("error persisting configuration {self:?}: {e:?}")
158        })
159    }
160
161    // Determine the desired NetstackVersion based on the persisted values
162    fn desired_netstack_version(&self) -> RollbackNetstackVersion {
163        match self {
164            // Always prefer the user setting, if present.
165            Persisted { user: Some(user), automated: _, rollback: _, rollback_history: _ } => {
166                (*user).into()
167            }
168            // If the automated setting is Netstack2, honor it unconditionally.
169            Persisted {
170                user: None,
171                automated: Some(NetstackVersion::Netstack2),
172                rollback: _,
173                rollback_history: _,
174            } => RollbackNetstackVersion::Netstack2,
175            // If the automated setting is Netstack3, we need to first check the
176            // rollback state.
177            Persisted {
178                user: None,
179                automated: Some(NetstackVersion::Netstack3),
180                rollback,
181                rollback_history,
182            } => {
183                let rollback_count = match rollback_history {
184                    None => 0,
185                    Some(RollbackHistory { epoch, count }) => {
186                        if *epoch == CURRENT_EPOCH {
187                            *count
188                        } else {
189                            // Ignore failed rollbacks in an different epoch.
190                            0
191                        }
192                    }
193                };
194                let healthcheck_failure_count = match rollback {
195                    None => 0,
196                    Some(rollback::Persisted::Success) => 0,
197                    Some(rollback::Persisted::HealthcheckFailures(count)) => *count,
198                };
199
200                if rollback_count >= MAX_ROLLBACKS_PER_EPOCH {
201                    // If we've failed all of our allotted migration attempts
202                    // abandon further migration attempts in this epoch.
203                    RollbackNetstackVersion::ForceNetstack2Indefinitely
204                } else if healthcheck_failure_count >= rollback::MAX_FAILED_HEALTHCHECKS {
205                    // If we've failed all the healthchecks in the current
206                    // migration attempt, temporarily rollback to Netstack2.
207                    RollbackNetstackVersion::ForceNetstack2
208                } else {
209                    // Otherwise, proceed as normal with Netstack3.
210                    RollbackNetstackVersion::Netstack3
211                }
212            }
213            // Use the default version if nothing is set.
214            Persisted { user: None, automated: None, rollback: _, rollback_history: _ } => {
215                DEFAULT_NETSTACK.into()
216            }
217        }
218    }
219}
220
221/// A history of rollbacks that have occurred on the device.
222#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
223pub(crate) struct RollbackHistory {
224    /// The epoch during which the rollback occurred.
225    epoch: u32,
226    /// The number of rollbacks that took place.
227    count: u8,
228}
229
230impl Default for RollbackHistory {
231    fn default() -> Self {
232        RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }
233    }
234}
235
236enum ServiceRequest {
237    Control(fnet_migration::ControlRequest),
238    State(fnet_migration::StateRequest),
239}
240
241struct Migration<P, CR, R> {
242    current_boot: RollbackNetstackVersion,
243    persisted: Persisted,
244    persistence: P,
245    collaborative_reboot: CollaborativeReboot<CR>,
246    crash_reporter: R,
247}
248
/// A source of readers and writers for the persisted migration state.
trait PersistenceProvider {
    /// The type that persisted state is read from.
    type Reader: std::io::Read;
    /// The type that persisted state is written to.
    type Writer: std::io::Write;

    /// Opens a reader over the persisted state.
    fn open_reader(&self) -> std::io::Result<Self::Reader>;
    /// Opens a writer with which to persist state.
    fn open_writer(&mut self) -> std::io::Result<Self::Writer>;
}
256
257fn try_persist<P: PersistenceProvider>(provider: &mut P, data: &Persisted) {
258    let w = match provider.open_writer() {
259        Ok(w) => w,
260        Err(e) => {
261            error!("failed to open writer to persist settings: {e:?}");
262            return;
263        }
264    };
265    data.save(w);
266}
267
/// A `PersistenceProvider` backed by the file at `PERSISTED_FILE_PATH`.
struct DataPersistenceProvider {}
269
/// Path of the file in which the migration settings are persisted.
// Consts are implicitly `'static`, so the lifetime does not need to be
// spelled out.
const PERSISTED_FILE_PATH: &str = "/data/config.json";
271
272impl PersistenceProvider for DataPersistenceProvider {
273    type Writer = std::fs::File;
274    type Reader = std::fs::File;
275
276    fn open_writer(&mut self) -> std::io::Result<Self::Writer> {
277        std::fs::File::create(PERSISTED_FILE_PATH)
278    }
279
280    fn open_reader(&self) -> std::io::Result<Self::Reader> {
281        std::fs::File::open(PERSISTED_FILE_PATH)
282    }
283}
284
285struct CollaborativeReboot<CR> {
286    scheduler: CR,
287    /// `Some(<cancellation_token>)` if there's an outstanding collaborative
288    /// reboot scheduled.
289    scheduled_req: Option<zx::EventPair>,
290}
291
292impl<CR: CollaborativeRebootScheduler> CollaborativeReboot<CR> {
293    /// Schedules a collaborative reboot.
294    ///
295    /// No-Op if there's already a reboot scheduled.
296    async fn schedule(&mut self) {
297        let Self { scheduler, scheduled_req } = self;
298        if scheduled_req.is_some() {
299            // We already have an outstanding request.
300            return;
301        }
302
303        info!("Scheduling collaborative reboot");
304        let (mine, theirs) = zx::EventPair::create();
305        *scheduled_req = Some(mine);
306        scheduler
307            .schedule(fpower::CollaborativeRebootReason::NetstackMigration, Some(theirs))
308            .await;
309    }
310
311    /// Cancels the currently scheduled collaborative Reboot.
312    ///
313    /// No-Op if there's none scheduled.
314    fn cancel(&mut self) {
315        if let Some(cancel) = self.scheduled_req.take() {
316            info!("Canceling collaborative reboot request. It's no longer necessary.");
317            // Dropping the eventpair cancels the request.
318            std::mem::drop(cancel);
319        }
320    }
321}
322
323/// An abstraction over the `fpower::CollaborativeRebootScheduler` FIDL API.
324trait CollaborativeRebootScheduler {
325    async fn schedule(
326        &mut self,
327        reason: fpower::CollaborativeRebootReason,
328        cancel: Option<zx::EventPair>,
329    );
330}
331
/// The `CollaborativeRebootScheduler` implementation that talks to the real
/// API over FIDL.
struct Scheduler {}
335
336impl CollaborativeRebootScheduler for Scheduler {
337    async fn schedule(
338        &mut self,
339        reason: fpower::CollaborativeRebootReason,
340        cancel: Option<zx::EventPair>,
341    ) {
342        let proxy = match fuchsia_component::client::connect_to_protocol::<
343            fpower::CollaborativeRebootSchedulerMarker,
344        >() {
345            Ok(proxy) => proxy,
346            Err(e) => {
347                error!("Failed to connect to collaborative reboot scheduler: {e:?}");
348                return;
349            }
350        };
351        match proxy.schedule_reboot(reason, cancel).await {
352            Ok(()) => {}
353            Err(e) => error!("Failed to schedule collaborative reboot: {e:?}"),
354        }
355    }
356}
357
/// Why a crash report is being filed.
#[derive(Clone, Debug, Eq, PartialEq)]
enum CrashReportReason {
    /// A Netstack3 healthcheck failed.
    FailedHealthcheck,
    /// The device rolled back from Netstack3.
    RolledBack,
}
364
365/// An abstraction over the `fidl_fuchsia_feedback::CrashReporter` FIDL API.
366trait CrashReporter {
367    /// The amount of time after startup to wait before filling a rollback
368    /// induced crash report.
369    const ROLLBACK_CRASH_REPORT_DELAY: Duration;
370
371    fn file_report(&mut self, reason: CrashReportReason);
372}
373
/// The `CrashReporter` implementation that talks to the real API over FIDL.
struct Reporter {}
376
377impl CrashReporter for Reporter {
378    const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::from_secs(180);
379
380    fn file_report(&mut self, reason: CrashReportReason) {
381        const PROGRAM_NAME: &str = "netstack_migration";
382        const FAILED_HEALTHCHECK_SIGNATURE: &str = "fuchsia-netstack3-healthcheck-failed";
383        const ROLLED_BACK_SIGNATURE: &str = "fuchsia-netstack3-rolled-back";
384
385        info!("Generating a crash report for reason: {reason:?}");
386
387        let proxy = match fuchsia_component::client::connect_to_protocol::<
388            fidl_fuchsia_feedback::CrashReporterMarker,
389        >() {
390            Ok(proxy) => proxy,
391            Err(e) => {
392                error!("Failed to connect to CrashReport proxy: {e:?}");
393                return;
394            }
395        };
396
397        let signature = match reason {
398            CrashReportReason::FailedHealthcheck => FAILED_HEALTHCHECK_SIGNATURE,
399            CrashReportReason::RolledBack => ROLLED_BACK_SIGNATURE,
400        };
401        let report = fidl_fuchsia_feedback::CrashReport {
402            program_name: Some(PROGRAM_NAME.to_string()),
403            program_uptime: Some(zx::MonotonicInstant::get().into_nanos()),
404            crash_signature: Some(signature.to_string()),
405            is_fatal: Some(false),
406            ..Default::default()
407        };
408
409        // File the crash report in a background task. It may take a while and
410        // we don't want to block the migration worker from serving other
411        // operations.
412        Task::spawn(proxy.file_report(report).map(|result| match result {
413            Ok(Ok(status)) => info!("Successfully filed crash report. Status={status:?}"),
414            Ok(Err(e)) => error!("Failed to file crash report: {e:?}"),
415            Err(e) => error!("Failed to request crash report: {e:?}"),
416        }))
417        .detach();
418    }
419}
420
421impl<P: PersistenceProvider, CR: CollaborativeRebootScheduler, R: CrashReporter>
422    Migration<P, CR, R>
423{
424    fn new(mut persistence: P, cr_scheduler: CR, crash_reporter: R) -> Self {
425        let mut persisted = persistence.open_reader().map(Persisted::load).unwrap_or_else(|e| {
426            warn!("could not open persistence reader: {e:?}. using defaults");
427            Persisted::default()
428        });
429
430        // Routine to update the rollback history on startup based on persisted
431        // values from the previous boot.
432        //
433        // This performs two important operations:
434        //   1) reset the failure count if the epoch has advanced, and
435        //   2) increment the failure count if the previous boot failed to
436        //      migrate.
437        let mut new = match persisted.rollback_history {
438            None => RollbackHistory::default(),
439            Some(RollbackHistory { epoch, count }) if epoch != CURRENT_EPOCH => {
440                if count > 0 {
441                    info!("Reseting Rollback History for new epoch");
442                }
443                RollbackHistory::default()
444            }
445            Some(history) => history,
446        };
447        if let Some(rollback::Persisted::HealthcheckFailures(failures)) = persisted.rollback {
448            if failures >= rollback::MAX_FAILED_HEALTHCHECKS {
449                new.count = new.count.saturating_add(1);
450            }
451        }
452
453        if Some(new) != persisted.rollback_history {
454            persisted.rollback_history = Some(new);
455            try_persist(&mut persistence, &persisted);
456        }
457
458        info!("Netstack Migration starting with persisted state: {persisted:?}");
459
460        let current_boot = persisted.desired_netstack_version();
461
462        match current_boot {
463            RollbackNetstackVersion::ForceNetstack2 => {
464                warn!(
465                    "Previous boot failed to migrate to Netstack3. \
466                    Ignoring automated setting and forcibly using Netstack2 \
467                    for the current boot."
468                );
469            }
470            RollbackNetstackVersion::ForceNetstack2Indefinitely => {
471                warn!(
472                    "Previous boot failed to migrate to Netstack3 and the \
473                    rollback limit has been exceeded. Ignoring automated \
474                    setting and forcibly using Netstack2 until the next epoch."
475                );
476            }
477            RollbackNetstackVersion::Netstack2 | RollbackNetstackVersion::Netstack3 => {}
478        }
479
480        Self {
481            current_boot,
482            persisted,
483            persistence,
484            collaborative_reboot: CollaborativeReboot {
485                scheduler: cr_scheduler,
486                scheduled_req: None,
487            },
488            crash_reporter,
489        }
490    }
491
492    fn persist(&mut self) {
493        let Self {
494            current_boot: _,
495            persisted,
496            persistence,
497            collaborative_reboot: _,
498            crash_reporter: _,
499        } = self;
500        try_persist(persistence, persisted);
501    }
502
503    fn map_version_setting(
504        version: Option<Box<fnet_migration::VersionSetting>>,
505    ) -> Option<NetstackVersion> {
506        version.map(|v| {
507            let fnet_migration::VersionSetting { version } = &*v;
508            (*version).into()
509        })
510    }
511
512    async fn update_collaborative_reboot(&mut self) {
513        let Self {
514            current_boot,
515            persisted,
516            persistence: _,
517            collaborative_reboot,
518            crash_reporter: _,
519        } = self;
520        if persisted.desired_netstack_version().version() != current_boot.version() {
521            // When the current boot differs from our desired version, schedule
522            // a reboot (if there's not already one).
523            collaborative_reboot.schedule().await
524        } else {
525            // When the current_boot matches our desired version, we no longer
526            // need reboot. Cancel the outstanding request (if any)
527            collaborative_reboot.cancel()
528        }
529    }
530
531    async fn update_rollback_state(&mut self, new_state: rollback::Persisted) {
532        if self.persisted.rollback != Some(new_state) {
533            self.persisted.rollback = Some(new_state);
534            self.update_collaborative_reboot().await;
535            self.persist();
536
537            if let rollback::Persisted::HealthcheckFailures(f) = new_state {
538                if f >= HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS {
539                    self.crash_reporter.file_report(CrashReportReason::FailedHealthcheck);
540                }
541            }
542        }
543    }
544
545    async fn handle_control_request(
546        &mut self,
547        req: fnet_migration::ControlRequest,
548    ) -> Result<(), fidl::Error> {
549        match req {
550            fnet_migration::ControlRequest::SetAutomatedNetstackVersion { version, responder } => {
551                let version = Self::map_version_setting(version);
552                let Self {
553                    current_boot: _,
554                    persisted: Persisted { automated, user: _, rollback: _, rollback_history: _ },
555                    persistence: _,
556                    collaborative_reboot: _,
557                    crash_reporter: _,
558                } = self;
559                if version != *automated {
560                    info!("automated netstack version switched to {version:?}");
561                    *automated = version;
562                    self.persist();
563                    self.update_collaborative_reboot().await;
564                }
565                responder.send()
566            }
567            fnet_migration::ControlRequest::SetUserNetstackVersion { version, responder } => {
568                let version = Self::map_version_setting(version);
569                let Self {
570                    current_boot: _,
571                    persisted: Persisted { automated: _, user, rollback: _, rollback_history: _ },
572                    persistence: _,
573                    collaborative_reboot: _,
574                    crash_reporter: _,
575                } = self;
576                if version != *user {
577                    info!("user netstack version switched to {version:?}");
578                    *user = version;
579                    self.persist();
580                    self.update_collaborative_reboot().await;
581                }
582                responder.send()
583            }
584        }
585    }
586
587    fn handle_state_request(&self, req: fnet_migration::StateRequest) -> Result<(), fidl::Error> {
588        let Migration {
589            current_boot,
590            persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
591            persistence: _,
592            collaborative_reboot: _,
593            crash_reporter: _,
594        } = self;
595        match req {
596            fnet_migration::StateRequest::GetNetstackVersion { responder } => {
597                responder.send(&fnet_migration::InEffectVersion {
598                    current_boot: current_boot.version().into(),
599                    user: (*user).map(Into::into),
600                    automated: (*automated).map(Into::into),
601                })
602            }
603        }
604    }
605
606    async fn handle_request(&mut self, req: ServiceRequest) -> Result<(), fidl::Error> {
607        match req {
608            ServiceRequest::Control(r) => self.handle_control_request(r).await,
609            ServiceRequest::State(r) => self.handle_state_request(r),
610        }
611    }
612}
613
614struct InspectNodes {
615    automated_setting: fuchsia_inspect::UintProperty,
616    user_setting: fuchsia_inspect::UintProperty,
617    rollback_state: fuchsia_inspect::StringProperty,
618}
619
620impl InspectNodes {
621    fn new<P, CR, F>(inspector: &fuchsia_inspect::Inspector, m: &Migration<P, CR, F>) -> Self {
622        let root = inspector.root();
623        let Migration {
624            current_boot,
625            persisted: Persisted { automated, user, rollback, rollback_history },
626            ..
627        } = m;
628        let automated_setting = root.create_uint(
629            "automated_setting",
630            NetstackVersion::optional_inspect_uint_value(automated),
631        );
632        let user_setting =
633            root.create_uint("user_setting", NetstackVersion::optional_inspect_uint_value(user));
634
635        let rollback_state = root.create_string("rollback_state", format!("{rollback:?}"));
636
637        // Record immutable state once, instead of keeping track of a property node.
638        root.record_uint("current_boot", current_boot.version().inspect_uint_value());
639        let forced_netstack2 = match current_boot {
640            RollbackNetstackVersion::Netstack2 | RollbackNetstackVersion::Netstack3 => false,
641            RollbackNetstackVersion::ForceNetstack2
642            | RollbackNetstackVersion::ForceNetstack2Indefinitely => true,
643        };
644        root.record_bool("forced_netstack2", forced_netstack2);
645        root.record_uint("current_epoch", CURRENT_EPOCH.into());
646        root.record_string("rollback_history", format!("{rollback_history:?}"));
647
648        Self { automated_setting, user_setting, rollback_state }
649    }
650
651    fn update<P, CR, F>(&self, m: &Migration<P, CR, F>) {
652        let Migration {
653            persisted: Persisted { automated, user, rollback, rollback_history: _ },
654            ..
655        } = m;
656        let Self { automated_setting, user_setting, rollback_state } = self;
657        automated_setting.set(NetstackVersion::optional_inspect_uint_value(automated));
658        user_setting.set(NetstackVersion::optional_inspect_uint_value(user));
659        rollback_state.set(&format!("{rollback:?}"));
660    }
661}
662
663/// Wraps communication with metrics (cobalt) server.
664struct MetricsLogger {
665    logger: Option<fmetrics::MetricEventLoggerProxy>,
666}
667
668impl MetricsLogger {
669    async fn new() -> Self {
670        let (logger, server_end) =
671            fidl::endpoints::create_proxy::<fmetrics::MetricEventLoggerMarker>();
672
673        let factory = match fuchsia_component::client::connect_to_protocol::<
674            fmetrics::MetricEventLoggerFactoryMarker,
675        >() {
676            Ok(f) => f,
677            Err(e) => {
678                warn!("can't connect to logger factory {e:?}");
679                return Self { logger: None };
680            }
681        };
682
683        match factory
684            .create_metric_event_logger(
685                &fmetrics::ProjectSpec {
686                    customer_id: Some(metrics_registry::CUSTOMER_ID),
687                    project_id: Some(metrics_registry::PROJECT_ID),
688                    ..Default::default()
689                },
690                server_end,
691            )
692            .await
693        {
694            Ok(Ok(())) => Self { logger: Some(logger) },
695            Ok(Err(e)) => {
696                warn!("can't create event logger {e:?}");
697                Self { logger: None }
698            }
699            Err(e) => {
700                warn!("error connecting to metric event logger {e:?}");
701                Self { logger: None }
702            }
703        }
704    }
705
706    /// Logs metrics from `migration` to the metrics server.
707    async fn log_metrics<P, CR, F>(&self, migration: &Migration<P, CR, F>) {
708        let logger = if let Some(logger) = self.logger.as_ref() {
709            logger
710        } else {
711            // Silently don't log metrics if we didn't manage to create a
712            // logger, warnings are emitted upon creation.
713            return;
714        };
715
716        let current_boot = match migration.current_boot {
717            RollbackNetstackVersion::Netstack2
718            | RollbackNetstackVersion::ForceNetstack2
719            | RollbackNetstackVersion::ForceNetstack2Indefinitely => {
720                metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack2
721            }
722            RollbackNetstackVersion::Netstack3 => {
723                metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack3
724            }
725        }
726        .as_event_code();
727        let user = match migration.persisted.user {
728            None => metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::NoSelection,
729            Some(NetstackVersion::Netstack2) => metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack2,
730            Some(NetstackVersion::Netstack3) => metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack3,
731        }
732        .as_event_code();
733        let automated = match migration.persisted.automated {
734            None => metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::NoSelection,
735            Some(NetstackVersion::Netstack2) => metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack2,
736            Some(NetstackVersion::Netstack3) => metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack3,
737        }.as_event_code();
738        let rollback_state = compute_state_metric(migration).as_event_code();
739        for (metric_id, event_code) in [
740            (metrics_registry::STACK_MIGRATION_CURRENT_BOOT_METRIC_ID, current_boot),
741            (metrics_registry::STACK_MIGRATION_USER_SETTING_METRIC_ID, user),
742            (metrics_registry::STACK_MIGRATION_AUTOMATED_SETTING_METRIC_ID, automated),
743            (metrics_registry::STACK_MIGRATION_STATE_METRIC_ID, rollback_state),
744        ] {
745            let occurrence_count = 1;
746            logger
747                .log_occurrence(metric_id, occurrence_count, &[event_code][..])
748                .await
749                .map(|r| {
750                    r.unwrap_or_else(|e| warn!("error reported logging metric {metric_id} {e:?}"))
751                })
752                .unwrap_or_else(|fidl_error| {
753                    warn!("error logging metric {metric_id} {fidl_error:?}")
754                });
755        }
756    }
757}
758
759fn compute_state_metric<P, CR, F>(
760    migration: &Migration<P, CR, F>,
761) -> metrics_registry::StackMigrationStateMetricDimensionMigrationState {
762    use metrics_registry::StackMigrationStateMetricDimensionMigrationState as state_metric;
763    let Migration {
764        current_boot,
765        persisted: Persisted { automated, user: _, rollback, rollback_history: _ },
766        persistence: _,
767        collaborative_reboot: _,
768        crash_reporter: _,
769    } = migration;
770
771    match (current_boot, automated, rollback) {
772        (RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack2) | None, _) => {
773            state_metric::NotStarted
774        }
775        (RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack3), _) => {
776            state_metric::Scheduled
777        }
778        (RollbackNetstackVersion::ForceNetstack2Indefinitely, _, _) => state_metric::Abandoned,
779        (RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack3), _) => {
780            state_metric::RolledBack
781        }
782        // NB: If a device observes the automated setting switch to Netstack2
783        // while it's current boot is `ForceNetstack2` it won't have any need
784        // to schedule a reboot, and it will perpetually stay in
785        // `ForceNetstack`. Rather that perpetually reporting
786        // `state_metric::RolledBack`, it's more useful from a diagnostics
787        // perspective to consider this device `state_metric::NotStarted`.
788        (RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack2) | None, _) => {
789            state_metric::NotStarted
790        }
791        (RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack2) | None, _) => {
792            state_metric::Canceled
793        }
794        (RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3), None) => {
795            state_metric::InProgress
796        }
797        (
798            RollbackNetstackVersion::Netstack3,
799            Some(NetstackVersion::Netstack3),
800            Some(rollback::Persisted::HealthcheckFailures(f)),
801        ) => {
802            if *f >= rollback::MAX_FAILED_HEALTHCHECKS {
803                state_metric::Failed
804            } else {
805                state_metric::InProgress
806            }
807        }
808        (
809            RollbackNetstackVersion::Netstack3,
810            Some(NetstackVersion::Netstack3),
811            Some(rollback::Persisted::Success),
812        ) => state_metric::Success,
813    }
814}
815
816#[fuchsia::main]
817pub async fn main() {
818    info!("running netstack migration service");
819
820    let mut fs = ServiceFs::new();
821    let _: &mut ServiceFsDir<'_, _> = fs
822        .dir("svc")
823        .add_fidl_service(|rs: fnet_migration::ControlRequestStream| {
824            rs.map(|req| req.map(ServiceRequest::Control)).left_stream()
825        })
826        .add_fidl_service(|rs: fnet_migration::StateRequestStream| {
827            rs.map(|req| req.map(ServiceRequest::State)).right_stream()
828        });
829    let _: &mut ServiceFs<_> =
830        fs.take_and_serve_directory_handle().expect("failed to take out directory handle");
831
832    let mut migration = Migration::new(DataPersistenceProvider {}, Scheduler {}, Reporter {});
833    main_inner(
834        &mut migration,
835        fs.fuse().flatten_unordered(None),
836        rollback::FidlHttpFetcher::new(),
837        rollback::new_healthcheck_stream(),
838    )
839    .await
840}
841
842async fn main_inner<
843    P: PersistenceProvider,
844    CR: CollaborativeRebootScheduler,
845    R: CrashReporter,
846    H: rollback::HttpFetcher + Send + 'static,
847    T: Stream<Item = ()> + Send + 'static,
848    SR: Stream<Item = Result<ServiceRequest, fidl::Error>>,
849>(
850    migration: &mut Migration<P, CR, R>,
851    service_request_stream: SR,
852    http_fetcher: H,
853    healthcheck_tick: T,
854) {
855    let inspector = fuchsia_inspect::component::inspector();
856    let _inspect_server =
857        inspect_runtime::publish(inspector, inspect_runtime::PublishOptions::default())
858            .expect("failed to serve inspector");
859    let inspect_nodes = InspectNodes::new(inspector, &migration);
860
861    let metrics_logger = MetricsLogger::new().await;
862
863    let (desired_version_sender, desired_version_receiver) = mpsc::unbounded();
864    let (rollback_state_sender, rollback_state_receiver) = mpsc::unbounded();
865
866    // NB: Check if we failed to migrate to NS3 in the previous boot.
867    //
868    // It's important to perform this check before the rollback state is reset
869    // below.
870    let current_boot_is_rollback = match migration.persisted.rollback {
871        Some(rollback::Persisted::Success) | None => false,
872        Some(rollback::Persisted::HealthcheckFailures(f)) => f >= rollback::MAX_FAILED_HEALTHCHECKS,
873    };
874
875    let rollback_state =
876        rollback::State::new(migration.persisted.rollback, migration.current_boot.version());
877
878    // Update rollback persistence immediately in case the device reboots before
879    // the rollback module has time to send an asynchronous update. This is
880    // required for correctness if Netstack3 is crashing on startup, or in the
881    // following case:
882    //
883    // 1. Device fails to migrate to Netstack3 and persists
884    //    HealthcheckFailures(MAX_FAILED_HEALTHCHECKS), which will force
885    //    Netstack2 on subsequent boots.
886    // 2. Device reboots into Netstack2, sees that it should be running
887    //    Netstack3, and schedules a reboot without clearing the failures.
888    // 3. Device reboots back into Netstack3 and sees that it should schedule
889    //    a reboot because the persisted failures are above the limit.
890    migration.update_rollback_state(rollback_state.persisted()).await;
891
892    Task::spawn(async move {
893        rollback::run(
894            rollback_state,
895            http_fetcher,
896            desired_version_receiver,
897            rollback_state_sender,
898            pin!(healthcheck_tick),
899        )
900        .await
901    })
902    .detach();
903
904    enum Action {
905        ServiceRequest(Result<ServiceRequest, fidl::Error>),
906        LogMetrics,
907        UpdateRollbackState(rollback::Persisted),
908        FileRollbackCrashReport,
909    }
910
911    let metrics_logging_interval = fuchsia_async::MonotonicDuration::from_hours(1);
912    let mut stream: futures::stream::SelectAll<Pin<Box<dyn Stream<Item = Action>>>> =
913        futures::stream::SelectAll::new();
914
915    // Always log metrics once on startup then periodically log new values so
916    // the aggregation window always contains one sample of the current
917    // settings.
918    stream.push(Box::pin(Box::new(
919        futures::stream::once(futures::future::ready(()))
920            .chain(fuchsia_async::Interval::new(metrics_logging_interval))
921            .map(|()| Action::LogMetrics),
922    )));
923    stream.push(Box::pin(Box::new(Box::pin(service_request_stream.map(Action::ServiceRequest)))));
924    stream.push(Box::pin(rollback_state_receiver.map(|state| Action::UpdateRollbackState(state))));
925
926    // If the current boot is a rollback, schedule a crash report to be filed
927    // soon. Don't file it right away, as during early boot there is not much
928    // useful diagnostics data available.
929    if current_boot_is_rollback {
930        stream.push(Box::pin(
931            futures::stream::once(fuchsia_async::Timer::new(R::ROLLBACK_CRASH_REPORT_DELAY))
932                .map(|()| Action::FileRollbackCrashReport),
933        ));
934    }
935
936    while let Some(action) = stream.next().await {
937        match action {
938            Action::ServiceRequest(req) => {
939                let result = match req {
940                    Ok(req) => migration.handle_request(req).await,
941                    Err(e) => Err(e),
942                };
943                // Always update inspector state after handling a request.
944                inspect_nodes.update(&migration);
945
946                // Send the desired netstack version to the rollback mechanism,
947                // but ignore the "forced" setting. The "forced" setting comes
948                // from the rollback mechanism, and sending that signal back
949                // into it would cause a the mechanism to incorrectly detect
950                // a cancelation.
951                match desired_version_sender.unbounded_send(
952                    migration.persisted.desired_netstack_version().version_ignoring_force(),
953                ) {
954                    Ok(()) => (),
955                    Err(e) => {
956                        error!("error sending update to rollback module: {:?}", e);
957                    }
958                }
959
960                match result {
961                    Ok(()) => (),
962                    Err(e) => {
963                        if !e.is_closed() {
964                            error!("error processing FIDL request {:?}", e)
965                        }
966                    }
967                }
968            }
969            Action::LogMetrics => {
970                metrics_logger.log_metrics(&migration).await;
971            }
972            Action::UpdateRollbackState(new_state) => {
973                migration.update_rollback_state(new_state).await;
974                // Always update inspector state when the rollback state
975                // changes.
976                inspect_nodes.update(&migration);
977            }
978            Action::FileRollbackCrashReport => {
979                migration.crash_reporter.file_report(CrashReportReason::RolledBack);
980            }
981        }
982    }
983}
984
985#[cfg(test)]
986mod tests {
987    use super::*;
988    use assert_matches::assert_matches;
989    use async_utils::event::{Event, EventWait};
990    use diagnostics_assertions::assert_data_tree;
991    use fidl::Peered as _;
992    use fidl_fuchsia_net_http as fnet_http;
993    use fuchsia_async::TimeoutExt;
994    use futures::FutureExt;
995    use std::cell::RefCell;
996    use std::rc::Rc;
997    use std::time::Duration;
998    use test_case::test_case;
999
1000    #[derive(Default, Clone)]
1001    struct InMemory {
1002        file: Rc<RefCell<Option<Vec<u8>>>>,
1003    }
1004
1005    impl InMemory {
1006        fn with_persisted(p: Persisted) -> Self {
1007            let mut s = Self::default();
1008            p.save(s.open_writer().unwrap());
1009            s
1010        }
1011    }
1012
1013    impl PersistenceProvider for InMemory {
1014        type Writer = Self;
1015        type Reader = std::io::Cursor<Vec<u8>>;
1016
1017        fn open_writer(&mut self) -> std::io::Result<Self::Writer> {
1018            *self.file.borrow_mut() = Some(Vec::new());
1019            Ok(self.clone())
1020        }
1021
1022        fn open_reader(&self) -> std::io::Result<Self::Reader> {
1023            self.file
1024                .borrow()
1025                .clone()
1026                .map(std::io::Cursor::new)
1027                .ok_or_else(|| std::io::ErrorKind::NotFound.into())
1028        }
1029    }
1030
1031    impl std::io::Write for InMemory {
1032        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
1033            let r = self.file.borrow_mut().as_mut().expect("no file open").write(buf);
1034            r
1035        }
1036
1037        fn flush(&mut self) -> std::io::Result<()> {
1038            Ok(())
1039        }
1040    }
1041
1042    struct NoCollaborativeReboot;
1043
1044    impl CollaborativeRebootScheduler for NoCollaborativeReboot {
1045        async fn schedule(
1046            &mut self,
1047            _reason: fpower::CollaborativeRebootReason,
1048            _cancel: Option<zx::EventPair>,
1049        ) {
1050            panic!("unexpectedly attempted to schedule a collaborative reboot");
1051        }
1052    }
1053
1054    #[derive(Default)]
1055    struct FakeCollaborativeReboot {
1056        req: Option<zx::EventPair>,
1057    }
1058
1059    impl CollaborativeRebootScheduler for FakeCollaborativeReboot {
1060        async fn schedule(
1061            &mut self,
1062            reason: fpower::CollaborativeRebootReason,
1063            cancel: Option<zx::EventPair>,
1064        ) {
1065            assert_eq!(reason, fpower::CollaborativeRebootReason::NetstackMigration);
1066            let cancel = cancel.expect("cancellation signal must be provided");
1067            assert_eq!(self.req.replace(cancel), None, "attempted to schedule multiple reboots");
1068        }
1069    }
1070
1071    struct NoCrashReports;
1072
1073    impl CrashReporter for NoCrashReports {
1074        const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::ZERO;
1075        fn file_report(&mut self, reason: CrashReportReason) {
1076            panic!("unexpectedly attemped to file a crash report: {reason:?}")
1077        }
1078    }
1079
1080    #[derive(Default)]
1081    struct FakeCrashReporter {
1082        reports: Vec<CrashReportReason>,
1083    }
1084
1085    impl CrashReporter for FakeCrashReporter {
1086        const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::from_millis(1);
1087        fn file_report(&mut self, reason: CrashReportReason) {
1088            self.reports.push(reason)
1089        }
1090    }
1091
1092    /// Signals an `EventWait` once the `target` crash reports have been filed.
1093    struct AwaitCrashReports {
1094        target: Vec<CrashReportReason>,
1095        reports: Vec<CrashReportReason>,
1096        event: Event,
1097    }
1098
1099    impl AwaitCrashReports {
1100        fn new(target: Vec<CrashReportReason>) -> (Self, EventWait) {
1101            let event = Event::new();
1102            let wait = event.wait();
1103            (Self { target, reports: Vec::new(), event }, wait)
1104        }
1105    }
1106
1107    impl CrashReporter for AwaitCrashReports {
1108        const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::from_millis(1);
1109        fn file_report(&mut self, reason: CrashReportReason) {
1110            self.reports.push(reason);
1111            if self.reports == self.target {
1112                assert!(self.event.signal());
1113            }
1114        }
1115    }
1116
1117    fn serve_migration<
1118        P: PersistenceProvider,
1119        CR: CollaborativeRebootScheduler,
1120        R: CrashReporter,
1121    >(
1122        migration: Migration<P, CR, R>,
1123    ) -> (
1124        impl futures::Future<Output = Migration<P, CR, R>>,
1125        fnet_migration::ControlProxy,
1126        fnet_migration::StateProxy,
1127    ) {
1128        let (control, control_server) =
1129            fidl::endpoints::create_proxy_and_stream::<fnet_migration::ControlMarker>();
1130        let (state, state_server) =
1131            fidl::endpoints::create_proxy_and_stream::<fnet_migration::StateMarker>();
1132
1133        let fut = {
1134            let control =
1135                control_server.map(|req| ServiceRequest::Control(req.expect("control error")));
1136            let state = state_server.map(|req| ServiceRequest::State(req.expect("state error")));
1137            futures::stream::select(control, state).fold(migration, |mut migration, req| async {
1138                migration.handle_request(req).await.expect("handling request");
1139                migration
1140            })
1141        };
1142        (fut, control, state)
1143    }
1144
1145    #[test_case(Persisted{
1146        user: Some(NetstackVersion::Netstack2),
1147        automated: None,
1148        rollback: None,
1149        rollback_history: None,
1150    }; "user_netstack2")]
1151    #[test_case(Persisted{
1152        user: Some(NetstackVersion::Netstack3),
1153        automated: None,
1154        rollback: None,
1155        rollback_history: None,
1156    }; "user_netstack3")]
1157    #[test_case(Persisted{
1158        user: None,
1159        automated: None,
1160        rollback: None,
1161        rollback_history: None,
1162    }; "none")]
1163    #[test_case(Persisted{
1164        user: None,
1165        automated: Some(NetstackVersion::Netstack2),
1166        rollback: None,
1167        rollback_history: None,
1168    }; "automated_netstack2")]
1169    #[test_case(Persisted{
1170        user: None,
1171        automated: Some(NetstackVersion::Netstack3),
1172        rollback: None,
1173        rollback_history: None,
1174    }; "automated_netstack3")]
1175    #[test_case(Persisted{
1176        user: None,
1177        automated: None,
1178        rollback: Some(rollback::Persisted::Success),
1179        rollback_history: None,
1180    }; "rollback_success")]
1181    #[test_case(Persisted{
1182        user: None,
1183        automated: None,
1184        rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
1185        rollback_history: Some(RollbackHistory {epoch: CURRENT_EPOCH, count: 3}),
1186    }; "rollback_history")]
1187    #[test_case(Persisted{
1188        user: Some(NetstackVersion::Netstack2),
1189        automated: Some(NetstackVersion::Netstack3),
1190        rollback: Some(rollback::Persisted::HealthcheckFailures(5)),
1191        rollback_history: Some(RollbackHistory {epoch: CURRENT_EPOCH, count: 3}),
1192    }; "all")]
1193    #[fuchsia::test(add_test_attr = false)]
1194    fn persist_save_load(v: Persisted) {
1195        let mut m = InMemory::default();
1196        v.save(m.open_writer().unwrap());
1197        assert_eq!(Persisted::load(m.open_reader().unwrap()), v);
1198    }
1199
1200    #[fuchsia::test]
1201    fn uses_defaults_if_no_persistence() {
1202        let m = Migration::new(InMemory::default(), NoCollaborativeReboot, NoCrashReports);
1203        let Migration {
1204            current_boot,
1205            persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1206            persistence: _,
1207            collaborative_reboot: _,
1208            crash_reporter: _,
1209        } = m;
1210        assert_eq!(current_boot.version(), DEFAULT_NETSTACK);
1211        assert_eq!(user, None);
1212        assert_eq!(automated, None);
1213    }
1214
1215    #[test_case(
1216        None, Some(NetstackVersion::Netstack3), None, None, NetstackVersion::Netstack3;
1217        "automated_ns3")]
1218    #[test_case(
1219        None, Some(NetstackVersion::Netstack2), None, None, NetstackVersion::Netstack2;
1220        "automated_ns2")]
1221    #[test_case(
1222        Some(NetstackVersion::Netstack3),
1223        Some(NetstackVersion::Netstack2),
1224        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS)),
1225        Some(RollbackHistory{epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH}),
1226        NetstackVersion::Netstack3;
1227        "user_ns3_override")]
1228    #[test_case(
1229        Some(NetstackVersion::Netstack2),
1230        Some(NetstackVersion::Netstack3),
1231        Some(rollback::Persisted::Success),
1232        None,
1233        NetstackVersion::Netstack2;
1234        "user_ns2_override")]
1235    #[test_case(
1236        Some(NetstackVersion::Netstack2),
1237        None,
1238        None,
1239        None,
1240        NetstackVersion::Netstack2; "user_ns2")]
1241    #[test_case(
1242        Some(NetstackVersion::Netstack3),
1243        None,
1244        None,
1245        None,
1246        NetstackVersion::Netstack3; "user_ns3")]
1247    #[test_case(
1248        None,
1249        Some(NetstackVersion::Netstack3),
1250        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS)),
1251        None,
1252        NetstackVersion::Netstack2; "rollback_to_ns2")]
1253    #[test_case(
1254        None,
1255        Some(NetstackVersion::Netstack3),
1256        Some(rollback::Persisted::HealthcheckFailures(0)),
1257        Some(RollbackHistory{epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH}),
1258        NetstackVersion::Netstack2; "indefinitely_rolled_back_to_ns2")]
1259    #[test_case(
1260        None,
1261        Some(NetstackVersion::Netstack3),
1262        Some(rollback::Persisted::HealthcheckFailures(0)),
1263        Some(RollbackHistory{epoch: CURRENT_EPOCH - 1, count: MAX_ROLLBACKS_PER_EPOCH}),
1264        NetstackVersion::Netstack3; "ignore_rollback_history_from_old_epoch")]
1265    #[test_case(None, None, None, None, DEFAULT_NETSTACK; "default")]
1266    #[fuchsia::test]
1267    async fn get_netstack_version(
1268        p_user: Option<NetstackVersion>,
1269        p_automated: Option<NetstackVersion>,
1270        p_rollback: Option<rollback::Persisted>,
1271        p_history: Option<RollbackHistory>,
1272        expect: NetstackVersion,
1273    ) {
1274        let m = Migration::new(
1275            InMemory::with_persisted(Persisted {
1276                user: p_user,
1277                automated: p_automated,
1278                rollback: p_rollback,
1279                rollback_history: p_history,
1280            }),
1281            NoCollaborativeReboot,
1282            NoCrashReports,
1283        );
1284        let Migration {
1285            current_boot,
1286            persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1287            persistence: _,
1288            collaborative_reboot: _,
1289            crash_reporter: _,
1290        } = &m;
1291        assert_eq!(current_boot.version(), expect);
1292        assert_eq!(*user, p_user);
1293        assert_eq!(*automated, p_automated);
1294
1295        let (serve, _, state) = serve_migration(m);
1296        let fut = async move {
1297            let fnet_migration::InEffectVersion { current_boot, user, automated } =
1298                state.get_netstack_version().await.expect("get netstack version");
1299            let expect = expect.into();
1300            let p_user = p_user.map(Into::into);
1301            let p_automated = p_automated.map(Into::into);
1302            assert_eq!(current_boot, expect);
1303            assert_eq!(user, p_user);
1304            assert_eq!(automated, p_automated);
1305        };
1306        let (_, ()): (Migration<_, _, _>, _) = futures::future::join(serve, fut).await;
1307    }
1308
1309    #[derive(Debug, Copy, Clone)]
1310    enum SetMechanism {
1311        User,
1312        Automated,
1313    }
1314
1315    #[test_case(SetMechanism::User, NetstackVersion::Netstack2; "set_user_ns2")]
1316    #[test_case(SetMechanism::User, NetstackVersion::Netstack3; "set_user_ns3")]
1317    #[test_case(SetMechanism::Automated, NetstackVersion::Netstack2; "set_automated_ns2")]
1318    #[test_case(SetMechanism::Automated, NetstackVersion::Netstack3; "set_automated_ns3")]
1319    #[fuchsia::test]
1320    async fn set_netstack_version(mechanism: SetMechanism, set_version: NetstackVersion) {
1321        let m = Migration::new(
1322            InMemory::with_persisted(Default::default()),
1323            FakeCollaborativeReboot::default(),
1324            NoCrashReports,
1325        );
1326        let (serve, control, _) = serve_migration(m);
1327        let fut = async move {
1328            let setting = fnet_migration::VersionSetting { version: set_version.into() };
1329            match mechanism {
1330                SetMechanism::User => control
1331                    .set_user_netstack_version(Some(&setting))
1332                    .await
1333                    .expect("set user netstack version"),
1334                SetMechanism::Automated => control
1335                    .set_automated_netstack_version(Some(&setting))
1336                    .await
1337                    .expect("set automated netstack version"),
1338            }
1339        };
1340        let (migration, ()) = futures::future::join(serve, fut).await;
1341
1342        let validate_versions = |m: &Migration<_, _, _>, current| {
1343            let Migration {
1344                current_boot,
1345                persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1346                persistence: _,
1347                collaborative_reboot: _,
1348                crash_reporter: _,
1349            } = m;
1350            assert_eq!(current_boot.version(), current);
1351            match mechanism {
1352                SetMechanism::User => {
1353                    assert_eq!(*user, Some(set_version));
1354                    assert_eq!(*automated, None);
1355                }
1356                SetMechanism::Automated => {
1357                    assert_eq!(*user, None);
1358                    assert_eq!(*automated, Some(set_version));
1359                }
1360            }
1361        };
1362
1363        validate_versions(&migration, DEFAULT_NETSTACK);
1364        // There should only be a collaborative reboot request if the new
1365        // version differs from the default version.
1366        let cr_req = &migration.collaborative_reboot.scheduler.req;
1367        if set_version != DEFAULT_NETSTACK {
1368            assert_eq!(Ok(false), cr_req.as_ref().expect("there should be a request").is_closed());
1369        } else {
1370            assert_eq!(cr_req, &None);
1371        }
1372
1373        // Check that the setting was properly persisted.
1374        let migration = Migration::new(
1375            migration.persistence,
1376            migration.collaborative_reboot.scheduler,
1377            migration.crash_reporter,
1378        );
1379        validate_versions(&migration, set_version);
1380    }
1381
1382    #[fuchsia::test]
1383    async fn update_rollback_state() {
1384        let mut migration = Migration::new(
1385            InMemory::with_persisted(Persisted {
1386                automated: Some(NetstackVersion::Netstack3),
1387                user: None,
1388                rollback: None,
1389                rollback_history: None,
1390            }),
1391            FakeCollaborativeReboot::default(),
1392            FakeCrashReporter::default(),
1393        );
1394
1395        assert_eq!(migration.current_boot.version(), NetstackVersion::Netstack3);
1396        assert!(migration.collaborative_reboot.scheduler.req.is_none());
1397
1398        // The first update shouldn't schedule a reboot because we haven't
1399        // passed the healthcheck threshold yet.
1400        migration.update_rollback_state(rollback::Persisted::HealthcheckFailures(1)).await;
1401        assert_matches!(
1402            migration.persisted.rollback,
1403            Some(rollback::Persisted::HealthcheckFailures(1))
1404        );
1405        assert!(migration.collaborative_reboot.scheduler.req.is_none());
1406
1407        // This second update should schedule a reboot because we've passed
1408        // the healthcheck limit and want to roll back to Netstack2.
1409        migration
1410            .update_rollback_state(rollback::Persisted::HealthcheckFailures(
1411                rollback::MAX_FAILED_HEALTHCHECKS,
1412            ))
1413            .await;
1414        assert_matches!(
1415            migration.persisted.rollback,
1416            Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS))
1417        );
1418        assert_eq!(
1419            migration
1420                .collaborative_reboot
1421                .scheduler
1422                .req
1423                .as_ref()
1424                .expect("reboot was not scheduled")
1425                .is_closed()
1426                .unwrap(),
1427            false
1428        );
1429
1430        // It should have also filed a crash report
1431        assert_eq!(migration.crash_reporter.reports, vec![CrashReportReason::FailedHealthcheck]);
1432
1433        // This emulates seeing a healthcheck success before rebooting, in which
1434        // case we should see the reboot get canceled.
1435        migration.update_rollback_state(rollback::Persisted::Success).await;
1436        assert_matches!(migration.persisted.rollback, Some(rollback::Persisted::Success));
1437        assert!(
1438            migration.collaborative_reboot.scheduler.req.as_ref().unwrap().is_closed().unwrap()
1439        );
1440
1441        // Ensure that the changes were persisted successfully.
1442        let migration = Migration::new(
1443            migration.persistence,
1444            migration.collaborative_reboot.scheduler,
1445            migration.crash_reporter,
1446        );
1447        assert_matches!(migration.persisted.rollback, Some(rollback::Persisted::Success));
1448    }
1449
1450    #[test_case(SetMechanism::User, Some(NetstackVersion::Netstack2), false)]
1451    #[test_case(SetMechanism::User, Some(NetstackVersion::Netstack3), true)]
1452    #[test_case(SetMechanism::User, None, false)]
1453    #[test_case(SetMechanism::Automated, Some(NetstackVersion::Netstack2), false)]
1454    #[test_case(SetMechanism::Automated, Some(NetstackVersion::Netstack3), true)]
1455    #[test_case(SetMechanism::Automated, None, true)]
1456    #[fuchsia::test]
1457    async fn cancel_collaborative_reboot(
1458        mechanism: SetMechanism,
1459        version: Option<NetstackVersion>,
1460        expect_canceled: bool,
1461    ) {
1462        let migration = Migration::new(
1463            InMemory::with_persisted(Persisted {
1464                user: None,
1465                automated: None,
1466                rollback: None,
1467                rollback_history: None,
1468            }),
1469            FakeCollaborativeReboot::default(),
1470            NoCrashReports,
1471        );
1472
1473        // Start of by updating the automated setting to Netstack2; this ensures
1474        // there is a pending request to cancel.
1475        let (serve, control, _) = serve_migration(migration);
1476        let fut = async move {
1477            control
1478                .set_automated_netstack_version(Some(&fnet_migration::VersionSetting {
1479                    version: fnet_migration::NetstackVersion::Netstack2,
1480                }))
1481                .await
1482                .expect("set automated netstack version");
1483        };
1484        let (migration, ()) = futures::future::join(serve, fut).await;
1485        let cancel = migration
1486            .collaborative_reboot
1487            .scheduler
1488            .req
1489            .as_ref()
1490            .expect("there should be a request");
1491        assert_eq!(Ok(false), cancel.is_closed());
1492
1493        // Update the setting based on the test parameters
1494        let (serve, control, _) = serve_migration(migration);
1495        let fut = async move {
1496            let setting = version.map(|v| fnet_migration::VersionSetting { version: v.into() });
1497            match mechanism {
1498                SetMechanism::User => control
1499                    .set_user_netstack_version(setting.as_ref())
1500                    .await
1501                    .expect("set user netstack version"),
1502                SetMechanism::Automated => control
1503                    .set_automated_netstack_version(setting.as_ref())
1504                    .await
1505                    .expect("set automated netstack version"),
1506            }
1507        };
1508        let (migration, ()) = futures::future::join(serve, fut).await;
1509
1510        let cancel = migration
1511            .collaborative_reboot
1512            .scheduler
1513            .req
1514            .as_ref()
1515            .expect("there should be a request");
1516        assert_eq!(Ok(expect_canceled), cancel.is_closed());
1517    }
1518
1519    #[test_case(SetMechanism::User)]
1520    #[test_case(SetMechanism::Automated)]
1521    #[fuchsia::test]
1522    async fn clear_netstack_version(mechanism: SetMechanism) {
1523        const PREVIOUS_VERSION: NetstackVersion = NetstackVersion::Netstack2;
1524        let m = Migration::new(
1525            InMemory::with_persisted(Persisted {
1526                user: Some(PREVIOUS_VERSION),
1527                automated: Some(PREVIOUS_VERSION),
1528                rollback: None,
1529                rollback_history: None,
1530            }),
1531            NoCollaborativeReboot,
1532            NoCrashReports,
1533        );
1534        let (serve, control, _) = serve_migration(m);
1535        let fut = async move {
1536            match mechanism {
1537                SetMechanism::User => control
1538                    .set_user_netstack_version(None)
1539                    .await
1540                    .expect("set user netstack version"),
1541                SetMechanism::Automated => control
1542                    .set_automated_netstack_version(None)
1543                    .await
1544                    .expect("set automated netstack version"),
1545            }
1546        };
1547        let (migration, ()) = futures::future::join(serve, fut).await;
1548
1549        let validate_versions = |m: &Migration<_, _, _>| {
1550            let Migration {
1551                current_boot,
1552                persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1553                persistence: _,
1554                collaborative_reboot: _,
1555                crash_reporter: _,
1556            } = m;
1557            assert_eq!(current_boot.version(), PREVIOUS_VERSION);
1558            match mechanism {
1559                SetMechanism::User => {
1560                    assert_eq!(*user, None);
1561                    assert_eq!(*automated, Some(PREVIOUS_VERSION));
1562                }
1563                SetMechanism::Automated => {
1564                    assert_eq!(*user, Some(PREVIOUS_VERSION));
1565                    assert_eq!(*automated, None);
1566                }
1567            }
1568        };
1569
1570        validate_versions(&migration);
1571        // Check that the setting was properly persisted.
1572        let migration = Migration::new(
1573            migration.persistence,
1574            migration.collaborative_reboot.scheduler,
1575            migration.crash_reporter,
1576        );
1577        validate_versions(&migration);
1578    }
1579
1580    #[fuchsia::test]
1581    async fn inspect() {
1582        let mut m = Migration::new(
1583            InMemory::with_persisted(Persisted {
1584                user: Some(NetstackVersion::Netstack2),
1585                automated: Some(NetstackVersion::Netstack3),
1586                rollback: None,
1587                rollback_history: None,
1588            }),
1589            NoCollaborativeReboot,
1590            NoCrashReports,
1591        );
1592        let inspector = fuchsia_inspect::component::inspector();
1593        let nodes = InspectNodes::new(inspector, &m);
1594        assert_data_tree!(inspector,
1595            root: {
1596                current_boot: 2u64,
1597                user_setting: 2u64,
1598                automated_setting: 3u64,
1599                rollback_state: "None",
1600                forced_netstack2: false,
1601                current_epoch: CURRENT_EPOCH,
1602                rollback_history: format!(
1603                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 0 }})"
1604                ),
1605            }
1606        );
1607
1608        m.persisted = Persisted {
1609            user: None,
1610            automated: Some(NetstackVersion::Netstack2),
1611            rollback: None,
1612            rollback_history: None,
1613        };
1614        nodes.update(&m);
1615        assert_data_tree!(inspector,
1616            root: {
1617                current_boot: 2u64,
1618                user_setting: 0u64,
1619                automated_setting: 2u64,
1620                rollback_state: "None",
1621                forced_netstack2: false,
1622                current_epoch: CURRENT_EPOCH,
1623                rollback_history: format!(
1624                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 0 }})"
1625                ),
1626            }
1627        );
1628    }
1629
1630    #[fuchsia::test]
1631    async fn inspect_rollback() {
1632        let mut m = Migration::new(
1633            InMemory::with_persisted(Persisted {
1634                user: None,
1635                automated: Some(NetstackVersion::Netstack3),
1636                rollback: Some(rollback::Persisted::HealthcheckFailures(
1637                    rollback::MAX_FAILED_HEALTHCHECKS,
1638                )),
1639                rollback_history: None,
1640            }),
1641            NoCollaborativeReboot,
1642            NoCrashReports,
1643        );
1644        let inspector = fuchsia_inspect::component::inspector();
1645        let nodes = InspectNodes::new(inspector, &m);
1646        assert_data_tree!(inspector,
1647            root: {
1648                current_boot: 2u64,
1649                user_setting: 0u64,
1650                automated_setting: 3u64,
1651                rollback_state: "Some(HealthcheckFailures(5))",
1652                forced_netstack2: true,
1653                current_epoch: CURRENT_EPOCH,
1654                rollback_history: format!(
1655                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 1 }})"
1656                ),
1657            }
1658        );
1659
1660        m.persisted.rollback =
1661            Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS + 1));
1662        nodes.update(&m);
1663        assert_data_tree!(inspector,
1664            root: {
1665                current_boot: 2u64,
1666                user_setting: 0u64,
1667                automated_setting: 3u64,
1668                rollback_state: "Some(HealthcheckFailures(6))",
1669                forced_netstack2: true,
1670                current_epoch: CURRENT_EPOCH,
1671                rollback_history: format!(
1672                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 1 }})"
1673                ),
1674            }
1675        );
1676        m.persisted.rollback = Some(rollback::Persisted::Success);
1677        nodes.update(&m);
1678        assert_data_tree!(inspector,
1679            root: {
1680                current_boot: 2u64,
1681                user_setting: 0u64,
1682                automated_setting: 3u64,
1683                rollback_state: "Some(Success)",
1684                forced_netstack2: true,
1685                current_epoch: CURRENT_EPOCH,
1686                rollback_history: format!(
1687                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 1 }})"
1688                ),
1689            }
1690        );
1691    }
1692
1693    #[test_case(CURRENT_EPOCH, 2, true; "ns2_with_failures_in_current_epoch")]
1694    #[test_case(CURRENT_EPOCH - 1, 3, false; "ns3_with_failures_in_old_epoch")]
1695    #[fuchsia::test]
1696    async fn inspect_rollback_history(epoch: u32, current_boot: u64, forced_netstack2: bool) {
1697        let m = Migration::new(
1698            InMemory::with_persisted(Persisted {
1699                user: None,
1700                automated: Some(NetstackVersion::Netstack3),
1701                rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
1702                rollback_history: Some(RollbackHistory {
1703                    epoch: epoch,
1704                    count: MAX_ROLLBACKS_PER_EPOCH,
1705                }),
1706            }),
1707            NoCollaborativeReboot,
1708            NoCrashReports,
1709        );
1710        let inspector = fuchsia_inspect::component::inspector();
1711        let _nodes = InspectNodes::new(inspector, &m);
1712        assert_data_tree!(inspector,
1713            root: {
1714                current_boot: current_boot,
1715                user_setting: 0u64,
1716                automated_setting: 3u64,
1717                rollback_state: "Some(HealthcheckFailures(0))",
1718                forced_netstack2: forced_netstack2,
1719                current_epoch: CURRENT_EPOCH,
1720                rollback_history: format!(
1721                    "Some(RollbackHistory {{ epoch: {}, count: {} }})",
1722                    CURRENT_EPOCH,
1723                    // Check if the history should have been reset.
1724                    if epoch == CURRENT_EPOCH { MAX_ROLLBACKS_PER_EPOCH } else {0},
1725                ),
1726            }
1727        );
1728    }
1729
1730    #[test_case::test_matrix(
1731    [
1732        (RollbackNetstackVersion::Netstack2, metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack2),
1733        (RollbackNetstackVersion::Netstack3, metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack3),
1734    ],
1735    [
1736        (None, metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::NoSelection),
1737        (Some(NetstackVersion::Netstack2), metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack2),
1738        (Some(NetstackVersion::Netstack3), metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack3),
1739    ],
1740    [
1741        (None, metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::NoSelection),
1742        (Some(NetstackVersion::Netstack2), metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack2),
1743        (Some(NetstackVersion::Netstack3), metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack3),
1744    ]
1745    )]
1746    #[fuchsia::test]
1747    async fn metrics_logger(
1748        current_boot: (
1749            RollbackNetstackVersion,
1750            metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion,
1751        ),
1752        user: (
1753            Option<NetstackVersion>,
1754            metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion,
1755        ),
1756        automated: (
1757            Option<NetstackVersion>,
1758            metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion,
1759        ),
1760    ) {
1761        let (current_boot, current_boot_expect) = current_boot;
1762        let (user, user_expect) = user;
1763        let (automated, automated_expect) = automated;
1764        let mut m = Migration::new(
1765            InMemory::with_persisted(Persisted {
1766                user,
1767                automated,
1768                rollback: None,
1769                rollback_history: None,
1770            }),
1771            NoCollaborativeReboot,
1772            NoCrashReports,
1773        );
1774        m.current_boot = current_boot;
1775        let (logger, mut logger_stream) =
1776            fidl::endpoints::create_proxy_and_stream::<fmetrics::MetricEventLoggerMarker>();
1777
1778        let metrics_logger = MetricsLogger { logger: Some(logger) };
1779
1780        let ((), ()) = futures::future::join(metrics_logger.log_metrics(&m), async {
1781            let expect = [
1782                (
1783                    metrics_registry::STACK_MIGRATION_CURRENT_BOOT_METRIC_ID,
1784                    Some(current_boot_expect.as_event_code()),
1785                ),
1786                (
1787                    metrics_registry::STACK_MIGRATION_USER_SETTING_METRIC_ID,
1788                    Some(user_expect.as_event_code()),
1789                ),
1790                (
1791                    metrics_registry::STACK_MIGRATION_AUTOMATED_SETTING_METRIC_ID,
1792                    Some(automated_expect.as_event_code()),
1793                ),
1794                (
1795                    metrics_registry::STACK_MIGRATION_STATE_METRIC_ID,
1796                    // Note: The rollback state doesn't have a flat expectation.
1797                    // Don't assert on its value here, and instead we directly
1798                    // test it in a separate test case.
1799                    None,
1800                ),
1801            ];
1802            for (id, ev) in expect {
1803                let (metric, occurences, codes, responder) = logger_stream
1804                    .next()
1805                    .await
1806                    .unwrap()
1807                    .unwrap()
1808                    .into_log_occurrence()
1809                    .expect("bad request");
1810                assert_eq!(metric, id);
1811                assert_eq!(occurences, 1);
1812                if let Some(ev) = ev {
1813                    assert_eq!(codes, vec![ev]);
1814                }
1815                responder.send(Ok(())).unwrap();
1816            }
1817        })
1818        .await;
1819    }
1820
1821    #[test_case(
1822        RollbackNetstackVersion::Netstack2, None, None =>
1823        metrics_registry::StackMigrationStateMetricDimensionMigrationState::NotStarted;
1824        "not_started_none"
1825    )]
1826    #[test_case(
1827        RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack2), None =>
1828        metrics_registry::StackMigrationStateMetricDimensionMigrationState::NotStarted;
1829        "not_started_ns2"
1830    )]
1831    #[test_case(
1832        RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack3), None =>
1833        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Scheduled;
1834        "scheduled"
1835    )]
1836    #[test_case(
1837        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3), None =>
1838        metrics_registry::StackMigrationStateMetricDimensionMigrationState::InProgress;
1839        "in_progress_none"
1840    )]
1841    #[test_case(
1842        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1843        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS - 1)) =>
1844        metrics_registry::StackMigrationStateMetricDimensionMigrationState::InProgress;
1845        "in_progress_some"
1846    )]
1847    #[test_case(
1848        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1849        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS)) =>
1850        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Failed;
1851        "failed_exact"
1852    )]
1853    #[test_case(
1854        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1855        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS + 1)) =>
1856        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Failed;
1857        "failed_more"
1858    )]
1859    #[test_case(
1860        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1861        Some(rollback::Persisted::Success) =>
1862        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Success;
1863        "success"
1864    )]
1865    #[test_case(
1866        RollbackNetstackVersion::Netstack3, None, None =>
1867        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Canceled;
1868        "canceled_none"
1869    )]
1870    #[test_case(
1871        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack2), None =>
1872        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Canceled;
1873        "canceled_ns2"
1874    )]
1875    #[test_case(
1876        RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack3), None =>
1877        metrics_registry::StackMigrationStateMetricDimensionMigrationState::RolledBack;
1878        "rolled_back"
1879    )]
1880    #[test_case(
1881        RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack2), None =>
1882        metrics_registry::StackMigrationStateMetricDimensionMigrationState::NotStarted;
1883        "rolled_back_becomes_not_started"
1884    )]
1885    #[test_case(
1886        RollbackNetstackVersion::ForceNetstack2Indefinitely, None, None =>
1887        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Abandoned;
1888        "abandoned"
1889    )]
1890    #[fuchsia::test]
1891    fn test_state_metric(
1892        current_boot: RollbackNetstackVersion,
1893        automated: Option<NetstackVersion>,
1894        rollback: Option<rollback::Persisted>,
1895    ) -> metrics_registry::StackMigrationStateMetricDimensionMigrationState {
1896        let mut migration = Migration::new(
1897            InMemory::with_persisted(Persisted {
1898                user: None,
1899                automated,
1900                rollback,
1901                rollback_history: None,
1902            }),
1903            NoCollaborativeReboot,
1904            NoCrashReports,
1905        );
1906        migration.current_boot = current_boot;
1907        compute_state_metric(&migration)
1908    }
1909
1910    /// An in-memory mock-persistence that triggers an event once the target
1911    /// state has been persisted.
1912    #[derive(Clone)]
1913    struct AwaitPersisted {
1914        file: Rc<RefCell<Option<Vec<u8>>>>,
1915        target: Vec<u8>,
1916        event: Event,
1917    }
1918
1919    impl AwaitPersisted {
1920        fn with_persisted(start: Persisted, target: &Persisted) -> (Self, EventWait) {
1921            let event = Event::new();
1922            let wait = event.wait();
1923            let target_bytes = serde_json::to_vec(target).expect("failed to serialize target");
1924            let mut s = Self { file: Default::default(), target: target_bytes, event };
1925            start.save(s.open_writer().unwrap());
1926            (s, wait)
1927        }
1928    }
1929
1930    impl PersistenceProvider for AwaitPersisted {
1931        type Writer = Self;
1932        type Reader = std::io::Cursor<Vec<u8>>;
1933
1934        fn open_writer(&mut self) -> std::io::Result<Self::Writer> {
1935            *self.file.borrow_mut() = Some(Vec::new());
1936            Ok(self.clone())
1937        }
1938
1939        fn open_reader(&self) -> std::io::Result<Self::Reader> {
1940            self.file
1941                .borrow()
1942                .clone()
1943                .map(std::io::Cursor::new)
1944                .ok_or_else(|| std::io::ErrorKind::NotFound.into())
1945        }
1946    }
1947
1948    impl std::io::Write for AwaitPersisted {
1949        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
1950            let r = self.file.borrow_mut().as_mut().expect("no file open").write(buf);
1951            if self.file.borrow().as_ref().expect("no_file_open") == &self.target {
1952                let _: bool = self.event.signal();
1953            }
1954            r
1955        }
1956
1957        fn flush(&mut self) -> std::io::Result<()> {
1958            Ok(())
1959        }
1960    }
1961
1962    #[fuchsia::test]
1963    async fn migrate_to_ns3_success() {
1964        let start = Persisted {
1965            user: None,
1966            automated: Some(NetstackVersion::Netstack3),
1967            rollback: None,
1968            rollback_history: None,
1969        };
1970        let target = Persisted {
1971            user: None,
1972            automated: Some(NetstackVersion::Netstack3),
1973            rollback: Some(rollback::Persisted::Success),
1974            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }),
1975        };
1976
1977        let (persistence, mut wait) = AwaitPersisted::with_persisted(start, &target);
1978        let mut migration = Migration::new(persistence, NoCollaborativeReboot, NoCrashReports);
1979        // No service requests.
1980        let service_request_stream = futures::stream::pending();
1981        // A health check that always succeeds.
1982        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
1983            Ok(fnet_http::Response { error: None, status_code: Some(204), ..Default::default() })
1984        });
1985        let healthcheck_tick = futures::stream::once(futures::future::ready(()));
1986
1987        {
1988            let main_fut = main_inner(
1989                &mut migration,
1990                service_request_stream,
1991                mock_healthcheck,
1992                healthcheck_tick,
1993            )
1994            .fuse();
1995            futures::pin_mut!(main_fut);
1996            futures::select!(
1997                () = main_fut => unreachable!("main fut should never exit"),
1998                () = wait => {}
1999            );
2000        }
2001
2002        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::Success));
2003    }
2004
2005    #[test_case[0; "first_failure"]]
2006    #[test_case[MAX_ROLLBACKS_PER_EPOCH-1; "last_failure"]]
2007    #[fuchsia::test]
2008    async fn migrate_to_ns3_fails(rollbacks_in_history: u8) {
2009        let start = Persisted {
2010            user: None,
2011            automated: Some(NetstackVersion::Netstack3),
2012            rollback: None,
2013            rollback_history: Some(RollbackHistory {
2014                epoch: CURRENT_EPOCH,
2015                count: rollbacks_in_history,
2016            }),
2017        };
2018        let target = Persisted {
2019            user: None,
2020            automated: Some(NetstackVersion::Netstack3),
2021            rollback: Some(rollback::Persisted::HealthcheckFailures(
2022                rollback::MAX_FAILED_HEALTHCHECKS,
2023            )),
2024            rollback_history: Some(RollbackHistory {
2025                epoch: CURRENT_EPOCH,
2026                count: rollbacks_in_history,
2027            }),
2028        };
2029
2030        let (persistence, mut wait) = AwaitPersisted::with_persisted(start, &target);
2031        let mut migration = Migration::new(
2032            persistence,
2033            FakeCollaborativeReboot::default(),
2034            FakeCrashReporter::default(),
2035        );
2036        // No service requests.
2037        let service_request_stream = futures::stream::pending();
2038        // A health check that always fails.
2039        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2040            Ok(fnet_http::Response { error: None, status_code: Some(500), ..Default::default() })
2041        });
2042        // Use a non-zero interval so that the Healthcheck code doesn't hog the scheduler.
2043        let healthcheck_tick = fuchsia_async::Interval::new(Duration::from_millis(1).into());
2044
2045        {
2046            let main_fut = main_inner(
2047                &mut migration,
2048                service_request_stream,
2049                mock_healthcheck,
2050                healthcheck_tick,
2051            )
2052            .fuse();
2053            futures::pin_mut!(main_fut);
2054            futures::select!(
2055                () = main_fut => unreachable!("main fut should never exit"),
2056                () = wait => {}
2057            );
2058        }
2059
2060        let failure_count = assert_matches!(
2061            migration.persisted.rollback,
2062            Some(rollback::Persisted::HealthcheckFailures(f)) => f
2063        );
2064        assert!(
2065            failure_count >= rollback::MAX_FAILED_HEALTHCHECKS,
2066            "failure_count={failure_count}"
2067        );
2068
2069        // Verify a failed migration schedules a collaborative reboot.
2070        let cr_req = &migration.collaborative_reboot.scheduler.req;
2071        assert_eq!(Ok(false), cr_req.as_ref().expect("there should be a request").is_closed());
2072
2073        // Verify crash reports were filed for each failed healthcheck above
2074        // the threshold.
2075        let expected_num_crash_reports =
2076            failure_count - HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS + 1;
2077        assert_eq!(
2078            migration.crash_reporter.reports,
2079            vec![CrashReportReason::FailedHealthcheck; expected_num_crash_reports]
2080        )
2081    }
2082
2083    // Regression test for https://fxbug.dev/395913604.
2084    //
2085    // The original bug would reset the number of failed healthchecks in
2086    // persistence from `rollback::MAX_FAILED_HEALTHCHECKS` to 0, if an inbound
2087    // service request was received.
2088    //
2089    // Verify this is no longer the case by triggering a failed healthcheck,
2090    // pushing the total to `rollback::MAX_FAILED_HEALTHCHECKS`, then sending
2091    // a `fuchsia.net.migration/State.GetNetstackVersion` request.
2092    #[fuchsia::test]
2093    async fn migrate_to_ns3_rollback_regression_test() {
2094        let start = Persisted {
2095            user: None,
2096            automated: Some(NetstackVersion::Netstack3),
2097            rollback: Some(rollback::Persisted::HealthcheckFailures(
2098                rollback::MAX_FAILED_HEALTHCHECKS - 1,
2099            )),
2100            rollback_history: None,
2101        };
2102        let target = Persisted {
2103            user: None,
2104            automated: Some(NetstackVersion::Netstack3),
2105            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2106            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }),
2107        };
2108
2109        let (persistence, wait) = AwaitPersisted::with_persisted(start, &target);
2110        let mut migration = Migration::new(
2111            persistence,
2112            FakeCollaborativeReboot::default(),
2113            FakeCrashReporter::default(),
2114        );
2115        // A health check that always fails.
2116        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2117            Ok(fnet_http::Response { error: None, status_code: Some(500), ..Default::default() })
2118        });
2119        let healthcheck_tick = futures::stream::once(futures::future::ready(()));
2120        // Send "get" requests, to trigger the bug.
2121        let (client, server) =
2122            fidl::endpoints::create_proxy_and_stream::<fnet_migration::StateMarker>();
2123        let service_request_stream = server.map(|r| r.map(ServiceRequest::State));
2124        let client_fut = async move {
2125            // Send multiple get requests, to ensure that at least one would occur after the failed
2126            // healthcheck.
2127            let mut stream = fuchsia_async::Interval::new(Duration::from_millis(1).into());
2128            while let Some(()) = stream.next().await {
2129                let _ =
2130                    client.get_netstack_version().await.expect("failed to get netstack version");
2131            }
2132        }
2133        .fuse();
2134
2135        // If wait were to fire, the bug has occurred. Instead expect a timeout.
2136        // Use 1 second to keep the test runtime short; If CQ has a hiccup and
2137        // pauses execution, we'd see a false negative, which isn't a big deal.
2138        let wait_fut = wait
2139            .map(|()| panic!("unexpectedly observed the persisted healthcheck failures reset to 0"))
2140            .on_timeout(Duration::from_secs(1), || ())
2141            .fuse();
2142
2143        {
2144            let main_fut = main_inner(
2145                &mut migration,
2146                service_request_stream,
2147                mock_healthcheck,
2148                healthcheck_tick,
2149            )
2150            .fuse();
2151            futures::pin_mut!(main_fut);
2152            futures::pin_mut!(client_fut);
2153            futures::pin_mut!(wait_fut);
2154            futures::select!(
2155                () = main_fut => unreachable!("main fut should never exit"),
2156                () = client_fut => unreachable!("client fut should never exit"),
2157                () = wait_fut => {}
2158            );
2159        }
2160    }
2161
2162    #[fuchsia::test]
2163    async fn startup_after_first_rollback() {
2164        let start = Persisted {
2165            user: None,
2166            automated: Some(NetstackVersion::Netstack3),
2167            rollback: Some(rollback::Persisted::HealthcheckFailures(
2168                rollback::MAX_FAILED_HEALTHCHECKS,
2169            )),
2170            rollback_history: None,
2171        };
2172        let target = Persisted {
2173            user: None,
2174            automated: Some(NetstackVersion::Netstack3),
2175            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2176            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 1 }),
2177        };
2178
2179        let (persistence, persistence_wait) = AwaitPersisted::with_persisted(start, &target);
2180        let (crash_reporter, crash_reporter_wait) =
2181            AwaitCrashReports::new(vec![CrashReportReason::RolledBack]);
2182        let mut migration =
2183            Migration::new(persistence, FakeCollaborativeReboot::default(), crash_reporter);
2184        // No service requests.
2185        let service_request_stream = futures::stream::pending();
2186        // No Healthchecks.
2187        let healthcheck_tick = futures::stream::pending();
2188        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2189            panic!("This test isn't exercising healthchecks");
2190        });
2191        {
2192            let main_fut = main_inner(
2193                &mut migration,
2194                service_request_stream,
2195                mock_healthcheck,
2196                healthcheck_tick,
2197            )
2198            .fuse();
2199            futures::pin_mut!(main_fut);
2200            let mut wait = futures::future::join(persistence_wait, crash_reporter_wait);
2201            futures::select!(
2202                () = main_fut => unreachable!("main fut should never exit"),
2203                ((), ()) = wait => {}
2204            );
2205        }
2206
2207        // The number of Healthcheck Failures should have been reset in
2208        // preparation for the next migration attempt.
2209        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(0)));
2210        // The rollback history should have been updated to reflect this
2211        // failure.
2212        assert_eq!(
2213            migration.persisted.rollback_history,
2214            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 1 })
2215        );
2216        // A Collaborative Reboot request should be scheduled, so that we can
2217        // attempt the migration again.
2218        let cr_req = &migration.collaborative_reboot.scheduler.req;
2219        assert_eq!(Ok(false), cr_req.as_ref().expect("there should be a request").is_closed());
2220        // The current_boot should reflect that we've rolled back to NS2
2221        // temporarily.
2222        assert_eq!(migration.current_boot, RollbackNetstackVersion::ForceNetstack2);
2223        // A rollback crash report should have been filed.
2224        assert_eq!(migration.crash_reporter.reports, vec![CrashReportReason::RolledBack],);
2225    }
2226
2227    #[fuchsia::test]
2228    async fn startup_after_final_rollback() {
2229        let start = Persisted {
2230            user: None,
2231            automated: Some(NetstackVersion::Netstack3),
2232            rollback: Some(rollback::Persisted::HealthcheckFailures(
2233                rollback::MAX_FAILED_HEALTHCHECKS,
2234            )),
2235            rollback_history: Some(RollbackHistory {
2236                epoch: CURRENT_EPOCH,
2237                count: MAX_ROLLBACKS_PER_EPOCH - 1,
2238            }),
2239        };
2240        let target = Persisted {
2241            user: None,
2242            automated: Some(NetstackVersion::Netstack3),
2243            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2244            rollback_history: Some(RollbackHistory {
2245                epoch: CURRENT_EPOCH,
2246                count: MAX_ROLLBACKS_PER_EPOCH,
2247            }),
2248        };
2249
2250        let (persistence, persistence_wait) = AwaitPersisted::with_persisted(start, &target);
2251        let (crash_reporter, crash_reporter_wait) =
2252            AwaitCrashReports::new(vec![CrashReportReason::RolledBack]);
2253        let mut migration =
2254            Migration::new(persistence, FakeCollaborativeReboot::default(), crash_reporter);
2255        // No service requests.
2256        let service_request_stream = futures::stream::pending();
2257        // No Healthchecks.
2258        let healthcheck_tick = futures::stream::pending();
2259        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2260            panic!("This test isn't exercising healthchecks");
2261        });
2262        {
2263            let main_fut = main_inner(
2264                &mut migration,
2265                service_request_stream,
2266                mock_healthcheck,
2267                healthcheck_tick,
2268            )
2269            .fuse();
2270            futures::pin_mut!(main_fut);
2271            let mut wait = futures::future::join(persistence_wait, crash_reporter_wait);
2272            futures::select!(
2273                () = main_fut => unreachable!("main fut should never exit"),
2274                ((), ()) = wait => {}
2275            );
2276        }
2277
2278        // The number of Healthcheck Failures should have been reset in
2279        // preparation for the next migration attempt (in a later epoch).
2280        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(0)));
2281        // The rollback history should have been updated to reflect this
2282        // failure.
2283        assert_eq!(
2284            migration.persisted.rollback_history,
2285            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH })
2286        );
2287        // A collaborative Reboot request should NOT be scheduled. We're not
2288        // going to re-attempt the migration until the epoch advances.
2289        assert_eq!(migration.collaborative_reboot.scheduler.req, None);
2290        // The current_boot should reflect that we've rolled back to NS2
2291        // indefinitely.
2292        assert_eq!(migration.current_boot, RollbackNetstackVersion::ForceNetstack2Indefinitely);
2293        // A rollback crash report should have been filed.
2294        assert_eq!(migration.crash_reporter.reports, vec![CrashReportReason::RolledBack]);
2295    }
2296
2297    // This test simulates the device starting up while we were already
2298    // in the `ForceNetstack2Indefinitely` state. E.g. the current boot isn't
2299    // a rollback, rather it's continuing to use Netstack2, as established in
2300    // a previous boot.
2301    #[fuchsia::test]
2302    async fn startup_already_with_max_rollbacks_per_epoch() {
2303        let persistence = InMemory::with_persisted(Persisted {
2304            user: None,
2305            automated: Some(NetstackVersion::Netstack3),
2306            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2307            rollback_history: Some(RollbackHistory {
2308                epoch: CURRENT_EPOCH,
2309                count: MAX_ROLLBACKS_PER_EPOCH,
2310            }),
2311        });
2312        // No crash report should get filed because this boot isn't a rollback.
2313        let crash_reporter = NoCrashReports;
2314        let mut migration =
2315            Migration::new(persistence, FakeCollaborativeReboot::default(), crash_reporter);
2316        // No service requests.
2317        let service_request_stream = futures::stream::pending();
2318        // No Healthchecks.
2319        let healthcheck_tick = futures::stream::pending();
2320        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2321            panic!("This test isn't exercising healthchecks");
2322        });
2323
2324        // Drive the main future for a fixed timeout. It will never exit, but
2325        // we want to give the migration machinery enough time to apply any
2326        // changes it wants to make (it shouldn't make any).
2327        main_inner(&mut migration, service_request_stream, mock_healthcheck, healthcheck_tick)
2328            .map(Err)
2329            .on_timeout(Duration::from_secs(1), || Ok(()))
2330            .await
2331            .expect("main fut should never exit");
2332
2333        // None of the persisted state should have changed.
2334        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(0)));
2335        assert_eq!(
2336            migration.persisted.rollback_history,
2337            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH })
2338        );
2339        // A collaborative Reboot request should NOT be scheduled.
2340        assert_eq!(migration.collaborative_reboot.scheduler.req, None);
2341        // The current_boot should reflect that we've rolled back to NS2
2342        // indefinitely.
2343        assert_eq!(migration.current_boot, RollbackNetstackVersion::ForceNetstack2Indefinitely);
2344    }
2345
2346    #[fuchsia::test]
2347    async fn startup_with_new_epoch() {
2348        let start = Persisted {
2349            user: None,
2350            automated: Some(NetstackVersion::Netstack3),
2351            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2352            rollback_history: Some(RollbackHistory {
2353                epoch: CURRENT_EPOCH - 1,
2354                count: MAX_ROLLBACKS_PER_EPOCH,
2355            }),
2356        };
2357        let target = Persisted {
2358            user: None,
2359            automated: Some(NetstackVersion::Netstack3),
2360            rollback: Some(rollback::Persisted::HealthcheckFailures(1)),
2361            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }),
2362        };
2363
2364        let (persistence, mut wait) = AwaitPersisted::with_persisted(start, &target);
2365        let mut migration =
2366            Migration::new(persistence, FakeCollaborativeReboot::default(), NoCrashReports);
2367        // No service requests.
2368        let service_request_stream = futures::stream::pending();
2369        // No Healthchecks.
2370        let healthcheck_tick = futures::stream::pending();
2371        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2372            panic!("This test isn't exercising healthchecks");
2373        });
2374        {
2375            let main_fut = main_inner(
2376                &mut migration,
2377                service_request_stream,
2378                mock_healthcheck,
2379                healthcheck_tick,
2380            )
2381            .fuse();
2382            futures::pin_mut!(main_fut);
2383            futures::select!(
2384                () = main_fut => unreachable!("main fut should never exit"),
2385                () = wait => {}
2386            );
2387        }
2388
2389        // The number of Healthcheck Failures should have been incremented,
2390        // indicating the rollback mechanism is trying to migrate to NS3.
2391        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(1)));
2392        // The rollback history should have been updated to reflect the new
2393        // epoch.
2394        assert_eq!(
2395            migration.persisted.rollback_history,
2396            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 })
2397        );
2398        // A collaborative Reboot request should NOT be scheduled. We're already
2399        // running Nestack3!
2400        assert_eq!(migration.collaborative_reboot.scheduler.req, None);
2401        // The current_boot should reflect that we're retrying migrations to NS3.
2402        assert_eq!(migration.current_boot, RollbackNetstackVersion::Netstack3);
2403    }
2404
2405    #[fuchsia::test]
2406    async fn noop_rollback_state_update() {
2407        let rollback_state = rollback::Persisted::HealthcheckFailures(
2408            HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS,
2409        );
2410        let mut m = Migration::new(
2411            InMemory::with_persisted(Persisted {
2412                user: None,
2413                automated: Some(NetstackVersion::Netstack3),
2414                rollback: Some(rollback_state),
2415                rollback_history: None,
2416            }),
2417            // Expect not to see a Collaborative Reboot or a Crash Report.
2418            NoCollaborativeReboot,
2419            NoCrashReports,
2420        );
2421        m.update_rollback_state(rollback_state).await;
2422    }
2423}