Skip to main content

stack_migration/
main.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5mod rollback;
6
7use std::pin::{Pin, pin};
8use std::time::Duration;
9
10use cobalt_client::traits::AsEventCode as _;
11use fidl_fuchsia_metrics as fmetrics;
12use fidl_fuchsia_net_stackmigrationdeprecated as fnet_migration;
13use fidl_fuchsia_power_internal as fpower;
14use fuchsia_async::Task;
15use fuchsia_component::server::{ServiceFs, ServiceFsDir};
16use fuchsia_inspect::Property as _;
17use futures::channel::mpsc;
18use futures::{FutureExt as _, Stream, StreamExt as _};
19use log::{error, info, warn};
20use networking_metrics_registry::networking_metrics_registry as metrics_registry;
21
22const DEFAULT_NETSTACK: NetstackVersion = NetstackVersion::Netstack3;
23
24/// The number of times a device is allowed to fail to migrate to Netstack3,
25/// before abandoning migration attempts during the `CURRENT_EPOCH`.
26const MAX_ROLLBACKS_PER_EPOCH: u8 = 3;
27
28/// The "epoch" is an arbitrary interval used to limit the number of attempts a
29/// device will make to migrate to Netstack3. Once the epoch increases, the
30/// device will resume attempts to migrate to Netstack3.
31///
32/// Pragmatically the intention is to increment this number every time a Fuchsia
33/// Release is cut that contains a known fix to a Netstack3 bug.
34const CURRENT_EPOCH: u32 = 3;
35
36/// The number of failed healthchecks at which we begin generating crash
37/// reports.
38///
39/// By setting this to one less than the `rollback::MAX_FAILED_HEALTHCHECKS`, we
40/// ensure each rollback event will have two "failed healthcheck" reports from
41/// the previous boot.
42const HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS: usize =
43    crate::rollback::MAX_FAILED_HEALTHCHECKS - 1;
44
45#[derive(Debug, Clone, Copy, serde::Deserialize, serde::Serialize, Eq, PartialEq)]
46enum NetstackVersion {
47    Netstack2,
48    Netstack3,
49}
50
51impl NetstackVersion {
52    fn inspect_uint_value(&self) -> u64 {
53        match self {
54            Self::Netstack2 => 2,
55            Self::Netstack3 => 3,
56        }
57    }
58
59    fn optional_inspect_uint_value(o: &Option<Self>) -> u64 {
60        o.as_ref().map(Self::inspect_uint_value).unwrap_or(0)
61    }
62}
63
64impl From<fnet_migration::NetstackVersion> for NetstackVersion {
65    fn from(value: fnet_migration::NetstackVersion) -> Self {
66        match value {
67            fnet_migration::NetstackVersion::Netstack2 => NetstackVersion::Netstack2,
68            fnet_migration::NetstackVersion::Netstack3 => NetstackVersion::Netstack3,
69        }
70    }
71}
72
73impl From<NetstackVersion> for fnet_migration::NetstackVersion {
74    fn from(value: NetstackVersion) -> Self {
75        match value {
76            NetstackVersion::Netstack2 => fnet_migration::NetstackVersion::Netstack2,
77            NetstackVersion::Netstack3 => fnet_migration::NetstackVersion::Netstack3,
78        }
79    }
80}
81
82impl From<NetstackVersion> for Box<fnet_migration::VersionSetting> {
83    fn from(value: NetstackVersion) -> Self {
84        Box::new(fnet_migration::VersionSetting { version: value.into() })
85    }
86}
87
/// The Netstack version chosen for a boot, including whether that choice was
/// forced by the rollback mechanism.
#[derive(Debug, PartialEq)]
enum RollbackNetstackVersion {
    /// Netstack2 was selected without any rollback being involved.
    Netstack2,
    /// The automated setting requested Netstack3, but the persisted state
    /// indicates that the previous boot failed to migrate to Netstack3 (it had
    /// too many health check failures). Netstack2 is forcibly used for this
    /// boot.
    ForceNetstack2,
    /// The automated setting requested Netstack3, but too many migration
    /// attempts have already failed in the `CURRENT_EPOCH`. Netstack2 is
    /// forcibly used until the next epoch.
    ForceNetstack2Indefinitely,
    /// Netstack3 was selected.
    Netstack3,
}
101
102impl RollbackNetstackVersion {
103    // Convert into a `NetstackVersion`, while honoring the forced setting.
104    fn version(&self) -> NetstackVersion {
105        match self {
106            Self::Netstack2 | Self::ForceNetstack2 | Self::ForceNetstack2Indefinitely => {
107                NetstackVersion::Netstack2
108            }
109            Self::Netstack3 => NetstackVersion::Netstack3,
110        }
111    }
112
113    // Convert into a `NetstackVersion`, while ignoring the forced setting.
114    fn version_ignoring_force(&self) -> NetstackVersion {
115        match self {
116            Self::Netstack2 => NetstackVersion::Netstack2,
117            Self::Netstack3 | Self::ForceNetstack2 | Self::ForceNetstack2Indefinitely => {
118                NetstackVersion::Netstack3
119            }
120        }
121    }
122}
123
124impl From<NetstackVersion> for RollbackNetstackVersion {
125    fn from(version: NetstackVersion) -> Self {
126        match version {
127            NetstackVersion::Netstack2 => RollbackNetstackVersion::Netstack2,
128            NetstackVersion::Netstack3 => RollbackNetstackVersion::Netstack3,
129        }
130    }
131}
132
133#[derive(Default, Debug, serde::Deserialize, serde::Serialize)]
134#[cfg_attr(test, derive(Eq, PartialEq))]
135struct Persisted {
136    automated: Option<NetstackVersion>,
137    user: Option<NetstackVersion>,
138    rollback: Option<rollback::Persisted>,
139    rollback_history: Option<RollbackHistory>,
140}
141
142impl Persisted {
143    fn load<R: std::io::Read>(r: R) -> Self {
144        serde_json::from_reader(std::io::BufReader::new(r)).unwrap_or_else(|e| {
145            error!("error loading persisted config {e:?}, using defaults");
146            Persisted::default()
147        })
148    }
149
150    fn save<W: std::io::Write>(&self, w: W) {
151        serde_json::to_writer(w, self).unwrap_or_else(|e: serde_json::Error| {
152            error!("error persisting configuration {self:?}: {e:?}")
153        })
154    }
155
156    // Determine the desired NetstackVersion based on the persisted values
157    fn desired_netstack_version(&self) -> RollbackNetstackVersion {
158        match self {
159            // Always prefer the user setting, if present.
160            Persisted { user: Some(user), automated: _, rollback: _, rollback_history: _ } => {
161                (*user).into()
162            }
163            // If the automated setting is Netstack2, honor it unconditionally.
164            Persisted {
165                user: None,
166                automated: Some(NetstackVersion::Netstack2),
167                rollback: _,
168                rollback_history: _,
169            } => RollbackNetstackVersion::Netstack2,
170            // If the automated setting is Netstack3, we need to first check the
171            // rollback state.
172            Persisted {
173                user: None,
174                automated: Some(NetstackVersion::Netstack3),
175                rollback,
176                rollback_history,
177            } => {
178                let rollback_count = match rollback_history {
179                    None => 0,
180                    Some(RollbackHistory { epoch, count }) => {
181                        if *epoch == CURRENT_EPOCH {
182                            *count
183                        } else {
184                            // Ignore failed rollbacks in an different epoch.
185                            0
186                        }
187                    }
188                };
189                let healthcheck_failure_count = match rollback {
190                    None => 0,
191                    Some(rollback::Persisted::Success) => 0,
192                    Some(rollback::Persisted::HealthcheckFailures(count)) => *count,
193                };
194
195                if rollback_count >= MAX_ROLLBACKS_PER_EPOCH {
196                    // If we've failed all of our allotted migration attempts
197                    // abandon further migration attempts in this epoch.
198                    RollbackNetstackVersion::ForceNetstack2Indefinitely
199                } else if healthcheck_failure_count >= rollback::MAX_FAILED_HEALTHCHECKS {
200                    // If we've failed all the healthchecks in the current
201                    // migration attempt, temporarily rollback to Netstack2.
202                    RollbackNetstackVersion::ForceNetstack2
203                } else {
204                    // Otherwise, proceed as normal with Netstack3.
205                    RollbackNetstackVersion::Netstack3
206                }
207            }
208            // Use the default version if nothing is set.
209            Persisted { user: None, automated: None, rollback: _, rollback_history: _ } => {
210                DEFAULT_NETSTACK.into()
211            }
212        }
213    }
214}
215
216/// A history of rollbacks that have occurred on the device.
217#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
218pub(crate) struct RollbackHistory {
219    /// The epoch during which the rollback occurred.
220    epoch: u32,
221    /// The number of rollbacks that took place.
222    count: u8,
223}
224
225impl Default for RollbackHistory {
226    fn default() -> Self {
227        RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }
228    }
229}
230
231enum ServiceRequest {
232    Control(fnet_migration::ControlRequest),
233    State(fnet_migration::StateRequest),
234}
235
236struct Migration<P, CR, R> {
237    current_boot: RollbackNetstackVersion,
238    persisted: Persisted,
239    persistence: P,
240    collaborative_reboot: CollaborativeReboot<CR>,
241    crash_reporter: R,
242}
243
/// An abstraction over the storage that holds the serialized `Persisted`
/// state.
trait PersistenceProvider {
    /// The type used to write the serialized state.
    type Writer: std::io::Write;
    /// The type used to read the serialized state back.
    type Reader: std::io::Read;

    /// Opens a writer for storing the state.
    fn open_writer(&mut self) -> std::io::Result<Self::Writer>;
    /// Opens a reader for loading the state.
    fn open_reader(&self) -> std::io::Result<Self::Reader>;
}
251
252fn try_persist<P: PersistenceProvider>(provider: &mut P, data: &Persisted) {
253    let w = match provider.open_writer() {
254        Ok(w) => w,
255        Err(e) => {
256            error!("failed to open writer to persist settings: {e:?}");
257            return;
258        }
259    };
260    data.save(w);
261}
262
/// A `PersistenceProvider` backed by the file at `PERSISTED_FILE_PATH`.
struct DataPersistenceProvider {}
264
/// Path of the file that holds the serialized persisted state.
const PERSISTED_FILE_PATH: &str = "/data/config.json";
266
267impl PersistenceProvider for DataPersistenceProvider {
268    type Writer = std::fs::File;
269    type Reader = std::fs::File;
270
271    fn open_writer(&mut self) -> std::io::Result<Self::Writer> {
272        std::fs::File::create(PERSISTED_FILE_PATH)
273    }
274
275    fn open_reader(&self) -> std::io::Result<Self::Reader> {
276        std::fs::File::open(PERSISTED_FILE_PATH)
277    }
278}
279
280struct CollaborativeReboot<CR> {
281    scheduler: CR,
282    /// `Some(<cancellation_token>)` if there's an outstanding collaborative
283    /// reboot scheduled.
284    scheduled_req: Option<zx::EventPair>,
285}
286
287impl<CR: CollaborativeRebootScheduler> CollaborativeReboot<CR> {
288    /// Schedules a collaborative reboot.
289    ///
290    /// No-Op if there's already a reboot scheduled.
291    async fn schedule(&mut self) {
292        let Self { scheduler, scheduled_req } = self;
293        if scheduled_req.is_some() {
294            // We already have an outstanding request.
295            return;
296        }
297
298        info!("Scheduling collaborative reboot");
299        let (mine, theirs) = zx::EventPair::create();
300        *scheduled_req = Some(mine);
301        scheduler
302            .schedule(fpower::CollaborativeRebootReason::NetstackMigration, Some(theirs))
303            .await;
304    }
305
306    /// Cancels the currently scheduled collaborative Reboot.
307    ///
308    /// No-Op if there's none scheduled.
309    fn cancel(&mut self) {
310        if let Some(cancel) = self.scheduled_req.take() {
311            info!("Canceling collaborative reboot request. It's no longer necessary.");
312            // Dropping the eventpair cancels the request.
313            std::mem::drop(cancel);
314        }
315    }
316}
317
318/// An abstraction over the `fpower::CollaborativeRebootScheduler` FIDL API.
319trait CollaborativeRebootScheduler {
320    async fn schedule(
321        &mut self,
322        reason: fpower::CollaborativeRebootReason,
323        cancel: Option<zx::EventPair>,
324    );
325}
326
/// The `CollaborativeRebootScheduler` implementation that talks to the real
/// API over FIDL.
struct Scheduler {}
330
331impl CollaborativeRebootScheduler for Scheduler {
332    async fn schedule(
333        &mut self,
334        reason: fpower::CollaborativeRebootReason,
335        cancel: Option<zx::EventPair>,
336    ) {
337        let proxy = match fuchsia_component::client::connect_to_protocol::<
338            fpower::CollaborativeRebootSchedulerMarker,
339        >() {
340            Ok(proxy) => proxy,
341            Err(e) => {
342                error!("Failed to connect to collaborative reboot scheduler: {e:?}");
343                return;
344            }
345        };
346        match proxy.schedule_reboot(reason, cancel).await {
347            Ok(()) => {}
348            Err(e) => error!("Failed to schedule collaborative reboot: {e:?}"),
349        }
350    }
351}
352
/// Why a crash report is being filed.
#[derive(Clone, Debug, Eq, PartialEq)]
enum CrashReportReason {
    /// A healthcheck failed.
    FailedHealthcheck,
    /// The device rolled back.
    RolledBack,
}
359
360/// An abstraction over the `fidl_fuchsia_feedback::CrashReporter` FIDL API.
361trait CrashReporter {
362    /// The amount of time after startup to wait before filling a rollback
363    /// induced crash report.
364    const ROLLBACK_CRASH_REPORT_DELAY: Duration;
365
366    fn file_report(&mut self, reason: CrashReportReason);
367}
368
/// The `CrashReporter` implementation that talks to the real API over FIDL.
struct Reporter {}
371
372impl CrashReporter for Reporter {
373    const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::from_secs(180);
374
375    fn file_report(&mut self, reason: CrashReportReason) {
376        const PROGRAM_NAME: &str = "netstack_migration";
377        const FAILED_HEALTHCHECK_SIGNATURE: &str = "fuchsia-netstack3-healthcheck-failed";
378        const ROLLED_BACK_SIGNATURE: &str = "fuchsia-netstack3-rolled-back";
379
380        info!("Generating a crash report for reason: {reason:?}");
381
382        let proxy = match fuchsia_component::client::connect_to_protocol::<
383            fidl_fuchsia_feedback::CrashReporterMarker,
384        >() {
385            Ok(proxy) => proxy,
386            Err(e) => {
387                error!("Failed to connect to CrashReport proxy: {e:?}");
388                return;
389            }
390        };
391
392        let signature = match reason {
393            CrashReportReason::FailedHealthcheck => FAILED_HEALTHCHECK_SIGNATURE,
394            CrashReportReason::RolledBack => ROLLED_BACK_SIGNATURE,
395        };
396        let report = fidl_fuchsia_feedback::CrashReport {
397            program_name: Some(PROGRAM_NAME.to_string()),
398            program_uptime: Some(zx::MonotonicInstant::get().into_nanos()),
399            crash_signature: Some(signature.to_string()),
400            is_fatal: Some(false),
401            ..Default::default()
402        };
403
404        // File the crash report in a background task. It may take a while and
405        // we don't want to block the migration worker from serving other
406        // operations.
407        Task::spawn(proxy.file_report(report).map(|result| match result {
408            Ok(Ok(status)) => info!("Successfully filed crash report. Status={status:?}"),
409            Ok(Err(e)) => error!("Failed to file crash report: {e:?}"),
410            Err(e) => error!("Failed to request crash report: {e:?}"),
411        }))
412        .detach();
413    }
414}
415
416impl<P: PersistenceProvider, CR: CollaborativeRebootScheduler, R: CrashReporter>
417    Migration<P, CR, R>
418{
419    fn new(mut persistence: P, cr_scheduler: CR, crash_reporter: R) -> Self {
420        let mut persisted = persistence.open_reader().map(Persisted::load).unwrap_or_else(|e| {
421            warn!("could not open persistence reader: {e:?}. using defaults");
422            Persisted::default()
423        });
424
425        // Routine to update the rollback history on startup based on persisted
426        // values from the previous boot.
427        //
428        // This performs two important operations:
429        //   1) reset the failure count if the epoch has advanced, and
430        //   2) increment the failure count if the previous boot failed to
431        //      migrate.
432        let mut new = match persisted.rollback_history {
433            None => RollbackHistory::default(),
434            Some(RollbackHistory { epoch, count }) if epoch != CURRENT_EPOCH => {
435                if count > 0 {
436                    info!("Reseting Rollback History for new epoch");
437                }
438                RollbackHistory::default()
439            }
440            Some(history) => history,
441        };
442        if let Some(rollback::Persisted::HealthcheckFailures(failures)) = persisted.rollback {
443            if failures >= rollback::MAX_FAILED_HEALTHCHECKS {
444                new.count = new.count.saturating_add(1);
445            }
446        }
447
448        if Some(new) != persisted.rollback_history {
449            persisted.rollback_history = Some(new);
450            try_persist(&mut persistence, &persisted);
451        }
452
453        info!("Netstack Migration starting with persisted state: {persisted:?}");
454
455        let current_boot = persisted.desired_netstack_version();
456
457        match current_boot {
458            RollbackNetstackVersion::ForceNetstack2 => {
459                warn!(
460                    "Previous boot failed to migrate to Netstack3. \
461                    Ignoring automated setting and forcibly using Netstack2 \
462                    for the current boot."
463                );
464            }
465            RollbackNetstackVersion::ForceNetstack2Indefinitely => {
466                warn!(
467                    "Previous boot failed to migrate to Netstack3 and the \
468                    rollback limit has been exceeded. Ignoring automated \
469                    setting and forcibly using Netstack2 until the next epoch."
470                );
471            }
472            RollbackNetstackVersion::Netstack2 | RollbackNetstackVersion::Netstack3 => {}
473        }
474
475        Self {
476            current_boot,
477            persisted,
478            persistence,
479            collaborative_reboot: CollaborativeReboot {
480                scheduler: cr_scheduler,
481                scheduled_req: None,
482            },
483            crash_reporter,
484        }
485    }
486
487    fn persist(&mut self) {
488        let Self {
489            current_boot: _,
490            persisted,
491            persistence,
492            collaborative_reboot: _,
493            crash_reporter: _,
494        } = self;
495        try_persist(persistence, persisted);
496    }
497
498    fn map_version_setting(
499        version: Option<Box<fnet_migration::VersionSetting>>,
500    ) -> Option<NetstackVersion> {
501        version.map(|v| {
502            let fnet_migration::VersionSetting { version } = &*v;
503            (*version).into()
504        })
505    }
506
507    async fn update_collaborative_reboot(&mut self) {
508        let Self {
509            current_boot,
510            persisted,
511            persistence: _,
512            collaborative_reboot,
513            crash_reporter: _,
514        } = self;
515        if persisted.desired_netstack_version().version() != current_boot.version() {
516            // When the current boot differs from our desired version, schedule
517            // a reboot (if there's not already one).
518            collaborative_reboot.schedule().await
519        } else {
520            // When the current_boot matches our desired version, we no longer
521            // need reboot. Cancel the outstanding request (if any)
522            collaborative_reboot.cancel()
523        }
524    }
525
526    async fn update_rollback_state(&mut self, new_state: rollback::Persisted) {
527        if self.persisted.rollback != Some(new_state) {
528            self.persisted.rollback = Some(new_state);
529            self.update_collaborative_reboot().await;
530            self.persist();
531
532            if let rollback::Persisted::HealthcheckFailures(f) = new_state {
533                if f >= HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS {
534                    self.crash_reporter.file_report(CrashReportReason::FailedHealthcheck);
535                }
536            }
537        }
538    }
539
540    async fn handle_control_request(
541        &mut self,
542        req: fnet_migration::ControlRequest,
543    ) -> Result<(), fidl::Error> {
544        match req {
545            fnet_migration::ControlRequest::SetAutomatedNetstackVersion { version, responder } => {
546                let version = Self::map_version_setting(version);
547                let Self {
548                    current_boot: _,
549                    persisted: Persisted { automated, user: _, rollback: _, rollback_history: _ },
550                    persistence: _,
551                    collaborative_reboot: _,
552                    crash_reporter: _,
553                } = self;
554                if version != *automated {
555                    info!("automated netstack version switched to {version:?}");
556                    *automated = version;
557                    self.persist();
558                    self.update_collaborative_reboot().await;
559                }
560                responder.send()
561            }
562            fnet_migration::ControlRequest::SetUserNetstackVersion { version, responder } => {
563                let version = Self::map_version_setting(version);
564                let Self {
565                    current_boot: _,
566                    persisted: Persisted { automated: _, user, rollback: _, rollback_history: _ },
567                    persistence: _,
568                    collaborative_reboot: _,
569                    crash_reporter: _,
570                } = self;
571                if version != *user {
572                    info!("user netstack version switched to {version:?}");
573                    *user = version;
574                    self.persist();
575                    self.update_collaborative_reboot().await;
576                }
577                responder.send()
578            }
579        }
580    }
581
582    fn handle_state_request(&self, req: fnet_migration::StateRequest) -> Result<(), fidl::Error> {
583        let Migration {
584            current_boot,
585            persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
586            persistence: _,
587            collaborative_reboot: _,
588            crash_reporter: _,
589        } = self;
590        match req {
591            fnet_migration::StateRequest::GetNetstackVersion { responder } => {
592                responder.send(&fnet_migration::InEffectVersion {
593                    current_boot: current_boot.version().into(),
594                    user: (*user).map(Into::into),
595                    automated: (*automated).map(Into::into),
596                })
597            }
598        }
599    }
600
601    async fn handle_request(&mut self, req: ServiceRequest) -> Result<(), fidl::Error> {
602        match req {
603            ServiceRequest::Control(r) => self.handle_control_request(r).await,
604            ServiceRequest::State(r) => self.handle_state_request(r),
605        }
606    }
607}
608
609struct InspectNodes {
610    automated_setting: fuchsia_inspect::UintProperty,
611    user_setting: fuchsia_inspect::UintProperty,
612    rollback_state: fuchsia_inspect::StringProperty,
613}
614
615impl InspectNodes {
616    fn new<P, CR, F>(inspector: &fuchsia_inspect::Inspector, m: &Migration<P, CR, F>) -> Self {
617        let root = inspector.root();
618        let Migration {
619            current_boot,
620            persisted: Persisted { automated, user, rollback, rollback_history },
621            ..
622        } = m;
623        let automated_setting = root.create_uint(
624            "automated_setting",
625            NetstackVersion::optional_inspect_uint_value(automated),
626        );
627        let user_setting =
628            root.create_uint("user_setting", NetstackVersion::optional_inspect_uint_value(user));
629
630        let rollback_state = root.create_string("rollback_state", format!("{rollback:?}"));
631
632        // Record immutable state once, instead of keeping track of a property node.
633        root.record_uint("current_boot", current_boot.version().inspect_uint_value());
634        let forced_netstack2 = match current_boot {
635            RollbackNetstackVersion::Netstack2 | RollbackNetstackVersion::Netstack3 => false,
636            RollbackNetstackVersion::ForceNetstack2
637            | RollbackNetstackVersion::ForceNetstack2Indefinitely => true,
638        };
639        root.record_bool("forced_netstack2", forced_netstack2);
640        root.record_uint("current_epoch", CURRENT_EPOCH.into());
641        root.record_string("rollback_history", format!("{rollback_history:?}"));
642
643        Self { automated_setting, user_setting, rollback_state }
644    }
645
646    fn update<P, CR, F>(&self, m: &Migration<P, CR, F>) {
647        let Migration {
648            persisted: Persisted { automated, user, rollback, rollback_history: _ },
649            ..
650        } = m;
651        let Self { automated_setting, user_setting, rollback_state } = self;
652        automated_setting.set(NetstackVersion::optional_inspect_uint_value(automated));
653        user_setting.set(NetstackVersion::optional_inspect_uint_value(user));
654        rollback_state.set(&format!("{rollback:?}"));
655    }
656}
657
658/// Wraps communication with metrics (cobalt) server.
659struct MetricsLogger {
660    logger: Option<fmetrics::MetricEventLoggerProxy>,
661}
662
663impl MetricsLogger {
664    async fn new() -> Self {
665        let (logger, server_end) =
666            fidl::endpoints::create_proxy::<fmetrics::MetricEventLoggerMarker>();
667
668        let factory = match fuchsia_component::client::connect_to_protocol::<
669            fmetrics::MetricEventLoggerFactoryMarker,
670        >() {
671            Ok(f) => f,
672            Err(e) => {
673                warn!("can't connect to logger factory {e:?}");
674                return Self { logger: None };
675            }
676        };
677
678        match factory
679            .create_metric_event_logger(
680                &fmetrics::ProjectSpec {
681                    customer_id: Some(metrics_registry::CUSTOMER_ID),
682                    project_id: Some(metrics_registry::PROJECT_ID),
683                    ..Default::default()
684                },
685                server_end,
686            )
687            .await
688        {
689            Ok(Ok(())) => Self { logger: Some(logger) },
690            Ok(Err(e)) => {
691                warn!("can't create event logger {e:?}");
692                Self { logger: None }
693            }
694            Err(e) => {
695                warn!("error connecting to metric event logger {e:?}");
696                Self { logger: None }
697            }
698        }
699    }
700
701    /// Logs metrics from `migration` to the metrics server.
702    async fn log_metrics<P, CR, F>(&self, migration: &Migration<P, CR, F>) {
703        let logger = if let Some(logger) = self.logger.as_ref() {
704            logger
705        } else {
706            // Silently don't log metrics if we didn't manage to create a
707            // logger, warnings are emitted upon creation.
708            return;
709        };
710
711        let current_boot = match migration.current_boot {
712            RollbackNetstackVersion::Netstack2
713            | RollbackNetstackVersion::ForceNetstack2
714            | RollbackNetstackVersion::ForceNetstack2Indefinitely => {
715                metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack2
716            }
717            RollbackNetstackVersion::Netstack3 => {
718                metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack3
719            }
720        }
721        .as_event_code();
722        let user = match migration.persisted.user {
723            None => metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::NoSelection,
724            Some(NetstackVersion::Netstack2) => metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack2,
725            Some(NetstackVersion::Netstack3) => metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack3,
726        }
727        .as_event_code();
728        let automated = match migration.persisted.automated {
729            None => metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::NoSelection,
730            Some(NetstackVersion::Netstack2) => metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack2,
731            Some(NetstackVersion::Netstack3) => metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack3,
732        }.as_event_code();
733        let rollback_state = compute_state_metric(migration).as_event_code();
734        for (metric_id, event_code) in [
735            (metrics_registry::STACK_MIGRATION_CURRENT_BOOT_METRIC_ID, current_boot),
736            (metrics_registry::STACK_MIGRATION_USER_SETTING_METRIC_ID, user),
737            (metrics_registry::STACK_MIGRATION_AUTOMATED_SETTING_METRIC_ID, automated),
738            (metrics_registry::STACK_MIGRATION_STATE_METRIC_ID, rollback_state),
739        ] {
740            let occurrence_count = 1;
741            logger
742                .log_occurrence(metric_id, occurrence_count, &[event_code][..])
743                .await
744                .map(|r| {
745                    r.unwrap_or_else(|e| warn!("error reported logging metric {metric_id} {e:?}"))
746                })
747                .unwrap_or_else(|fidl_error| {
748                    warn!("error logging metric {metric_id} {fidl_error:?}")
749                });
750        }
751    }
752}
753
754fn compute_state_metric<P, CR, F>(
755    migration: &Migration<P, CR, F>,
756) -> metrics_registry::StackMigrationStateMetricDimensionMigrationState {
757    use metrics_registry::StackMigrationStateMetricDimensionMigrationState as state_metric;
758    let Migration {
759        current_boot,
760        persisted: Persisted { automated, user: _, rollback, rollback_history: _ },
761        persistence: _,
762        collaborative_reboot: _,
763        crash_reporter: _,
764    } = migration;
765
766    match (current_boot, automated, rollback) {
767        (RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack2) | None, _) => {
768            state_metric::NotStarted
769        }
770        (RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack3), _) => {
771            state_metric::Scheduled
772        }
773        (RollbackNetstackVersion::ForceNetstack2Indefinitely, _, _) => state_metric::Abandoned,
774        (RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack3), _) => {
775            state_metric::RolledBack
776        }
777        // NB: If a device observes the automated setting switch to Netstack2
778        // while it's current boot is `ForceNetstack2` it won't have any need
779        // to schedule a reboot, and it will perpetually stay in
780        // `ForceNetstack`. Rather that perpetually reporting
781        // `state_metric::RolledBack`, it's more useful from a diagnostics
782        // perspective to consider this device `state_metric::NotStarted`.
783        (RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack2) | None, _) => {
784            state_metric::NotStarted
785        }
786        (RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack2) | None, _) => {
787            state_metric::Canceled
788        }
789        (RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3), None) => {
790            state_metric::InProgress
791        }
792        (
793            RollbackNetstackVersion::Netstack3,
794            Some(NetstackVersion::Netstack3),
795            Some(rollback::Persisted::HealthcheckFailures(f)),
796        ) => {
797            if *f >= rollback::MAX_FAILED_HEALTHCHECKS {
798                state_metric::Failed
799            } else {
800                state_metric::InProgress
801            }
802        }
803        (
804            RollbackNetstackVersion::Netstack3,
805            Some(NetstackVersion::Netstack3),
806            Some(rollback::Persisted::Success),
807        ) => state_metric::Success,
808    }
809}
810
811#[fuchsia::main]
812pub async fn main() {
813    info!("running netstack migration service");
814
815    let mut fs = ServiceFs::new();
816    let _: &mut ServiceFsDir<'_, _> = fs
817        .dir("svc")
818        .add_fidl_service(|rs: fnet_migration::ControlRequestStream| {
819            rs.map(|req| req.map(ServiceRequest::Control)).left_stream()
820        })
821        .add_fidl_service(|rs: fnet_migration::StateRequestStream| {
822            rs.map(|req| req.map(ServiceRequest::State)).right_stream()
823        });
824    let _: &mut ServiceFs<_> =
825        fs.take_and_serve_directory_handle().expect("failed to take out directory handle");
826
827    let mut migration = Migration::new(DataPersistenceProvider {}, Scheduler {}, Reporter {});
828    main_inner(
829        &mut migration,
830        fs.fuse().flatten_unordered(None),
831        rollback::FidlHttpFetcher::new(),
832        rollback::new_healthcheck_stream(),
833    )
834    .await
835}
836
/// Runs the migration service until the merged action stream ends.
///
/// Publishes inspect data, spawns the rollback task (detached), and then
/// processes, from a single merged stream: incoming service requests, metric
/// logging (once at startup and then once per hour), rollback state updates
/// sent by the rollback task, and, when the previous boot failed to migrate,
/// one delayed rollback crash report.
///
/// Panics if the inspector cannot be published.
async fn main_inner<
    P: PersistenceProvider,
    CR: CollaborativeRebootScheduler,
    R: CrashReporter,
    H: rollback::HttpFetcher + Send + 'static,
    T: Stream<Item = ()> + Send + 'static,
    SR: Stream<Item = Result<ServiceRequest, fidl::Error>>,
>(
    migration: &mut Migration<P, CR, R>,
    service_request_stream: SR,
    http_fetcher: H,
    healthcheck_tick: T,
) {
    let inspector = fuchsia_inspect::component::inspector();
    let _inspect_server =
        inspect_runtime::publish(inspector, inspect_runtime::PublishOptions::default())
            .expect("failed to serve inspector");
    let inspect_nodes = InspectNodes::new(inspector, &migration);

    let metrics_logger = MetricsLogger::new().await;

    // Channels connecting this loop with the rollback task: desired versions
    // flow to it, rollback state updates flow back.
    let (desired_version_sender, desired_version_receiver) = mpsc::unbounded();
    let (rollback_state_sender, rollback_state_receiver) = mpsc::unbounded();

    // NB: Check if we failed to migrate to NS3 in the previous boot.
    //
    // It's important to perform this check before the rollback state is reset
    // below.
    let current_boot_is_rollback = match migration.persisted.rollback {
        Some(rollback::Persisted::Success) | None => false,
        Some(rollback::Persisted::HealthcheckFailures(f)) => f >= rollback::MAX_FAILED_HEALTHCHECKS,
    };

    let rollback_state =
        rollback::State::new(migration.persisted.rollback, migration.current_boot.version());

    // Update rollback persistence immediately in case the device reboots before
    // the rollback module has time to send an asynchronous update. This is
    // required for correctness if Netstack3 is crashing on startup, or in the
    // following case:
    //
    // 1. Device fails to migrate to Netstack3 and persists
    //    HealthcheckFailures(MAX_FAILED_HEALTHCHECKS), which will force
    //    Netstack2 on subsequent boots.
    // 2. Device reboots into Netstack2, sees that it should be running
    //    Netstack3, and schedules a reboot without clearing the failures.
    // 3. Device reboots back into Netstack3 and sees that it should schedule
    //    a reboot because the persisted failures are above the limit.
    migration.update_rollback_state(rollback_state.persisted()).await;

    // The rollback task runs detached for the rest of the component's life.
    Task::spawn(async move {
        rollback::run(
            rollback_state,
            http_fetcher,
            desired_version_receiver,
            rollback_state_sender,
            pin!(healthcheck_tick),
        )
        .await
    })
    .detach();

    // Everything the main loop reacts to, so that all event sources can be
    // merged into one stream.
    enum Action {
        ServiceRequest(Result<ServiceRequest, fidl::Error>),
        LogMetrics,
        UpdateRollbackState(rollback::Persisted),
        FileRollbackCrashReport,
    }

    let metrics_logging_interval = fuchsia_async::MonotonicDuration::from_hours(1);
    let mut stream: futures::stream::SelectAll<Pin<Box<dyn Stream<Item = Action>>>> =
        futures::stream::SelectAll::new();

    // Always log metrics once on startup then periodically log new values so
    // the aggregation window always contains one sample of the current
    // settings.
    stream.push(Box::pin(Box::new(
        futures::stream::once(futures::future::ready(()))
            .chain(fuchsia_async::Interval::new(metrics_logging_interval))
            .map(|()| Action::LogMetrics),
    )));
    stream.push(Box::pin(Box::new(Box::pin(service_request_stream.map(Action::ServiceRequest)))));
    stream.push(Box::pin(rollback_state_receiver.map(|state| Action::UpdateRollbackState(state))));

    // If the current boot is a rollback, schedule a crash report to be filed
    // soon. Don't file it right away, as during early boot there is not much
    // useful diagnostics data available.
    if current_boot_is_rollback {
        stream.push(Box::pin(
            futures::stream::once(fuchsia_async::Timer::new(R::ROLLBACK_CRASH_REPORT_DELAY))
                .map(|()| Action::FileRollbackCrashReport),
        ));
    }

    while let Some(action) = stream.next().await {
        match action {
            Action::ServiceRequest(req) => {
                let result = match req {
                    Ok(req) => migration.handle_request(req).await,
                    Err(e) => Err(e),
                };
                // Always update inspector state after handling a request.
                inspect_nodes.update(&migration);

                // Send the desired netstack version to the rollback mechanism,
                // but ignore the "forced" setting. The "forced" setting comes
                // from the rollback mechanism, and sending that signal back
                // into it would cause the mechanism to incorrectly detect a
                // cancelation.
                match desired_version_sender.unbounded_send(
                    migration.persisted.desired_netstack_version().version_ignoring_force(),
                ) {
                    Ok(()) => (),
                    Err(e) => {
                        error!("error sending update to rollback module: {:?}", e);
                    }
                }

                // Errors that merely report a closed channel are not logged.
                match result {
                    Ok(()) => (),
                    Err(e) => {
                        if !e.is_closed() {
                            error!("error processing FIDL request {:?}", e)
                        }
                    }
                }
            }
            Action::LogMetrics => {
                metrics_logger.log_metrics(&migration).await;
            }
            Action::UpdateRollbackState(new_state) => {
                migration.update_rollback_state(new_state).await;
                // Always update inspector state when the rollback state
                // changes.
                inspect_nodes.update(&migration);
            }
            Action::FileRollbackCrashReport => {
                migration.crash_reporter.file_report(CrashReportReason::RolledBack);
            }
        }
    }
}
979
980#[cfg(test)]
981mod tests {
982    use super::*;
983    use assert_matches::assert_matches;
984    use async_utils::event::{Event, EventWait};
985    use diagnostics_assertions::assert_data_tree;
986    use fidl::Peered as _;
987    use fidl_fuchsia_net_http as fnet_http;
988    use fuchsia_async::TimeoutExt;
989    use futures::FutureExt;
990    use std::cell::RefCell;
991    use std::rc::Rc;
992    use std::time::Duration;
993    use test_case::test_case;
994
995    #[derive(Default, Clone)]
996    struct InMemory {
997        file: Rc<RefCell<Option<Vec<u8>>>>,
998    }
999
1000    impl InMemory {
1001        fn with_persisted(p: Persisted) -> Self {
1002            let mut s = Self::default();
1003            p.save(s.open_writer().unwrap());
1004            s
1005        }
1006    }
1007
1008    impl PersistenceProvider for InMemory {
1009        type Writer = Self;
1010        type Reader = std::io::Cursor<Vec<u8>>;
1011
1012        fn open_writer(&mut self) -> std::io::Result<Self::Writer> {
1013            *self.file.borrow_mut() = Some(Vec::new());
1014            Ok(self.clone())
1015        }
1016
1017        fn open_reader(&self) -> std::io::Result<Self::Reader> {
1018            self.file
1019                .borrow()
1020                .clone()
1021                .map(std::io::Cursor::new)
1022                .ok_or_else(|| std::io::ErrorKind::NotFound.into())
1023        }
1024    }
1025
1026    impl std::io::Write for InMemory {
1027        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
1028            let r = self.file.borrow_mut().as_mut().expect("no file open").write(buf);
1029            r
1030        }
1031
1032        fn flush(&mut self) -> std::io::Result<()> {
1033            Ok(())
1034        }
1035    }
1036
    /// Scheduler for tests in which no collaborative reboot may be requested.
    struct NoCollaborativeReboot;

    impl CollaborativeRebootScheduler for NoCollaborativeReboot {
        /// Always panics: any call means the test scheduled a reboot it
        /// should not have.
        async fn schedule(
            &mut self,
            _reason: fpower::CollaborativeRebootReason,
            _cancel: Option<zx::EventPair>,
        ) {
            panic!("unexpectedly attempted to schedule a collaborative reboot");
        }
    }
1048
1049    #[derive(Default)]
1050    struct FakeCollaborativeReboot {
1051        req: Option<zx::EventPair>,
1052    }
1053
1054    impl CollaborativeRebootScheduler for FakeCollaborativeReboot {
1055        async fn schedule(
1056            &mut self,
1057            reason: fpower::CollaborativeRebootReason,
1058            cancel: Option<zx::EventPair>,
1059        ) {
1060            assert_eq!(reason, fpower::CollaborativeRebootReason::NetstackMigration);
1061            let cancel = cancel.expect("cancellation signal must be provided");
1062            assert_eq!(self.req.replace(cancel), None, "attempted to schedule multiple reboots");
1063        }
1064    }
1065
    /// Crash reporter for tests in which no crash report may be filed.
    struct NoCrashReports;

    impl CrashReporter for NoCrashReports {
        const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::ZERO;
        /// Always panics: any call means the test filed a report it should
        /// not have.
        fn file_report(&mut self, reason: CrashReportReason) {
            panic!("unexpectedly attemped to file a crash report: {reason:?}")
        }
    }
1074
    /// Crash reporter fake that records every filed report for later
    /// inspection by the test.
    #[derive(Default)]
    struct FakeCrashReporter {
        // Reasons of the filed reports, in filing order.
        reports: Vec<CrashReportReason>,
    }

    impl CrashReporter for FakeCrashReporter {
        const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::from_millis(1);
        /// Appends `reason` to the recorded reports.
        fn file_report(&mut self, reason: CrashReportReason) {
            self.reports.push(reason)
        }
    }
1086
1087    /// Signals an `EventWait` once the `target` crash reports have been filed.
1088    struct AwaitCrashReports {
1089        target: Vec<CrashReportReason>,
1090        reports: Vec<CrashReportReason>,
1091        event: Event,
1092    }
1093
1094    impl AwaitCrashReports {
1095        fn new(target: Vec<CrashReportReason>) -> (Self, EventWait) {
1096            let event = Event::new();
1097            let wait = event.wait();
1098            (Self { target, reports: Vec::new(), event }, wait)
1099        }
1100    }
1101
    impl CrashReporter for AwaitCrashReports {
        const ROLLBACK_CRASH_REPORT_DELAY: Duration = Duration::from_millis(1);
        /// Records `reason` and signals the event once the recorded reports
        /// equal `target`.
        fn file_report(&mut self, reason: CrashReportReason) {
            self.reports.push(reason);
            if self.reports == self.target {
                // The assertion fails the test if `signal` returns false.
                assert!(self.event.signal());
            }
        }
    }
1111
1112    fn serve_migration<
1113        P: PersistenceProvider,
1114        CR: CollaborativeRebootScheduler,
1115        R: CrashReporter,
1116    >(
1117        migration: Migration<P, CR, R>,
1118    ) -> (
1119        impl futures::Future<Output = Migration<P, CR, R>>,
1120        fnet_migration::ControlProxy,
1121        fnet_migration::StateProxy,
1122    ) {
1123        let (control, control_server) =
1124            fidl::endpoints::create_proxy_and_stream::<fnet_migration::ControlMarker>();
1125        let (state, state_server) =
1126            fidl::endpoints::create_proxy_and_stream::<fnet_migration::StateMarker>();
1127
1128        let fut = {
1129            let control =
1130                control_server.map(|req| ServiceRequest::Control(req.expect("control error")));
1131            let state = state_server.map(|req| ServiceRequest::State(req.expect("state error")));
1132            futures::stream::select(control, state).fold(migration, |mut migration, req| async {
1133                migration.handle_request(req).await.expect("handling request");
1134                migration
1135            })
1136        };
1137        (fut, control, state)
1138    }
1139
1140    #[test_case(Persisted{
1141        user: Some(NetstackVersion::Netstack2),
1142        automated: None,
1143        rollback: None,
1144        rollback_history: None,
1145    }; "user_netstack2")]
1146    #[test_case(Persisted{
1147        user: Some(NetstackVersion::Netstack3),
1148        automated: None,
1149        rollback: None,
1150        rollback_history: None,
1151    }; "user_netstack3")]
1152    #[test_case(Persisted{
1153        user: None,
1154        automated: None,
1155        rollback: None,
1156        rollback_history: None,
1157    }; "none")]
1158    #[test_case(Persisted{
1159        user: None,
1160        automated: Some(NetstackVersion::Netstack2),
1161        rollback: None,
1162        rollback_history: None,
1163    }; "automated_netstack2")]
1164    #[test_case(Persisted{
1165        user: None,
1166        automated: Some(NetstackVersion::Netstack3),
1167        rollback: None,
1168        rollback_history: None,
1169    }; "automated_netstack3")]
1170    #[test_case(Persisted{
1171        user: None,
1172        automated: None,
1173        rollback: Some(rollback::Persisted::Success),
1174        rollback_history: None,
1175    }; "rollback_success")]
1176    #[test_case(Persisted{
1177        user: None,
1178        automated: None,
1179        rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
1180        rollback_history: Some(RollbackHistory {epoch: CURRENT_EPOCH, count: 3}),
1181    }; "rollback_history")]
1182    #[test_case(Persisted{
1183        user: Some(NetstackVersion::Netstack2),
1184        automated: Some(NetstackVersion::Netstack3),
1185        rollback: Some(rollback::Persisted::HealthcheckFailures(5)),
1186        rollback_history: Some(RollbackHistory {epoch: CURRENT_EPOCH, count: 3}),
1187    }; "all")]
1188    #[fuchsia::test(add_test_attr = false)]
1189    fn persist_save_load(v: Persisted) {
1190        let mut m = InMemory::default();
1191        v.save(m.open_writer().unwrap());
1192        assert_eq!(Persisted::load(m.open_reader().unwrap()), v);
1193    }
1194
1195    #[fuchsia::test]
1196    fn uses_defaults_if_no_persistence() {
1197        let m = Migration::new(InMemory::default(), NoCollaborativeReboot, NoCrashReports);
1198        let Migration {
1199            current_boot,
1200            persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1201            persistence: _,
1202            collaborative_reboot: _,
1203            crash_reporter: _,
1204        } = m;
1205        assert_eq!(current_boot.version(), DEFAULT_NETSTACK);
1206        assert_eq!(user, None);
1207        assert_eq!(automated, None);
1208    }
1209
1210    #[test_case(
1211        None, Some(NetstackVersion::Netstack3), None, None, NetstackVersion::Netstack3;
1212        "automated_ns3")]
1213    #[test_case(
1214        None, Some(NetstackVersion::Netstack2), None, None, NetstackVersion::Netstack2;
1215        "automated_ns2")]
1216    #[test_case(
1217        Some(NetstackVersion::Netstack3),
1218        Some(NetstackVersion::Netstack2),
1219        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS)),
1220        Some(RollbackHistory{epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH}),
1221        NetstackVersion::Netstack3;
1222        "user_ns3_override")]
1223    #[test_case(
1224        Some(NetstackVersion::Netstack2),
1225        Some(NetstackVersion::Netstack3),
1226        Some(rollback::Persisted::Success),
1227        None,
1228        NetstackVersion::Netstack2;
1229        "user_ns2_override")]
1230    #[test_case(
1231        Some(NetstackVersion::Netstack2),
1232        None,
1233        None,
1234        None,
1235        NetstackVersion::Netstack2; "user_ns2")]
1236    #[test_case(
1237        Some(NetstackVersion::Netstack3),
1238        None,
1239        None,
1240        None,
1241        NetstackVersion::Netstack3; "user_ns3")]
1242    #[test_case(
1243        None,
1244        Some(NetstackVersion::Netstack3),
1245        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS)),
1246        None,
1247        NetstackVersion::Netstack2; "rollback_to_ns2")]
1248    #[test_case(
1249        None,
1250        Some(NetstackVersion::Netstack3),
1251        Some(rollback::Persisted::HealthcheckFailures(0)),
1252        Some(RollbackHistory{epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH}),
1253        NetstackVersion::Netstack2; "indefinitely_rolled_back_to_ns2")]
1254    #[test_case(
1255        None,
1256        Some(NetstackVersion::Netstack3),
1257        Some(rollback::Persisted::HealthcheckFailures(0)),
1258        Some(RollbackHistory{epoch: CURRENT_EPOCH - 1, count: MAX_ROLLBACKS_PER_EPOCH}),
1259        NetstackVersion::Netstack3; "ignore_rollback_history_from_old_epoch")]
1260    #[test_case(None, None, None, None, DEFAULT_NETSTACK; "default")]
1261    #[fuchsia::test]
1262    async fn get_netstack_version(
1263        p_user: Option<NetstackVersion>,
1264        p_automated: Option<NetstackVersion>,
1265        p_rollback: Option<rollback::Persisted>,
1266        p_history: Option<RollbackHistory>,
1267        expect: NetstackVersion,
1268    ) {
1269        let m = Migration::new(
1270            InMemory::with_persisted(Persisted {
1271                user: p_user,
1272                automated: p_automated,
1273                rollback: p_rollback,
1274                rollback_history: p_history,
1275            }),
1276            NoCollaborativeReboot,
1277            NoCrashReports,
1278        );
1279        let Migration {
1280            current_boot,
1281            persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1282            persistence: _,
1283            collaborative_reboot: _,
1284            crash_reporter: _,
1285        } = &m;
1286        assert_eq!(current_boot.version(), expect);
1287        assert_eq!(*user, p_user);
1288        assert_eq!(*automated, p_automated);
1289
1290        let (serve, _, state) = serve_migration(m);
1291        let fut = async move {
1292            let fnet_migration::InEffectVersion { current_boot, user, automated } =
1293                state.get_netstack_version().await.expect("get netstack version");
1294            let expect = expect.into();
1295            let p_user = p_user.map(Into::into);
1296            let p_automated = p_automated.map(Into::into);
1297            assert_eq!(current_boot, expect);
1298            assert_eq!(user, p_user);
1299            assert_eq!(automated, p_automated);
1300        };
1301        let (_, ()): (Migration<_, _, _>, _) = futures::future::join(serve, fut).await;
1302    }
1303
    /// Selects which of the two settings a parameterized test changes.
    #[derive(Debug, Copy, Clone)]
    enum SetMechanism {
        /// Change the user setting (`set_user_netstack_version`).
        User,
        /// Change the automated setting (`set_automated_netstack_version`).
        Automated,
    }
1309
1310    #[test_case(SetMechanism::User, NetstackVersion::Netstack2; "set_user_ns2")]
1311    #[test_case(SetMechanism::User, NetstackVersion::Netstack3; "set_user_ns3")]
1312    #[test_case(SetMechanism::Automated, NetstackVersion::Netstack2; "set_automated_ns2")]
1313    #[test_case(SetMechanism::Automated, NetstackVersion::Netstack3; "set_automated_ns3")]
1314    #[fuchsia::test]
1315    async fn set_netstack_version(mechanism: SetMechanism, set_version: NetstackVersion) {
1316        let m = Migration::new(
1317            InMemory::with_persisted(Default::default()),
1318            FakeCollaborativeReboot::default(),
1319            NoCrashReports,
1320        );
1321        let (serve, control, _) = serve_migration(m);
1322        let fut = async move {
1323            let setting = fnet_migration::VersionSetting { version: set_version.into() };
1324            match mechanism {
1325                SetMechanism::User => control
1326                    .set_user_netstack_version(Some(&setting))
1327                    .await
1328                    .expect("set user netstack version"),
1329                SetMechanism::Automated => control
1330                    .set_automated_netstack_version(Some(&setting))
1331                    .await
1332                    .expect("set automated netstack version"),
1333            }
1334        };
1335        let (migration, ()) = futures::future::join(serve, fut).await;
1336
1337        let validate_versions = |m: &Migration<_, _, _>, current| {
1338            let Migration {
1339                current_boot,
1340                persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1341                persistence: _,
1342                collaborative_reboot: _,
1343                crash_reporter: _,
1344            } = m;
1345            assert_eq!(current_boot.version(), current);
1346            match mechanism {
1347                SetMechanism::User => {
1348                    assert_eq!(*user, Some(set_version));
1349                    assert_eq!(*automated, None);
1350                }
1351                SetMechanism::Automated => {
1352                    assert_eq!(*user, None);
1353                    assert_eq!(*automated, Some(set_version));
1354                }
1355            }
1356        };
1357
1358        validate_versions(&migration, DEFAULT_NETSTACK);
1359        // There should only be a collaborative reboot request if the new
1360        // version differs from the default version.
1361        let cr_req = &migration.collaborative_reboot.scheduler.req;
1362        if set_version != DEFAULT_NETSTACK {
1363            assert_eq!(Ok(false), cr_req.as_ref().expect("there should be a request").is_closed());
1364        } else {
1365            assert_eq!(cr_req, &None);
1366        }
1367
1368        // Check that the setting was properly persisted.
1369        let migration = Migration::new(
1370            migration.persistence,
1371            migration.collaborative_reboot.scheduler,
1372            migration.crash_reporter,
1373        );
1374        validate_versions(&migration, set_version);
1375    }
1376
    // Walks the rollback state through "some failures", "too many failures"
    // and "success", checking persistence, reboot scheduling/cancellation and
    // crash reporting after each step.
    #[fuchsia::test]
    async fn update_rollback_state() {
        let mut migration = Migration::new(
            InMemory::with_persisted(Persisted {
                automated: Some(NetstackVersion::Netstack3),
                user: None,
                rollback: None,
                rollback_history: None,
            }),
            FakeCollaborativeReboot::default(),
            FakeCrashReporter::default(),
        );

        // Precondition: running Netstack3 with no reboot pending.
        assert_eq!(migration.current_boot.version(), NetstackVersion::Netstack3);
        assert!(migration.collaborative_reboot.scheduler.req.is_none());

        // The first update shouldn't schedule a reboot because we haven't
        // passed the healthcheck threshold yet.
        migration.update_rollback_state(rollback::Persisted::HealthcheckFailures(1)).await;
        assert_matches!(
            migration.persisted.rollback,
            Some(rollback::Persisted::HealthcheckFailures(1))
        );
        assert!(migration.collaborative_reboot.scheduler.req.is_none());

        // This second update should schedule a reboot because we've passed
        // the healthcheck limit and want to roll back to Netstack2.
        migration
            .update_rollback_state(rollback::Persisted::HealthcheckFailures(
                rollback::MAX_FAILED_HEALTHCHECKS,
            ))
            .await;
        assert_matches!(
            migration.persisted.rollback,
            Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS))
        );
        // The cancellation handle is still open, i.e. the reboot is pending.
        assert_eq!(
            migration
                .collaborative_reboot
                .scheduler
                .req
                .as_ref()
                .expect("reboot was not scheduled")
                .is_closed()
                .unwrap(),
            false
        );

        // It should have also filed a crash report.
        assert_eq!(migration.crash_reporter.reports, vec![CrashReportReason::FailedHealthcheck]);

        // This emulates seeing a healthcheck success before rebooting, in which
        // case we should see the reboot get canceled.
        migration.update_rollback_state(rollback::Persisted::Success).await;
        assert_matches!(migration.persisted.rollback, Some(rollback::Persisted::Success));
        assert!(
            migration.collaborative_reboot.scheduler.req.as_ref().unwrap().is_closed().unwrap()
        );

        // Ensure that the changes were persisted successfully.
        let migration = Migration::new(
            migration.persistence,
            migration.collaborative_reboot.scheduler,
            migration.crash_reporter,
        );
        assert_matches!(migration.persisted.rollback, Some(rollback::Persisted::Success));
    }
1444
1445    #[test_case(SetMechanism::User, Some(NetstackVersion::Netstack2), false)]
1446    #[test_case(SetMechanism::User, Some(NetstackVersion::Netstack3), true)]
1447    #[test_case(SetMechanism::User, None, false)]
1448    #[test_case(SetMechanism::Automated, Some(NetstackVersion::Netstack2), false)]
1449    #[test_case(SetMechanism::Automated, Some(NetstackVersion::Netstack3), true)]
1450    #[test_case(SetMechanism::Automated, None, true)]
1451    #[fuchsia::test]
1452    async fn cancel_collaborative_reboot(
1453        mechanism: SetMechanism,
1454        version: Option<NetstackVersion>,
1455        expect_canceled: bool,
1456    ) {
1457        let migration = Migration::new(
1458            InMemory::with_persisted(Persisted {
1459                user: None,
1460                automated: None,
1461                rollback: None,
1462                rollback_history: None,
1463            }),
1464            FakeCollaborativeReboot::default(),
1465            NoCrashReports,
1466        );
1467
1468        // Start of by updating the automated setting to Netstack2; this ensures
1469        // there is a pending request to cancel.
1470        let (serve, control, _) = serve_migration(migration);
1471        let fut = async move {
1472            control
1473                .set_automated_netstack_version(Some(&fnet_migration::VersionSetting {
1474                    version: fnet_migration::NetstackVersion::Netstack2,
1475                }))
1476                .await
1477                .expect("set automated netstack version");
1478        };
1479        let (migration, ()) = futures::future::join(serve, fut).await;
1480        let cancel = migration
1481            .collaborative_reboot
1482            .scheduler
1483            .req
1484            .as_ref()
1485            .expect("there should be a request");
1486        assert_eq!(Ok(false), cancel.is_closed());
1487
1488        // Update the setting based on the test parameters
1489        let (serve, control, _) = serve_migration(migration);
1490        let fut = async move {
1491            let setting = version.map(|v| fnet_migration::VersionSetting { version: v.into() });
1492            match mechanism {
1493                SetMechanism::User => control
1494                    .set_user_netstack_version(setting.as_ref())
1495                    .await
1496                    .expect("set user netstack version"),
1497                SetMechanism::Automated => control
1498                    .set_automated_netstack_version(setting.as_ref())
1499                    .await
1500                    .expect("set automated netstack version"),
1501            }
1502        };
1503        let (migration, ()) = futures::future::join(serve, fut).await;
1504
1505        let cancel = migration
1506            .collaborative_reboot
1507            .scheduler
1508            .req
1509            .as_ref()
1510            .expect("there should be a request");
1511        assert_eq!(Ok(expect_canceled), cancel.is_closed());
1512    }
1513
1514    #[test_case(SetMechanism::User)]
1515    #[test_case(SetMechanism::Automated)]
1516    #[fuchsia::test]
1517    async fn clear_netstack_version(mechanism: SetMechanism) {
1518        const PREVIOUS_VERSION: NetstackVersion = NetstackVersion::Netstack2;
1519        let m = Migration::new(
1520            InMemory::with_persisted(Persisted {
1521                user: Some(PREVIOUS_VERSION),
1522                automated: Some(PREVIOUS_VERSION),
1523                rollback: None,
1524                rollback_history: None,
1525            }),
1526            NoCollaborativeReboot,
1527            NoCrashReports,
1528        );
1529        let (serve, control, _) = serve_migration(m);
1530        let fut = async move {
1531            match mechanism {
1532                SetMechanism::User => control
1533                    .set_user_netstack_version(None)
1534                    .await
1535                    .expect("set user netstack version"),
1536                SetMechanism::Automated => control
1537                    .set_automated_netstack_version(None)
1538                    .await
1539                    .expect("set automated netstack version"),
1540            }
1541        };
1542        let (migration, ()) = futures::future::join(serve, fut).await;
1543
1544        let validate_versions = |m: &Migration<_, _, _>| {
1545            let Migration {
1546                current_boot,
1547                persisted: Persisted { user, automated, rollback: _, rollback_history: _ },
1548                persistence: _,
1549                collaborative_reboot: _,
1550                crash_reporter: _,
1551            } = m;
1552            assert_eq!(current_boot.version(), PREVIOUS_VERSION);
1553            match mechanism {
1554                SetMechanism::User => {
1555                    assert_eq!(*user, None);
1556                    assert_eq!(*automated, Some(PREVIOUS_VERSION));
1557                }
1558                SetMechanism::Automated => {
1559                    assert_eq!(*user, Some(PREVIOUS_VERSION));
1560                    assert_eq!(*automated, None);
1561                }
1562            }
1563        };
1564
1565        validate_versions(&migration);
1566        // Check that the setting was properly persisted.
1567        let migration = Migration::new(
1568            migration.persistence,
1569            migration.collaborative_reboot.scheduler,
1570            migration.crash_reporter,
1571        );
1572        validate_versions(&migration);
1573    }
1574
1575    #[fuchsia::test]
1576    async fn inspect() {
1577        let mut m = Migration::new(
1578            InMemory::with_persisted(Persisted {
1579                user: Some(NetstackVersion::Netstack2),
1580                automated: Some(NetstackVersion::Netstack3),
1581                rollback: None,
1582                rollback_history: None,
1583            }),
1584            NoCollaborativeReboot,
1585            NoCrashReports,
1586        );
1587        let inspector = fuchsia_inspect::component::inspector();
1588        let nodes = InspectNodes::new(inspector, &m);
1589        assert_data_tree!(inspector,
1590            root: {
1591                current_boot: 2u64,
1592                user_setting: 2u64,
1593                automated_setting: 3u64,
1594                rollback_state: "None",
1595                forced_netstack2: false,
1596                current_epoch: CURRENT_EPOCH,
1597                rollback_history: format!(
1598                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 0 }})"
1599                ),
1600            }
1601        );
1602
1603        m.persisted = Persisted {
1604            user: None,
1605            automated: Some(NetstackVersion::Netstack2),
1606            rollback: None,
1607            rollback_history: None,
1608        };
1609        nodes.update(&m);
1610        assert_data_tree!(inspector,
1611            root: {
1612                current_boot: 2u64,
1613                user_setting: 0u64,
1614                automated_setting: 2u64,
1615                rollback_state: "None",
1616                forced_netstack2: false,
1617                current_epoch: CURRENT_EPOCH,
1618                rollback_history: format!(
1619                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 0 }})"
1620                ),
1621            }
1622        );
1623    }
1624
1625    #[fuchsia::test]
1626    async fn inspect_rollback() {
1627        let mut m = Migration::new(
1628            InMemory::with_persisted(Persisted {
1629                user: None,
1630                automated: Some(NetstackVersion::Netstack3),
1631                rollback: Some(rollback::Persisted::HealthcheckFailures(
1632                    rollback::MAX_FAILED_HEALTHCHECKS,
1633                )),
1634                rollback_history: None,
1635            }),
1636            NoCollaborativeReboot,
1637            NoCrashReports,
1638        );
1639        let inspector = fuchsia_inspect::component::inspector();
1640        let nodes = InspectNodes::new(inspector, &m);
1641        assert_data_tree!(inspector,
1642            root: {
1643                current_boot: 2u64,
1644                user_setting: 0u64,
1645                automated_setting: 3u64,
1646                rollback_state: "Some(HealthcheckFailures(5))",
1647                forced_netstack2: true,
1648                current_epoch: CURRENT_EPOCH,
1649                rollback_history: format!(
1650                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 1 }})"
1651                ),
1652            }
1653        );
1654
1655        m.persisted.rollback =
1656            Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS + 1));
1657        nodes.update(&m);
1658        assert_data_tree!(inspector,
1659            root: {
1660                current_boot: 2u64,
1661                user_setting: 0u64,
1662                automated_setting: 3u64,
1663                rollback_state: "Some(HealthcheckFailures(6))",
1664                forced_netstack2: true,
1665                current_epoch: CURRENT_EPOCH,
1666                rollback_history: format!(
1667                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 1 }})"
1668                ),
1669            }
1670        );
1671        m.persisted.rollback = Some(rollback::Persisted::Success);
1672        nodes.update(&m);
1673        assert_data_tree!(inspector,
1674            root: {
1675                current_boot: 2u64,
1676                user_setting: 0u64,
1677                automated_setting: 3u64,
1678                rollback_state: "Some(Success)",
1679                forced_netstack2: true,
1680                current_epoch: CURRENT_EPOCH,
1681                rollback_history: format!(
1682                    "Some(RollbackHistory {{ epoch: {CURRENT_EPOCH}, count: 1 }})"
1683                ),
1684            }
1685        );
1686    }
1687
1688    #[test_case(CURRENT_EPOCH, 2, true; "ns2_with_failures_in_current_epoch")]
1689    #[test_case(CURRENT_EPOCH - 1, 3, false; "ns3_with_failures_in_old_epoch")]
1690    #[fuchsia::test]
1691    async fn inspect_rollback_history(epoch: u32, current_boot: u64, forced_netstack2: bool) {
1692        let m = Migration::new(
1693            InMemory::with_persisted(Persisted {
1694                user: None,
1695                automated: Some(NetstackVersion::Netstack3),
1696                rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
1697                rollback_history: Some(RollbackHistory {
1698                    epoch: epoch,
1699                    count: MAX_ROLLBACKS_PER_EPOCH,
1700                }),
1701            }),
1702            NoCollaborativeReboot,
1703            NoCrashReports,
1704        );
1705        let inspector = fuchsia_inspect::component::inspector();
1706        let _nodes = InspectNodes::new(inspector, &m);
1707        assert_data_tree!(inspector,
1708            root: {
1709                current_boot: current_boot,
1710                user_setting: 0u64,
1711                automated_setting: 3u64,
1712                rollback_state: "Some(HealthcheckFailures(0))",
1713                forced_netstack2: forced_netstack2,
1714                current_epoch: CURRENT_EPOCH,
1715                rollback_history: format!(
1716                    "Some(RollbackHistory {{ epoch: {}, count: {} }})",
1717                    CURRENT_EPOCH,
1718                    // Check if the history should have been reset.
1719                    if epoch == CURRENT_EPOCH { MAX_ROLLBACKS_PER_EPOCH } else {0},
1720                ),
1721            }
1722        );
1723    }
1724
1725    #[test_case::test_matrix(
1726    [
1727        (RollbackNetstackVersion::Netstack2, metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack2),
1728        (RollbackNetstackVersion::Netstack3, metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion::Netstack3),
1729    ],
1730    [
1731        (None, metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::NoSelection),
1732        (Some(NetstackVersion::Netstack2), metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack2),
1733        (Some(NetstackVersion::Netstack3), metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion::Netstack3),
1734    ],
1735    [
1736        (None, metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::NoSelection),
1737        (Some(NetstackVersion::Netstack2), metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack2),
1738        (Some(NetstackVersion::Netstack3), metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion::Netstack3),
1739    ]
1740    )]
1741    #[fuchsia::test]
1742    async fn metrics_logger(
1743        current_boot: (
1744            RollbackNetstackVersion,
1745            metrics_registry::StackMigrationCurrentBootMetricDimensionNetstackVersion,
1746        ),
1747        user: (
1748            Option<NetstackVersion>,
1749            metrics_registry::StackMigrationUserSettingMetricDimensionNetstackVersion,
1750        ),
1751        automated: (
1752            Option<NetstackVersion>,
1753            metrics_registry::StackMigrationAutomatedSettingMetricDimensionNetstackVersion,
1754        ),
1755    ) {
1756        let (current_boot, current_boot_expect) = current_boot;
1757        let (user, user_expect) = user;
1758        let (automated, automated_expect) = automated;
1759        let mut m = Migration::new(
1760            InMemory::with_persisted(Persisted {
1761                user,
1762                automated,
1763                rollback: None,
1764                rollback_history: None,
1765            }),
1766            NoCollaborativeReboot,
1767            NoCrashReports,
1768        );
1769        m.current_boot = current_boot;
1770        let (logger, mut logger_stream) =
1771            fidl::endpoints::create_proxy_and_stream::<fmetrics::MetricEventLoggerMarker>();
1772
1773        let metrics_logger = MetricsLogger { logger: Some(logger) };
1774
1775        let ((), ()) = futures::future::join(metrics_logger.log_metrics(&m), async {
1776            let expect = [
1777                (
1778                    metrics_registry::STACK_MIGRATION_CURRENT_BOOT_METRIC_ID,
1779                    Some(current_boot_expect.as_event_code()),
1780                ),
1781                (
1782                    metrics_registry::STACK_MIGRATION_USER_SETTING_METRIC_ID,
1783                    Some(user_expect.as_event_code()),
1784                ),
1785                (
1786                    metrics_registry::STACK_MIGRATION_AUTOMATED_SETTING_METRIC_ID,
1787                    Some(automated_expect.as_event_code()),
1788                ),
1789                (
1790                    metrics_registry::STACK_MIGRATION_STATE_METRIC_ID,
1791                    // Note: The rollback state doesn't have a flat expectation.
1792                    // Don't assert on its value here, and instead we directly
1793                    // test it in a separate test case.
1794                    None,
1795                ),
1796            ];
1797            for (id, ev) in expect {
1798                let (metric, occurences, codes, responder) = logger_stream
1799                    .next()
1800                    .await
1801                    .unwrap()
1802                    .unwrap()
1803                    .into_log_occurrence()
1804                    .expect("bad request");
1805                assert_eq!(metric, id);
1806                assert_eq!(occurences, 1);
1807                if let Some(ev) = ev {
1808                    assert_eq!(codes, vec![ev]);
1809                }
1810                responder.send(Ok(())).unwrap();
1811            }
1812        })
1813        .await;
1814    }
1815
1816    #[test_case(
1817        RollbackNetstackVersion::Netstack2, None, None =>
1818        metrics_registry::StackMigrationStateMetricDimensionMigrationState::NotStarted;
1819        "not_started_none"
1820    )]
1821    #[test_case(
1822        RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack2), None =>
1823        metrics_registry::StackMigrationStateMetricDimensionMigrationState::NotStarted;
1824        "not_started_ns2"
1825    )]
1826    #[test_case(
1827        RollbackNetstackVersion::Netstack2, Some(NetstackVersion::Netstack3), None =>
1828        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Scheduled;
1829        "scheduled"
1830    )]
1831    #[test_case(
1832        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3), None =>
1833        metrics_registry::StackMigrationStateMetricDimensionMigrationState::InProgress;
1834        "in_progress_none"
1835    )]
1836    #[test_case(
1837        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1838        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS - 1)) =>
1839        metrics_registry::StackMigrationStateMetricDimensionMigrationState::InProgress;
1840        "in_progress_some"
1841    )]
1842    #[test_case(
1843        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1844        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS)) =>
1845        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Failed;
1846        "failed_exact"
1847    )]
1848    #[test_case(
1849        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1850        Some(rollback::Persisted::HealthcheckFailures(rollback::MAX_FAILED_HEALTHCHECKS + 1)) =>
1851        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Failed;
1852        "failed_more"
1853    )]
1854    #[test_case(
1855        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack3),
1856        Some(rollback::Persisted::Success) =>
1857        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Success;
1858        "success"
1859    )]
1860    #[test_case(
1861        RollbackNetstackVersion::Netstack3, None, None =>
1862        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Canceled;
1863        "canceled_none"
1864    )]
1865    #[test_case(
1866        RollbackNetstackVersion::Netstack3, Some(NetstackVersion::Netstack2), None =>
1867        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Canceled;
1868        "canceled_ns2"
1869    )]
1870    #[test_case(
1871        RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack3), None =>
1872        metrics_registry::StackMigrationStateMetricDimensionMigrationState::RolledBack;
1873        "rolled_back"
1874    )]
1875    #[test_case(
1876        RollbackNetstackVersion::ForceNetstack2, Some(NetstackVersion::Netstack2), None =>
1877        metrics_registry::StackMigrationStateMetricDimensionMigrationState::NotStarted;
1878        "rolled_back_becomes_not_started"
1879    )]
1880    #[test_case(
1881        RollbackNetstackVersion::ForceNetstack2Indefinitely, None, None =>
1882        metrics_registry::StackMigrationStateMetricDimensionMigrationState::Abandoned;
1883        "abandoned"
1884    )]
1885    #[fuchsia::test]
1886    fn test_state_metric(
1887        current_boot: RollbackNetstackVersion,
1888        automated: Option<NetstackVersion>,
1889        rollback: Option<rollback::Persisted>,
1890    ) -> metrics_registry::StackMigrationStateMetricDimensionMigrationState {
1891        let mut migration = Migration::new(
1892            InMemory::with_persisted(Persisted {
1893                user: None,
1894                automated,
1895                rollback,
1896                rollback_history: None,
1897            }),
1898            NoCollaborativeReboot,
1899            NoCrashReports,
1900        );
1901        migration.current_boot = current_boot;
1902        compute_state_metric(&migration)
1903    }
1904
1905    /// An in-memory mock-persistence that triggers an event once the target
1906    /// state has been persisted.
1907    #[derive(Clone)]
1908    struct AwaitPersisted {
1909        file: Rc<RefCell<Option<Vec<u8>>>>,
1910        target: Vec<u8>,
1911        event: Event,
1912    }
1913
1914    impl AwaitPersisted {
1915        fn with_persisted(start: Persisted, target: &Persisted) -> (Self, EventWait) {
1916            let event = Event::new();
1917            let wait = event.wait();
1918            let target_bytes = serde_json::to_vec(target).expect("failed to serialize target");
1919            let mut s = Self { file: Default::default(), target: target_bytes, event };
1920            start.save(s.open_writer().unwrap());
1921            (s, wait)
1922        }
1923    }
1924
1925    impl PersistenceProvider for AwaitPersisted {
1926        type Writer = Self;
1927        type Reader = std::io::Cursor<Vec<u8>>;
1928
1929        fn open_writer(&mut self) -> std::io::Result<Self::Writer> {
1930            *self.file.borrow_mut() = Some(Vec::new());
1931            Ok(self.clone())
1932        }
1933
1934        fn open_reader(&self) -> std::io::Result<Self::Reader> {
1935            self.file
1936                .borrow()
1937                .clone()
1938                .map(std::io::Cursor::new)
1939                .ok_or_else(|| std::io::ErrorKind::NotFound.into())
1940        }
1941    }
1942
1943    impl std::io::Write for AwaitPersisted {
1944        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
1945            let r = self.file.borrow_mut().as_mut().expect("no file open").write(buf);
1946            if self.file.borrow().as_ref().expect("no_file_open") == &self.target {
1947                let _: bool = self.event.signal();
1948            }
1949            r
1950        }
1951
1952        fn flush(&mut self) -> std::io::Result<()> {
1953            Ok(())
1954        }
1955    }
1956
1957    #[fuchsia::test]
1958    async fn migrate_to_ns3_success() {
1959        let start = Persisted {
1960            user: None,
1961            automated: Some(NetstackVersion::Netstack3),
1962            rollback: None,
1963            rollback_history: None,
1964        };
1965        let target = Persisted {
1966            user: None,
1967            automated: Some(NetstackVersion::Netstack3),
1968            rollback: Some(rollback::Persisted::Success),
1969            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }),
1970        };
1971
1972        let (persistence, mut wait) = AwaitPersisted::with_persisted(start, &target);
1973        let mut migration = Migration::new(persistence, NoCollaborativeReboot, NoCrashReports);
1974        // No service requests.
1975        let service_request_stream = futures::stream::pending();
1976        // A health check that always succeeds.
1977        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
1978            Ok(fnet_http::Response { error: None, status_code: Some(204), ..Default::default() })
1979        });
1980        let healthcheck_tick = futures::stream::once(futures::future::ready(()));
1981
1982        {
1983            let main_fut = main_inner(
1984                &mut migration,
1985                service_request_stream,
1986                mock_healthcheck,
1987                healthcheck_tick,
1988            )
1989            .fuse();
1990            futures::pin_mut!(main_fut);
1991            futures::select!(
1992                () = main_fut => unreachable!("main fut should never exit"),
1993                () = wait => {}
1994            );
1995        }
1996
1997        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::Success));
1998    }
1999
2000    #[test_case[0; "first_failure"]]
2001    #[test_case[MAX_ROLLBACKS_PER_EPOCH-1; "last_failure"]]
2002    #[fuchsia::test]
2003    async fn migrate_to_ns3_fails(rollbacks_in_history: u8) {
2004        let start = Persisted {
2005            user: None,
2006            automated: Some(NetstackVersion::Netstack3),
2007            rollback: None,
2008            rollback_history: Some(RollbackHistory {
2009                epoch: CURRENT_EPOCH,
2010                count: rollbacks_in_history,
2011            }),
2012        };
2013        let target = Persisted {
2014            user: None,
2015            automated: Some(NetstackVersion::Netstack3),
2016            rollback: Some(rollback::Persisted::HealthcheckFailures(
2017                rollback::MAX_FAILED_HEALTHCHECKS,
2018            )),
2019            rollback_history: Some(RollbackHistory {
2020                epoch: CURRENT_EPOCH,
2021                count: rollbacks_in_history,
2022            }),
2023        };
2024
2025        let (persistence, mut wait) = AwaitPersisted::with_persisted(start, &target);
2026        let mut migration = Migration::new(
2027            persistence,
2028            FakeCollaborativeReboot::default(),
2029            FakeCrashReporter::default(),
2030        );
2031        // No service requests.
2032        let service_request_stream = futures::stream::pending();
2033        // A health check that always fails.
2034        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2035            Ok(fnet_http::Response { error: None, status_code: Some(500), ..Default::default() })
2036        });
2037        // Use a non-zero interval so that the Healthcheck code doesn't hog the scheduler.
2038        let healthcheck_tick = fuchsia_async::Interval::new(Duration::from_millis(1).into());
2039
2040        {
2041            let main_fut = main_inner(
2042                &mut migration,
2043                service_request_stream,
2044                mock_healthcheck,
2045                healthcheck_tick,
2046            )
2047            .fuse();
2048            futures::pin_mut!(main_fut);
2049            futures::select!(
2050                () = main_fut => unreachable!("main fut should never exit"),
2051                () = wait => {}
2052            );
2053        }
2054
2055        let failure_count = assert_matches!(
2056            migration.persisted.rollback,
2057            Some(rollback::Persisted::HealthcheckFailures(f)) => f
2058        );
2059        assert!(
2060            failure_count >= rollback::MAX_FAILED_HEALTHCHECKS,
2061            "failure_count={failure_count}"
2062        );
2063
2064        // Verify a failed migration schedules a collaborative reboot.
2065        let cr_req = &migration.collaborative_reboot.scheduler.req;
2066        assert_eq!(Ok(false), cr_req.as_ref().expect("there should be a request").is_closed());
2067
2068        // Verify crash reports were filed for each failed healthcheck above
2069        // the threshold.
2070        let expected_num_crash_reports =
2071            failure_count - HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS + 1;
2072        assert_eq!(
2073            migration.crash_reporter.reports,
2074            vec![CrashReportReason::FailedHealthcheck; expected_num_crash_reports]
2075        )
2076    }
2077
2078    // Regression test for https://fxbug.dev/395913604.
2079    //
2080    // The original bug would reset the number of failed healthchecks in
2081    // persistence from `rollback::MAX_FAILED_HEALTHCHECKS` to 0, if an inbound
2082    // service request was received.
2083    //
2084    // Verify this is no longer the case by triggering a failed healthcheck,
2085    // pushing the total to `rollback::MAX_FAILED_HEALTHCHECKS`, then sending
2086    // a `fuchsia.net.migration/State.GetNetstackVersion` request.
2087    #[fuchsia::test]
2088    async fn migrate_to_ns3_rollback_regression_test() {
2089        let start = Persisted {
2090            user: None,
2091            automated: Some(NetstackVersion::Netstack3),
2092            rollback: Some(rollback::Persisted::HealthcheckFailures(
2093                rollback::MAX_FAILED_HEALTHCHECKS - 1,
2094            )),
2095            rollback_history: None,
2096        };
2097        let target = Persisted {
2098            user: None,
2099            automated: Some(NetstackVersion::Netstack3),
2100            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2101            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }),
2102        };
2103
2104        let (persistence, wait) = AwaitPersisted::with_persisted(start, &target);
2105        let mut migration = Migration::new(
2106            persistence,
2107            FakeCollaborativeReboot::default(),
2108            FakeCrashReporter::default(),
2109        );
2110        // A health check that always fails.
2111        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2112            Ok(fnet_http::Response { error: None, status_code: Some(500), ..Default::default() })
2113        });
2114        let healthcheck_tick = futures::stream::once(futures::future::ready(()));
2115        // Send "get" requests, to trigger the bug.
2116        let (client, server) =
2117            fidl::endpoints::create_proxy_and_stream::<fnet_migration::StateMarker>();
2118        let service_request_stream = server.map(|r| r.map(ServiceRequest::State));
2119        let client_fut = async move {
2120            // Send multiple get requests, to ensure that at least one would occur after the failed
2121            // healthcheck.
2122            let mut stream = fuchsia_async::Interval::new(Duration::from_millis(1).into());
2123            while let Some(()) = stream.next().await {
2124                let _ =
2125                    client.get_netstack_version().await.expect("failed to get netstack version");
2126            }
2127        }
2128        .fuse();
2129
2130        // If wait were to fire, the bug has occurred. Instead expect a timeout.
2131        // Use 1 second to keep the test runtime short; If CQ has a hiccup and
2132        // pauses execution, we'd see a false negative, which isn't a big deal.
2133        let wait_fut = wait
2134            .map(|()| panic!("unexpectedly observed the persisted healthcheck failures reset to 0"))
2135            .on_timeout(Duration::from_secs(1), || ())
2136            .fuse();
2137
2138        {
2139            let main_fut = main_inner(
2140                &mut migration,
2141                service_request_stream,
2142                mock_healthcheck,
2143                healthcheck_tick,
2144            )
2145            .fuse();
2146            futures::pin_mut!(main_fut);
2147            futures::pin_mut!(client_fut);
2148            futures::pin_mut!(wait_fut);
2149            futures::select!(
2150                () = main_fut => unreachable!("main fut should never exit"),
2151                () = client_fut => unreachable!("client fut should never exit"),
2152                () = wait_fut => {}
2153            );
2154        }
2155    }
2156
2157    #[fuchsia::test]
2158    async fn startup_after_first_rollback() {
2159        let start = Persisted {
2160            user: None,
2161            automated: Some(NetstackVersion::Netstack3),
2162            rollback: Some(rollback::Persisted::HealthcheckFailures(
2163                rollback::MAX_FAILED_HEALTHCHECKS,
2164            )),
2165            rollback_history: None,
2166        };
2167        let target = Persisted {
2168            user: None,
2169            automated: Some(NetstackVersion::Netstack3),
2170            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2171            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 1 }),
2172        };
2173
2174        let (persistence, persistence_wait) = AwaitPersisted::with_persisted(start, &target);
2175        let (crash_reporter, crash_reporter_wait) =
2176            AwaitCrashReports::new(vec![CrashReportReason::RolledBack]);
2177        let mut migration =
2178            Migration::new(persistence, FakeCollaborativeReboot::default(), crash_reporter);
2179        // No service requests.
2180        let service_request_stream = futures::stream::pending();
2181        // No Healthchecks.
2182        let healthcheck_tick = futures::stream::pending();
2183        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2184            panic!("This test isn't exercising healthchecks");
2185        });
2186        {
2187            let main_fut = main_inner(
2188                &mut migration,
2189                service_request_stream,
2190                mock_healthcheck,
2191                healthcheck_tick,
2192            )
2193            .fuse();
2194            futures::pin_mut!(main_fut);
2195            let mut wait = futures::future::join(persistence_wait, crash_reporter_wait);
2196            futures::select!(
2197                () = main_fut => unreachable!("main fut should never exit"),
2198                ((), ()) = wait => {}
2199            );
2200        }
2201
2202        // The number of Healthcheck Failures should have been reset in
2203        // preparation for the next migration attempt.
2204        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(0)));
2205        // The rollback history should have been updated to reflect this
2206        // failure.
2207        assert_eq!(
2208            migration.persisted.rollback_history,
2209            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 1 })
2210        );
2211        // A Collaborative Reboot request should be scheduled, so that we can
2212        // attempt the migration again.
2213        let cr_req = &migration.collaborative_reboot.scheduler.req;
2214        assert_eq!(Ok(false), cr_req.as_ref().expect("there should be a request").is_closed());
2215        // The current_boot should reflect that we've rolled back to NS2
2216        // temporarily.
2217        assert_eq!(migration.current_boot, RollbackNetstackVersion::ForceNetstack2);
2218        // A rollback crash report should have been filed.
2219        assert_eq!(migration.crash_reporter.reports, vec![CrashReportReason::RolledBack],);
2220    }
2221
2222    #[fuchsia::test]
2223    async fn startup_after_final_rollback() {
2224        let start = Persisted {
2225            user: None,
2226            automated: Some(NetstackVersion::Netstack3),
2227            rollback: Some(rollback::Persisted::HealthcheckFailures(
2228                rollback::MAX_FAILED_HEALTHCHECKS,
2229            )),
2230            rollback_history: Some(RollbackHistory {
2231                epoch: CURRENT_EPOCH,
2232                count: MAX_ROLLBACKS_PER_EPOCH - 1,
2233            }),
2234        };
2235        let target = Persisted {
2236            user: None,
2237            automated: Some(NetstackVersion::Netstack3),
2238            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2239            rollback_history: Some(RollbackHistory {
2240                epoch: CURRENT_EPOCH,
2241                count: MAX_ROLLBACKS_PER_EPOCH,
2242            }),
2243        };
2244
2245        let (persistence, persistence_wait) = AwaitPersisted::with_persisted(start, &target);
2246        let (crash_reporter, crash_reporter_wait) =
2247            AwaitCrashReports::new(vec![CrashReportReason::RolledBack]);
2248        let mut migration =
2249            Migration::new(persistence, FakeCollaborativeReboot::default(), crash_reporter);
2250        // No service requests.
2251        let service_request_stream = futures::stream::pending();
2252        // No Healthchecks.
2253        let healthcheck_tick = futures::stream::pending();
2254        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2255            panic!("This test isn't exercising healthchecks");
2256        });
2257        {
2258            let main_fut = main_inner(
2259                &mut migration,
2260                service_request_stream,
2261                mock_healthcheck,
2262                healthcheck_tick,
2263            )
2264            .fuse();
2265            futures::pin_mut!(main_fut);
2266            let mut wait = futures::future::join(persistence_wait, crash_reporter_wait);
2267            futures::select!(
2268                () = main_fut => unreachable!("main fut should never exit"),
2269                ((), ()) = wait => {}
2270            );
2271        }
2272
2273        // The number of Healthcheck Failures should have been reset in
2274        // preparation for the next migration attempt (in a later epoch).
2275        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(0)));
2276        // The rollback history should have been updated to reflect this
2277        // failure.
2278        assert_eq!(
2279            migration.persisted.rollback_history,
2280            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH })
2281        );
2282        // A collaborative Reboot request should NOT be scheduled. We're not
2283        // going to re-attempt the migration until the epoch advances.
2284        assert_eq!(migration.collaborative_reboot.scheduler.req, None);
2285        // The current_boot should reflect that we've rolled back to NS2
2286        // indefinitely.
2287        assert_eq!(migration.current_boot, RollbackNetstackVersion::ForceNetstack2Indefinitely);
2288        // A rollback crash report should have been filed.
2289        assert_eq!(migration.crash_reporter.reports, vec![CrashReportReason::RolledBack]);
2290    }
2291
2292    // This test simulates the device starting up while we were already
2293    // in the `ForceNetstack2Indefinitely` state. E.g. the current boot isn't
2294    // a rollback, rather it's continuing to use Netstack2, as established in
2295    // a previous boot.
2296    #[fuchsia::test]
2297    async fn startup_already_with_max_rollbacks_per_epoch() {
2298        let persistence = InMemory::with_persisted(Persisted {
2299            user: None,
2300            automated: Some(NetstackVersion::Netstack3),
2301            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2302            rollback_history: Some(RollbackHistory {
2303                epoch: CURRENT_EPOCH,
2304                count: MAX_ROLLBACKS_PER_EPOCH,
2305            }),
2306        });
2307        // No crash report should get filed because this boot isn't a rollback.
2308        let crash_reporter = NoCrashReports;
2309        let mut migration =
2310            Migration::new(persistence, FakeCollaborativeReboot::default(), crash_reporter);
2311        // No service requests.
2312        let service_request_stream = futures::stream::pending();
2313        // No Healthchecks.
2314        let healthcheck_tick = futures::stream::pending();
2315        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2316            panic!("This test isn't exercising healthchecks");
2317        });
2318
2319        // Drive the main future for a fixed timeout. It will never exit, but
2320        // we want to give the migration machinery enough time to apply any
2321        // changes it wants to make (it shouldn't make any).
2322        main_inner(&mut migration, service_request_stream, mock_healthcheck, healthcheck_tick)
2323            .map(Err)
2324            .on_timeout(Duration::from_secs(1), || Ok(()))
2325            .await
2326            .expect("main fut should never exit");
2327
2328        // None of the persisted state should have changed.
2329        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(0)));
2330        assert_eq!(
2331            migration.persisted.rollback_history,
2332            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: MAX_ROLLBACKS_PER_EPOCH })
2333        );
2334        // A collaborative Reboot request should NOT be scheduled.
2335        assert_eq!(migration.collaborative_reboot.scheduler.req, None);
2336        // The current_boot should reflect that we've rolled back to NS2
2337        // indefinitely.
2338        assert_eq!(migration.current_boot, RollbackNetstackVersion::ForceNetstack2Indefinitely);
2339    }
2340
2341    #[fuchsia::test]
2342    async fn startup_with_new_epoch() {
2343        let start = Persisted {
2344            user: None,
2345            automated: Some(NetstackVersion::Netstack3),
2346            rollback: Some(rollback::Persisted::HealthcheckFailures(0)),
2347            rollback_history: Some(RollbackHistory {
2348                epoch: CURRENT_EPOCH - 1,
2349                count: MAX_ROLLBACKS_PER_EPOCH,
2350            }),
2351        };
2352        let target = Persisted {
2353            user: None,
2354            automated: Some(NetstackVersion::Netstack3),
2355            rollback: Some(rollback::Persisted::HealthcheckFailures(1)),
2356            rollback_history: Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 }),
2357        };
2358
2359        let (persistence, mut wait) = AwaitPersisted::with_persisted(start, &target);
2360        let mut migration =
2361            Migration::new(persistence, FakeCollaborativeReboot::default(), NoCrashReports);
2362        // No service requests.
2363        let service_request_stream = futures::stream::pending();
2364        // No Healthchecks.
2365        let healthcheck_tick = futures::stream::pending();
2366        let mock_healthcheck = rollback::testutil::MockHttpRequester(|| {
2367            panic!("This test isn't exercising healthchecks");
2368        });
2369        {
2370            let main_fut = main_inner(
2371                &mut migration,
2372                service_request_stream,
2373                mock_healthcheck,
2374                healthcheck_tick,
2375            )
2376            .fuse();
2377            futures::pin_mut!(main_fut);
2378            futures::select!(
2379                () = main_fut => unreachable!("main fut should never exit"),
2380                () = wait => {}
2381            );
2382        }
2383
2384        // The number of Healthcheck Failures should have been incremented,
2385        // indicating the rollback mechanism is trying to migrate to NS3.
2386        assert_eq!(migration.persisted.rollback, Some(rollback::Persisted::HealthcheckFailures(1)));
2387        // The rollback history should have been updated to reflect the new
2388        // epoch.
2389        assert_eq!(
2390            migration.persisted.rollback_history,
2391            Some(RollbackHistory { epoch: CURRENT_EPOCH, count: 0 })
2392        );
2393        // A collaborative Reboot request should NOT be scheduled. We're already
2394        // running Nestack3!
2395        assert_eq!(migration.collaborative_reboot.scheduler.req, None);
2396        // The current_boot should reflect that we're retrying migrations to NS3.
2397        assert_eq!(migration.current_boot, RollbackNetstackVersion::Netstack3);
2398    }
2399
2400    #[fuchsia::test]
2401    async fn noop_rollback_state_update() {
2402        let rollback_state = rollback::Persisted::HealthcheckFailures(
2403            HEALTHCHECK_FAILURE_THRESHOLD_FOR_CRASH_REPORTS,
2404        );
2405        let mut m = Migration::new(
2406            InMemory::with_persisted(Persisted {
2407                user: None,
2408                automated: Some(NetstackVersion::Netstack3),
2409                rollback: Some(rollback_state),
2410                rollback_history: None,
2411            }),
2412            // Expect not to see a Collaborative Reboot or a Crash Report.
2413            NoCollaborativeReboot,
2414            NoCrashReports,
2415        );
2416        m.update_rollback_state(rollback_state).await;
2417    }
2418}