// starnix_core/perf/mod.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::task::dynamic_thread_spawner::SpawnRequestBuilder;
6use anyhow::Context;
7use fidl_fuchsia_cpu_profiler as profiler;
8use fuchsia_component::client::connect_to_protocol;
9use futures::StreamExt;
10use futures::channel::mpsc as future_mpsc;
11use regex_lite::Regex;
12use std::collections::HashMap;
13use std::error::Error;
14use std::sync::atomic::{AtomicU64, Ordering};
15use std::sync::{Arc, OnceLock, mpsc as sync_mpsc};
16use zerocopy::{Immutable, IntoBytes};
17use zx::HandleBased;
18
19use futures::io::{AsyncReadExt, Cursor};
20use fxt::TraceRecord;
21use fxt::profiler::ProfilerRecord;
22use fxt::session::SessionParser;
23use seq_lock::{SeqLock, SeqLockable, WriteSize};
24use starnix_logging::{log_info, log_warn, track_stub};
25use starnix_sync::{FileOpsCore, Locked, Mutex, RwLock, Unlocked};
26use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
27use starnix_uapi::arch32::{
28    PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE, PERF_EVENT_IOC_ID,
29    PERF_EVENT_IOC_MODIFY_ATTRIBUTES, PERF_EVENT_IOC_PAUSE_OUTPUT, PERF_EVENT_IOC_PERIOD,
30    PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_REFRESH, PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_SET_BPF,
31    PERF_EVENT_IOC_SET_FILTER, PERF_EVENT_IOC_SET_OUTPUT, PERF_RECORD_MISC_KERNEL,
32    perf_event_sample_format_PERF_SAMPLE_CALLCHAIN, perf_event_sample_format_PERF_SAMPLE_ID,
33    perf_event_sample_format_PERF_SAMPLE_IDENTIFIER, perf_event_sample_format_PERF_SAMPLE_IP,
34    perf_event_sample_format_PERF_SAMPLE_PERIOD, perf_event_sample_format_PERF_SAMPLE_TID,
35    perf_event_type_PERF_RECORD_SAMPLE,
36};
37use starnix_uapi::errors::Errno;
38use starnix_uapi::open_flags::OpenFlags;
39use starnix_uapi::user_address::UserRef;
40use starnix_uapi::{
41    error, from_status_like_fdio, perf_event_attr, perf_event_header,
42    perf_event_mmap_page__bindgen_ty_1, perf_event_read_format_PERF_FORMAT_GROUP,
43    perf_event_read_format_PERF_FORMAT_ID, perf_event_read_format_PERF_FORMAT_LOST,
44    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED,
45    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING, tid_t, uapi,
46};
47
48use crate::security::{self, TargetTaskType};
49use crate::task::{Kernel, LockedAndTask};
50
// Source of unique IDs handed out for read_format / sample identification
// (see `PerfState::format_id_lookup_table`).
static READ_FORMAT_ID_GENERATOR: AtomicU64 = AtomicU64::new(0);
// Default buffer size to read from socket (for sampling data).
const DEFAULT_CHUNK_SIZE: usize = 4096;
// 4096 * 10, page size * 10.
// If tests flake due to running out of buffer space, or if the profiling duration is
// significantly increased, this buffer size may need further adjustment (expansion).
const ESTIMATED_MMAP_BUFFER_SIZE: u64 = 40960;
// perf_event_header struct size: 32 + 16 + 16 = 64 bits = 8 bytes.
const PERF_EVENT_HEADER_SIZE: u16 = 8;
// FXT magic bytes (little endian). Used to distinguish FXT-format profiler
// output from plain-text output when reading from the socket.
const FXT_MAGIC_BYTES: [u8; 8] = [0x10, 0x00, 0x04, 0x46, 0x78, 0x54, 0x16, 0x00];
62
63mod event;
64pub use event::{TraceEvent, TraceEventQueue, TraceEventQueueList};
65
// Header half of the SeqLock'd metadata page written via `create_seq_lock`.
// NOTE(review): presumably mirrors the leading `version`/`compat_version`
// fields of Linux's perf_event_mmap_page — confirm against the uapi struct.
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataHeader {
    version: u32,
    compat_version: u32,
}
72
// Value half of the SeqLock'd metadata page. Field names match Linux's
// perf_event_mmap_page; `data_head` is updated as samples are written to the
// data VMO (see `stop_and_collect_samples`).
// NOTE(review): assumed to be layout-compatible with perf_event_mmap_page —
// TODO confirm sizes/offsets against the uapi definition.
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataValue {
    // Seqlock word readers use to detect torn reads (see SeqLockable impl).
    lock: u32,
    index: u32,
    offset: i64,
    time_enabled: u64,
    time_running: u64,
    __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1,
    pmc_width: u16,
    time_shift: u16,
    time_mult: u32,
    time_offset: u64,
    time_zero: u64,
    size: u32,
    __reserved_1: u32,
    time_cycles: u64,
    time_mask: u64,
    __reserved: [u8; 928usize],
    // Points at the head of the data section; advanced after each record write.
    data_head: u64,
    data_tail: u64,
    data_offset: u64,
    data_size: u64,
    aux_head: u64,
    aux_tail: u64,
    aux_offset: u64,
    aux_size: u64,
}
101
// SAFETY: `PerfMetadataValue` can be safely written to shared memory in 8-byte chunks.
// This is because it is composed of two u32s followed by only u64s.
// The first u32 is the `lock` field, which is why HAS_INLINE_SEQUENCE is true.
// NOTE(review): the struct also contains u16/u32 fields after the leading
// pair of u32s (`pmc_width`, `time_shift`, `time_mult`, `size`,
// `__reserved_1`) — confirm the 8-byte-chunk soundness argument covers them.
unsafe impl SeqLockable for PerfMetadataValue {
    const WRITE_SIZE: WriteSize = WriteSize::Eight;
    const HAS_INLINE_SEQUENCE: bool = true;
    const VMO_NAME: &'static [u8] = b"starnix:perf_event";
}
110
// Kernel-wide perf bookkeeping, stored in the kernel's expando
// (see `get_perf_state`). Entries are removed in `PerfEventFile::close`.
struct PerfState {
    // This table maps a group leader's file object id to its unique u64 "format ID".
    //
    // When a sample is generated for any event in a group, we use this
    // "format ID" from the group leader as the value for *both* the
    // `PERF_SAMPLE_ID` and `PERF_SAMPLE_IDENTIFIER` fields.
    format_id_lookup_table: Mutex<HashMap<FileObjectId, u64>>,
}
119
120impl Default for PerfState {
121    fn default() -> Self {
122        Self { format_id_lookup_table: Mutex::new(HashMap::new()) }
123    }
124}
125
// Returns the kernel-wide `PerfState`, lazily creating it on first access
// via the kernel's expando.
fn get_perf_state(kernel: &Arc<Kernel>) -> Arc<PerfState> {
    kernel.expando.get_or_init(PerfState::default)
}
129
// Build-time check that these `perf_event_attr` fields share the same layout
// across all supported architectures.
uapi::check_arch_independent_layout! {
    perf_event_attr {
        type_, // "type" is a reserved keyword so add a trailing underscore.
        size,
        config,
        __bindgen_anon_1,
        sample_type,
        read_format,
        _bitfield_1,
        __bindgen_anon_2,
        bp_type,
        __bindgen_anon_3,
        __bindgen_anon_4,
        branch_sample_type,
        sample_regs_user,
        sample_stack_user,
        clockid,
        sample_regs_intr,
        aux_watermark,
        sample_max_stack,
        __reserved_2,
        aux_sample_size,
        __reserved_3,
        sig_data,
        config3,
    }
}
157
// Commands forwarded over `PerfEventFileState::ioctl_sender` to the sampling
// task to start or stop sample collection.
#[derive(Clone, Copy, Debug, PartialEq)]
enum IoctlOp {
    Enable,
    Disable,
}
163
// Mutable per-event state guarded by `PerfEventFile::perf_event_file`.
struct PerfEventFileState {
    // The perf_event_attr supplied when the event was opened.
    attr: perf_event_attr,
    rf_value: u64, // "count" for the config we passed in for the event.
    // The most recent timestamp (ns) where we changed into an enabled state
    // i.e. the most recent time we got an ENABLE ioctl().
    most_recent_enabled_time: u64,
    // Sum of all previous enablement segment durations (ns). If we are
    // currently in an enabled state, explicitly does NOT include the current
    // segment.
    total_time_running: u64,
    // ID emitted for PERF_FORMAT_ID reads (see `read()`).
    rf_id: u64,
    // ID written into samples for PERF_SAMPLE_ID / PERF_SAMPLE_IDENTIFIER.
    sample_id: u64,
    // Lost-sample count; presumably for PERF_FORMAT_LOST — currently unused.
    _rf_lost: u64,
    // 0 = enabled, nonzero = disabled; toggled by ENABLE/DISABLE ioctls.
    disabled: u64,
    // Bitmask of requested PERF_SAMPLE_* flags (attr.sample_type).
    sample_type: u64,
    // Handle to blob that stores all the perf data that a user may want.
    // At the moment it only stores some metadata and backtraces (bts).
    perf_data_vmo: zx::Vmo,
    // Channel used to send IoctlOps to start/stop sampling.
    ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
}
185
186// Have an implementation for PerfEventFileState because VMO
187// doesn't have Default so we can't derive it.
188impl PerfEventFileState {
189    fn new(
190        attr: perf_event_attr,
191        rf_value: u64,
192        disabled: u64,
193        sample_type: u64,
194        perf_data_vmo: zx::Vmo,
195        ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
196    ) -> PerfEventFileState {
197        PerfEventFileState {
198            attr,
199            rf_value,
200            most_recent_enabled_time: 0,
201            total_time_running: 0,
202            rf_id: 0,
203            sample_id: 0,
204            _rf_lost: 0,
205            disabled,
206            sample_type,
207            perf_data_vmo,
208            ioctl_sender,
209        }
210    }
211}
212
// Per-fd state backing a perf_event_open() file descriptor.
pub struct PerfEventFile {
    // Presumably the tid argument passed at open time — currently unused.
    _tid: tid_t,
    // Presumably the cpu argument passed at open time — currently unused.
    _cpu: i32,
    // Mutable event state (counters, timing, attr, data VMO, sampling channel).
    perf_event_file: RwLock<PerfEventFileState>,
    // The security state for this PerfEventFile.
    pub security_state: security::PerfEventState,
    // Lazily initialized on the first get_memory() call; caches either the
    // SeqLock over the metadata page or the initialization error.
    seq_lock: Arc<OnceLock<Result<SeqLock<PerfMetadataHeader, PerfMetadataValue>, Errno>>>,
}
221
// PerfEventFile object that implements FileOps.
// See https://man7.org/linux/man-pages/man2/perf_event_open.2.html for
// implementation details.
// This object can be saved as a FileDescriptor.
impl FileOps for PerfEventFile {
    // Don't need to implement seek or sync for PerfEventFile.
    fileops_impl_nonseekable!();
    fileops_impl_noop_sync!();

    // Removes this file's entry from the kernel-wide format-ID table so stale
    // IDs don't accumulate after the fd is closed.
    fn close(
        self: Box<Self>,
        _locked: &mut Locked<FileOpsCore>,
        file: &FileObjectState,
        current_task: &CurrentTask,
    ) {
        let perf_state = get_perf_state(&current_task.kernel);
        let mut events = perf_state.format_id_lookup_table.lock();
        events.remove(&file.id);
    }

    // See "Reading results" section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html.
    fn read(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        _offset: usize,
        data: &mut dyn OutputBuffer,
    ) -> Result<usize, Errno> {
        // Create/calculate and return the ReadFormatData object.
        // If we create it earlier we might want to change it and it's immutable once created.
        let read_format_data = {
            // Once we get the `value` or count from kernel, we can change this to a read()
            // call instead of write().
            let mut perf_event_file = self.perf_event_file.write();

            security::check_perf_event_read_access(current_task, &self)?;

            let mut total_time_running_including_curr = perf_event_file.total_time_running;

            // Only update values if enabled (either by perf_event_attr or ioctl ENABLE call).
            if perf_event_file.disabled == 0 {
                // Calculate the value or "count" of the config we're interested in.
                // This value should reflect the value we are counting (defined in the config).
                // E.g. for PERF_COUNT_SW_CPU_CLOCK it would return the value from the CPU clock.
                // For now we just return rf_value + 1.
                track_stub!(
                    TODO("https://fxbug.dev/402938671"),
                    "[perf_event_open] implement read_format value"
                );
                perf_event_file.rf_value += 1;

                // Update time duration.
                let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
                total_time_running_including_curr +=
                    curr_time - perf_event_file.most_recent_enabled_time;
            }

            // Serialize as native-endian u64s: value first, then the optional
            // fields requested via attr.read_format.
            let mut output = Vec::<u8>::new();
            let value = perf_event_file.rf_value.to_ne_bytes();
            output.extend(value);

            let read_format = perf_event_file.attr.read_format;

            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 {
                // Total time (ns) event was enabled and running (currently same as TIME_RUNNING).
                output.extend(total_time_running_including_curr.to_ne_bytes());
            }
            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 {
                // Total time (ns) event was enabled and running (currently same as TIME_ENABLED).
                output.extend(total_time_running_including_curr.to_ne_bytes());
            }
            if (read_format & perf_event_read_format_PERF_FORMAT_ID as u64) != 0 {
                // Adds a 64-bit unique value that corresponds to the event group.
                output.extend(perf_event_file.rf_id.to_ne_bytes());
            }

            output
        };

        // The regular read() call allows the case where the bytes-we-want-to-read-in won't
        // fit in the output buffer. However, for perf_event_open's read(), "If you attempt to read
        // into a buffer that is not big enough to hold the data, the error ENOSPC results."
        if data.available() < read_format_data.len() {
            return error!(ENOSPC);
        }
        track_stub!(
            TODO("https://fxbug.dev/402453955"),
            "[perf_event_open] implement remaining error handling"
        );

        data.write(&read_format_data)
    }

    // Handles PERF_EVENT_IOC_* requests. ENABLE/DISABLE toggle counting state
    // and, for period-based sampling, ping the sampling task; RESET zeroes the
    // counter; the remaining recognized ops are unimplemented stubs (ENOSYS).
    fn ioctl(
        &self,
        _locked: &mut Locked<Unlocked>,
        _file: &FileObject,
        current_task: &CurrentTask,
        op: u32,
        _arg: SyscallArg,
    ) -> Result<SyscallResult, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/405463320"),
            "[perf_event_open] implement PERF_IOC_FLAG_GROUP"
        );
        security::check_perf_event_write_access(current_task, &self)?;
        let mut perf_event_file = self.perf_event_file.write();
        match op {
            PERF_EVENT_IOC_ENABLE => {
                if perf_event_file.disabled != 0 {
                    perf_event_file.disabled = 0; // 0 = false.
                    perf_event_file.most_recent_enabled_time =
                        zx::MonotonicInstant::get().into_nanos() as u64;
                }

                // If we are sampling, invoke the profiler and collect a sample.
                // Currently this is an example sample collection.
                track_stub!(
                    TODO("https://fxbug.dev/398914921"),
                    "[perf_event_open] implement full sampling features"
                );
                if perf_event_file.attr.freq() == 0
                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
                // This is always sound regardless of the union's tag.
                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
                {
                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Enable);
                }
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_DISABLE => {
                if perf_event_file.disabled == 0 {
                    perf_event_file.disabled = 1; // 1 = true.

                    // Update total_time_running now that the segment has ended.
                    let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
                    perf_event_file.total_time_running +=
                        curr_time - perf_event_file.most_recent_enabled_time;
                }
                if perf_event_file.attr.freq() == 0
                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
                // This is always sound regardless of the union's tag.
                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
                {
                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Disable);
                }
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_RESET => {
                perf_event_file.rf_value = 0;
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_REFRESH
            | PERF_EVENT_IOC_PERIOD
            | PERF_EVENT_IOC_SET_OUTPUT
            | PERF_EVENT_IOC_SET_FILTER
            | PERF_EVENT_IOC_ID
            | PERF_EVENT_IOC_SET_BPF
            | PERF_EVENT_IOC_PAUSE_OUTPUT
            | PERF_EVENT_IOC_MODIFY_ATTRIBUTES
            | PERF_EVENT_IOC_QUERY_BPF => {
                track_stub!(
                    TODO("https://fxbug.dev/404941053"),
                    "[perf_event_open] implement remaining ioctl() calls"
                );
                return error!(ENOSYS);
            }
            _ => error!(ENOTTY),
        }
    }

    // TODO(https://fxbug.dev/460245383) match behavior when mmap() is called multiple times.
    // Gets called when mmap() is called.
    // Immediately before sampling, this should get called by the user (e.g. the test
    // or Perfetto). We will then write the metadata to the VMO and return the pointer to it.
    fn get_memory(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        length: Option<usize>,
        _prot: ProtectionFlags,
    ) -> Result<Arc<MemoryObject>, Errno> {
        let buffer_size: u64 = length.unwrap_or(0) as u64;
        if buffer_size == 0 {
            return error!(EINVAL);
        }

        // Create the SeqLock over the data VMO exactly once; later calls reuse
        // the first result (including a cached initialization error).
        self.seq_lock
            .get_or_init(|| {
                let perf_event_file = self.perf_event_file.read();
                let vmo_copy = perf_event_file
                    .perf_data_vmo
                    .as_handle_ref()
                    .duplicate(zx::Rights::SAME_RIGHTS)
                    .map_err(|status| from_status_like_fdio!(status))?;
                // SAFETY: See safety requirements on `create_seq_lock`.
                Ok(unsafe { create_seq_lock(&vmo_copy, buffer_size) })
            })
            .as_ref()
            .map_err(|e| e.clone())?;

        // Write to a MemoryObject and return it (expected return type for get_memory()).
        security::check_perf_event_read_access(current_task, &self)?;
        let perf_event_file = self.perf_event_file.read();
        match perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS) {
            Ok(vmo) => {
                let vmo: zx::Vmo = vmo.into();
                let memory = MemoryObject::from(vmo);
                return Ok(Arc::new(memory));
            }
            Err(_) => {
                track_stub!(
                    TODO("https://fxbug.dev/416323134"),
                    "[perf_event_open] handle get_memory() errors"
                );
                return error!(EINVAL);
            }
        };
    }

    // Writing to a perf event fd is not implemented; always returns ENOSYS.
    fn write(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        _offset: usize,
        _data: &mut dyn InputBuffer,
    ) -> Result<usize, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/394960158"),
            "[perf_event_open] implement perf event functions"
        );
        error!(ENOSYS)
    }
}
459
460// Given a PerfRecordSample struct, write it via the correct output format
461// (per https://man7.org/linux/man-pages/man2/perf_event_open.2.html) to the VMO.
462// We don't currently support all the sample_types listed in the docs.
463// Input:
464//    PerfRecordSample { pid: 5, tid: 10, nr: 3, ips[nr]: [111, 222, 333] }
465// Human-understandable output:
466//    9 1 40 111 5 10 3 111 222 333
467// Actual output (no spaces or \n in real output, just making it more readable):
468//    0x0000 0x0009                 <-- starts at `offset` bytes
469//    0x0001
470//    0x0040
471//    0x0000 0x0000 0x0000 0x006F   <-- starts at `offset` + 8 bytes
472//    0x0000 0x0000 0x0000 0x0005
473//    0x0000 0x0000 0x0000 0x0010
474//    0x0000 0x0000 0x0000 0x0003
475//    0x0000 0x0000 0x0000 0x006F
476//    0x0000 0x0000 0x0000 0x00DE
477//    0x0000 0x0000 0x0000 0x014D
478//
479//    Returns the length of bytes written. In above case, 8 + 28 = 36.
480//    This information is used to increment the global offset.
fn write_record_to_vmo(
    perf_record_sample: PerfRecordSample,
    perf_data_vmo: &zx::Vmo,
    sample_type: u64,
    sample_id: u64,
    sample_period: u64,
    offset: u64,
) -> u64 {
    // Write header.
    track_stub!(
        TODO("https://fxbug.dev/432501467"),
        "[perf_event_open] determines whether the record is KERNEL or USER"
    );
    // NOTE(review): Linux's perf ABI defines `size` as the total record length
    // (header + sample body), not just the 8-byte header — confirm whether
    // readers of this buffer depend on the current fixed value.
    let perf_event_header = perf_event_header {
        type_: perf_event_type_PERF_RECORD_SAMPLE,
        misc: PERF_RECORD_MISC_KERNEL as u16,
        size: PERF_EVENT_HEADER_SIZE,
    };

    // Total data offset. This is where the record should start getting written.
    // The first page is reserved for metadata, so we need to add the page size.
    // Example:
    //  You're writing the first record (size 100). Start writing at 0 + 4096.
    //  You're writing the second record. Start writing at 100 + 4096.
    let data_offset = offset + (zx::system_get_page_size() as u64);

    match perf_data_vmo.write(&perf_event_header.as_bytes(), data_offset) {
        Ok(_) => (),
        Err(e) => log_warn!("Failed to write perf_event_header: {}", e),
    }

    // Write sample. Fields are appended in the order mandated by the
    // PERF_RECORD_SAMPLE layout, gated on the requested sample_type bits.
    let mut sample = Vec::<u8>::new();
    // sample_id
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IDENTIFIER as u64) != 0 {
        sample.extend(sample_id.to_ne_bytes());
    }
    // ip
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IP as u64) != 0 {
        sample.extend(perf_record_sample.ips[0].to_ne_bytes());
    }

    if (sample_type & perf_event_sample_format_PERF_SAMPLE_TID as u64) != 0 {
        // pid
        sample.extend(perf_record_sample.pid.expect("missing pid").to_ne_bytes());
        // tid
        sample.extend(perf_record_sample.tid.expect("missing tid").to_ne_bytes());
    }

    // id
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_ID as u64) != 0 {
        sample.extend(sample_id.to_ne_bytes());
    }

    // sample period
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_PERIOD as u64) != 0 {
        sample.extend(sample_period.to_ne_bytes());
    }

    if (sample_type & perf_event_sample_format_PERF_SAMPLE_CALLCHAIN as u64) != 0 {
        // nr
        // NOTE(review): serializes `len()` as usize — assumes a 64-bit target
        // so this matches the 8-byte `nr` field; confirm for 32-bit builds.
        sample.extend(perf_record_sample.ips.len().to_ne_bytes());

        // ips[nr] - list of ips, u64 per ip.
        for i in perf_record_sample.ips {
            sample.extend(i.to_ne_bytes());
        }
    }
    // The remaining data are not defined for now.

    match perf_data_vmo
        .write(&sample, data_offset + (std::mem::size_of::<perf_event_header>() as u64))
    {
        Ok(_) => {
            let bytes_written: u64 =
                (std::mem::size_of::<perf_event_header>() + sample.len()) as u64;
            // Return the total size we wrote (header + sample) so that we can
            // increment offset counter.
            return bytes_written;
        }
        Err(e) => {
            log_warn!("Failed to write PerfRecordSample to VMO due to: {}", e);
            // Failed to write. Don't increment offset counter.
            return 0;
        }
    }
}
568
// In-memory form of one PERF_RECORD_SAMPLE before it is serialized into the
// data VMO by `write_record_to_vmo`.
#[derive(Debug, Clone)]
struct PerfRecordSample {
    // Process id of the sampled task, if known.
    pid: Option<u32>,
    // Thread id of the sampled task, if known.
    tid: Option<u32>,
    // Instruction pointers (currently this is the address). First one is `ip` param.
    ips: Vec<u64>,
}
576
577// Parses a backtrace (bt) to obtain the params for a PerfRecordSample. Example:
578//
579// 1234                     pid
580// 5555                     tid
581// {{{bt:0:0x1111:pc}}}    {{{bt:frame_number:address:type}}}
582// {{{bt:1:0x2222:ra}}}
583// {{{bt:2:0x3333:ra}}}
584//
585// Results in:
586// PerfRecordSample { pid: 1234, tid: 5555, nr: 3, ips: [0x1111, 0x2222, 0x3333] }
587
588fn parse_perf_record_sample_format(backtrace: &str) -> Option<PerfRecordSample> {
589    let mut pid: Option<u32> = None;
590    let mut tid: Option<u32> = None;
591    let mut ips: Vec<u64> = Vec::new();
592    let mut numbers_found = 0;
593    track_stub!(TODO("https://fxbug.dev/437171287"), "[perf_event_open] handle regex nuances");
594    let backtrace_regex =
595        Regex::new(r"^\s*\{\{\{bt:\d+:((0x[0-9a-fA-F]+)):(?:pc|ra)\}\}\}\s*$").unwrap();
596
597    for line in backtrace.lines() {
598        let trimmed_line = line.trim();
599        // Try to parse as a raw number (for PID/TID).
600        if numbers_found < 2 {
601            if let Ok(num) = trimmed_line.parse::<u32>() {
602                if numbers_found == 0 {
603                    pid = Some(num);
604                } else {
605                    tid = Some(num);
606                }
607                numbers_found += 1;
608                continue;
609            }
610        }
611
612        // Try to parse as a backtrace line.
613        if let Some(parsed_bt) = backtrace_regex.captures(trimmed_line) {
614            let address_str = parsed_bt.get(1).unwrap().as_str();
615            if let Ok(ip_addr) = u64::from_str_radix(address_str.trim_start_matches("0x"), 16) {
616                ips.push(ip_addr);
617            }
618        }
619    }
620
621    if pid == None || tid == None || ips.is_empty() {
622        // This data chunk might've been an {{{mmap}}} chunk, and not a {{{bt}}}.
623        log_info!("No ips while getting PerfRecordSample");
624        None
625    } else {
626        Some(PerfRecordSample { pid: pid, tid: tid, ips: ips })
627    }
628}
629
630async fn set_up_profiler(
631    sample_period: zx::MonotonicDuration,
632) -> Result<(profiler::SessionProxy, fidl::AsyncSocket), Errno> {
633    // Configuration for how we want to sample.
634    let sample = profiler::Sample {
635        callgraph: Some(profiler::CallgraphConfig {
636            strategy: Some(profiler::CallgraphStrategy::FramePointer),
637            ..Default::default()
638        }),
639        ..Default::default()
640    };
641
642    let sampling_config = profiler::SamplingConfig {
643        period: Some(sample_period.into_nanos() as u64),
644        timebase: Some(profiler::Counter::PlatformIndependent(profiler::CounterId::Nanoseconds)),
645        sample: Some(sample),
646        ..Default::default()
647    };
648
649    let tasks = vec![
650        // Should return ~300 samples for 100 millis.
651        profiler::Task::SystemWide(profiler::SystemWide {}),
652    ];
653    let targets = profiler::TargetConfig::Tasks(tasks);
654    let config = profiler::Config {
655        configs: Some(vec![sampling_config]),
656        target: Some(targets),
657        ..Default::default()
658    };
659    let (client, server) = fidl::Socket::create_stream();
660    let configure = profiler::SessionConfigureRequest {
661        output: Some(server),
662        config: Some(config),
663        ..Default::default()
664    };
665
666    let proxy = connect_to_protocol::<profiler::SessionMarker>()
667        .context("Error connecting to Profiler protocol");
668    let session_proxy: profiler::SessionProxy = match proxy {
669        Ok(p) => p.clone(),
670        Err(e) => return error!(EINVAL, e),
671    };
672
673    // Must configure before sampling start().
674    let config_request = session_proxy.configure(configure).await;
675    match config_request {
676        Ok(_) => Ok((session_proxy, fidl::AsyncSocket::from_socket(client))),
677        Err(e) => return error!(EINVAL, e),
678    }
679}
680
681// Collects samples and puts backtrace in VMO.
682// - Reads in the buffer from the socket for that duration in chunks.
683// - Parses the buffer backtraces into PERF_RECORD_SAMPLE format.
684// - Writes the PERF_RECORD_SAMPLE into VMO.
685async fn stop_and_collect_samples(
686    session_proxy: profiler::SessionProxy,
687    mut client: fidl::AsyncSocket,
688    seq_lock: &OnceLock<Result<SeqLock<PerfMetadataHeader, PerfMetadataValue>, Errno>>,
689    perf_data_vmo: &zx::Vmo,
690    sample_type: u64,
691    sample_id: u64,
692    sample_period: u64,
693    vmo_write_offset: &mut u64,
694) -> Result<(), Errno> {
695    let stats = session_proxy.stop().await;
696
697    let seq_lock_wrapper = match seq_lock.get() {
698        Some(Ok(l)) => l,
699        // Initialization failed in a previous mmap() call. Propagate the error.
700        Some(Err(e)) => return Err(e.clone()),
701        // Not initialized yet (i.e. mmap() hasn't been called). Skip updating metadata.
702        None => return Ok(()),
703    };
704
705    let samples_collected = match stats {
706        Ok(stats) => stats.samples_collected.unwrap(),
707        Err(e) => return error!(EINVAL, e),
708    };
709
710    track_stub!(
711        TODO("https://fxbug.dev/422502681"),
712        "[perf_event_open] symbolize sample output and delete the below log_info"
713    );
714    log_info!("profiler samples_collected: {:?}", samples_collected);
715
716    // Peek at the first 8 bytes to determine if it's FXT or text.
717    let mut header = [0; 8];
718    let mut bytes_read = 0;
719    while bytes_read < 8 {
720        match client.read(&mut header[bytes_read..]).await {
721            Ok(0) => {
722                // Peer closed the socket. This is the normal end of the stream.
723                log_info!("[perf_event_open] Finished reading fxt record from socket.");
724                break;
725            }
726            Ok(n) => bytes_read += n,
727            Err(e) => {
728                log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
729                break;
730            }
731        }
732    }
733
734    if bytes_read > 0 {
735        if bytes_read == 8 && header == FXT_MAGIC_BYTES {
736            // FXT format.
737            let header_cursor = Cursor::new(header);
738            let reader = header_cursor.chain(client);
739            let (mut stream, _task) = SessionParser::new_async(reader);
740            while let Some(record_result) = stream.next().await {
741                match record_result {
742                    Ok(TraceRecord::Profiler(ProfilerRecord::Backtrace(backtrace))) => {
743                        let ips: Vec<u64> = backtrace.data;
744                        let pid = Some(backtrace.process.0 as u32);
745                        let tid = Some(backtrace.thread.0 as u32);
746                        let perf_record_sample = PerfRecordSample { pid, tid, ips };
747                        let bytes_written = write_record_to_vmo(
748                            perf_record_sample,
749                            perf_data_vmo,
750                            sample_type,
751                            sample_id,
752                            sample_period,
753                            *vmo_write_offset,
754                        );
755                        // Update data_head after writing sample.
756                        if bytes_written > 0 {
757                            *vmo_write_offset += bytes_written;
758                            let mut metadata = seq_lock_wrapper.get();
759                            metadata.data_head = *vmo_write_offset;
760                            seq_lock_wrapper.set_value(metadata);
761                        }
762                    }
763                    Ok(_) => {
764                        // Ignore other records.
765                    }
766                    Err(e) => {
767                        log_warn!("[perf_event_open] Error parsing FXT: {:?}", e);
768                        break;
769                    }
770                }
771            }
772        } else {
773            // Text format.
774            // Read chunks of sampling data from socket in this buffer temporarily. We will parse
775            // the data and write it into the output VMO (the one mmap points to).
776            let mut buffer = vec![0; DEFAULT_CHUNK_SIZE];
777
778            loop {
779                // Attempt to read data. This awaits until data is available, EOF, or error.
780                // Ignore the first 8 bytes as it's the {{{reset}}} marker.
781                let socket_data = client.read(&mut buffer).await;
782
783                match socket_data {
784                    Ok(0) => {
785                        // Peer closed the socket. This is the normal end of the stream.
786                        log_info!("[perf_event_open] Finished reading from socket.");
787                        break;
788                    }
789                    Ok(bytes_read) => {
790                        // Receive data in format {{{...}}}.
791                        let received_data = match std::str::from_utf8(&buffer[..bytes_read]) {
792                            Ok(data) => data,
793                            Err(e) => return error!(EINVAL, e),
794                        };
795                        // Parse data to PerfRecordSample struct.
796                        if let Some(perf_record_sample) =
797                            parse_perf_record_sample_format(received_data)
798                        {
799                            let bytes_written = write_record_to_vmo(
800                                perf_record_sample,
801                                perf_data_vmo,
802                                sample_type,
803                                sample_id,
804                                sample_period,
805                                *vmo_write_offset,
806                            );
807                            // Update data_head after writing sample.
808                            if bytes_written > 0 {
809                                *vmo_write_offset += bytes_written;
810                                let mut metadata = seq_lock_wrapper.get();
811                                metadata.data_head = *vmo_write_offset;
812                                seq_lock_wrapper.set_value(metadata);
813                            }
814                        }
815                    }
816                    Err(e) => {
817                        log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
818                        break;
819                    }
820                }
821            }
822        }
823    }
824
825    let reset_status = session_proxy.reset().await;
826    return match reset_status {
827        Ok(_) => Ok(()),
828        Err(e) => error!(EINVAL, e),
829    };
830}
831
832// Notifies other thread that we should start/stop sampling.
833// Once sampling is complete, that profiler session is no longer needed.
834// At that point, send back notification so that this is no longer blocking
835// (e.g. so that other profiler sessions can start).
836fn ping_receiver(
837    mut ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
838    command: IoctlOp,
839) {
840    log_info!("[perf_event_open] Received sampling command: {:?}", command);
841    let (profiling_complete_sender, profiling_complete_receiver) = sync_mpsc::channel::<()>();
842    match ioctl_sender.try_send((command, profiling_complete_sender)) {
843        Ok(_) => (),
844        Err(e) => {
845            if e.is_full() {
846                log_warn!("[perf_event_open] Failed to send {:?}: Channel full", command);
847            } else if e.is_disconnected() {
848                log_warn!("[perf_event_open] Failed to send {:?}: Receiver disconnected", command);
849            } else {
850                log_warn!("[perf_event_open] Failed to send {:?} due to {:?}", command, e.source());
851            }
852        }
853    };
854    // Block on / wait until profiling is complete before returning.
855    // This notifies that the profiler is free to be used for another session.
856    let _ = profiling_complete_receiver.recv().unwrap();
857}
858
859// Creates a seq lock for the given VMO. Initializes the seq lock with
860// known initial values (unknown values default to 0).
861// Does NOT actually save this as a memory object until mmap() is called.
862//
863// # Safety
864//
865// The caller must ensure that the kernel maintains exclusive write access to this VMO and
866// there are only atomic accesses to this memory (see seq_lock lib.rs for details).
867unsafe fn create_seq_lock(
868    vmo_handle_ref: &zx::NullableHandle,
869    buffer_size: u64,
870) -> SeqLock<PerfMetadataHeader, PerfMetadataValue> {
871    // Currently we hardcode everything just to get something E2E working.
872    let metadata_header = PerfMetadataHeader { version: 1, compat_version: 2 };
873    let page_size = zx::system_get_page_size() as u64;
874    let metadata_value = PerfMetadataValue {
875        lock: 0,
876        index: 3,
877        offset: 19337,
878        time_enabled: 0,
879        time_running: 0,
880        __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1 { capabilities: 30 },
881        pmc_width: 0,
882        time_shift: 0,
883        time_mult: 0,
884        time_offset: 0,
885        time_zero: 0,
886        size: 0,
887        __reserved_1: 0,
888        time_cycles: 0,
889        time_mask: 0,
890        __reserved: [0; 928usize],
891        // This first page (metadata) has finished writing. Start data_head at 0.
892        data_head: 0,
893        // Start reading from 0; it is the user's responsibility to increment on their end.
894        data_tail: 0,
895        // We know the data will start after 1 page size so we can set this now.
896        data_offset: page_size,
897        data_size: buffer_size - page_size,
898        aux_head: 0,
899        aux_tail: 0,
900        aux_offset: 0,
901        aux_size: 0,
902    };
903    let vmo = zx::Vmo::from(vmo_handle_ref.duplicate_handle(zx::Rights::SAME_RIGHTS).unwrap());
904
905    // Create a SeqLock and safely initialize the `header` and `value` for it.
906    // SeqLock is formatted thusly:
907    //   header_struct : any size, params `version` and `compat_version` should not change
908    //   sequence_counter : u32, this is the lock and should increment
909    //   value_struct : any size, each param can change
910    //
911    // SAFETY: See safety requirements on `create_seq_lock`.
912    unsafe {
913        SeqLock::new_from_vmo(metadata_header, metadata_value, vmo)
914            .expect("failed to create seq_lock for perf metadata")
915    }
916}
917
918pub fn sys_perf_event_open(
919    locked: &mut Locked<Unlocked>,
920    current_task: &CurrentTask,
921    attr: UserRef<perf_event_attr>,
922    // Note that this is pid in Linux docs.
923    tid: tid_t,
924    cpu: i32,
925    group_fd: FdNumber,
926    _flags: u64,
927) -> Result<SyscallResult, Errno> {
928    // So far, the implementation only sets the read_data_format according to the "Reading results"
929    // section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html for a single event.
930    // Other features will be added in the future (see below track_stubs).
931    let perf_event_attrs: perf_event_attr = current_task.read_object(attr)?;
932
933    if tid == -1 && cpu == -1 {
934        return error!(EINVAL);
935    }
936
937    let target_task_type = match tid {
938        -1 => TargetTaskType::AllTasks,
939        0 => TargetTaskType::CurrentTask,
940        _ => {
941            track_stub!(TODO("https://fxbug.dev/409621963"), "[perf_event_open] implement tid > 0");
942            return error!(ENOSYS);
943        }
944    };
945    security::check_perf_event_open_access(
946        current_task,
947        target_task_type,
948        &perf_event_attrs,
949        perf_event_attrs.type_.try_into()?,
950    )?;
951
952    // Channel used to send info between notifier and spawned task thread.
953    // We somewhat arbitrarily picked 8 for now in case we get a bunch of ioctls that are in
954    // quick succession (instead of something lower).
955    let (sender, mut receiver) = future_mpsc::channel(8);
956
957    let mut perf_event_file = PerfEventFileState::new(
958        perf_event_attrs,
959        0,
960        perf_event_attrs.disabled(),
961        perf_event_attrs.sample_type,
962        zx::Vmo::create(ESTIMATED_MMAP_BUFFER_SIZE).unwrap(),
963        sender,
964    );
965
966    let read_format = perf_event_attrs.read_format;
967
968    if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0
969        || (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0
970    {
971        // Only keep track of most_recent_enabled_time if we are currently in ENABLED state,
972        // as otherwise this param shouldn't be used for calculating anything.
973        if perf_event_file.disabled == 0 {
974            perf_event_file.most_recent_enabled_time =
975                zx::MonotonicInstant::get().into_nanos() as u64;
976        }
977        // Initialize this to 0 as we will need to return a time duration later during read().
978        perf_event_file.total_time_running = 0;
979    }
980
981    let event_id = READ_FORMAT_ID_GENERATOR.fetch_add(1, Ordering::Relaxed);
982    perf_event_file.rf_id = event_id;
983
984    if group_fd.raw() == -1 {
985        perf_event_file.sample_id = event_id;
986    } else {
987        let group_file = current_task.get_file(group_fd)?;
988        let group_file_object_id = group_file.id;
989        let perf_state = get_perf_state(&current_task.kernel);
990        let events = perf_state.format_id_lookup_table.lock();
991        if let Some(rf_id) = events.get(&group_file_object_id) {
992            perf_event_file.sample_id = *rf_id;
993        } else {
994            return error!(EINVAL);
995        }
996    }
997
998    if (read_format & perf_event_read_format_PERF_FORMAT_GROUP as u64) != 0 {
999        track_stub!(
1000            TODO("https://fxbug.dev/402238049"),
1001            "[perf_event_open] implement read_format group"
1002        );
1003        return error!(ENOSYS);
1004    }
1005    if (read_format & perf_event_read_format_PERF_FORMAT_LOST as u64) != 0 {
1006        track_stub!(
1007            TODO("https://fxbug.dev/402260383"),
1008            "[perf_event_open] implement read_format lost"
1009        );
1010    }
1011
1012    // Set up notifier for handling ioctl calls to enable/disable sampling.
1013    let mut vmo_handle_copy =
1014        perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS);
1015
1016    // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
1017    // This is always sound regardless of the union's tag.
1018    let sample_period_in_ticks = unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period };
1019    // The sample period from the PERF_COUNT_SW_CPU_CLOCK is
1020    // 1 nanosecond per tick. Convert this duration into zx::duration.
1021    let zx_sample_period = zx::MonotonicDuration::from_nanos(sample_period_in_ticks as i64);
1022
1023    // SeqLock does not get instantiated with metadata values until mmap() is called.
1024    let seq_lock =
1025        Arc::new(OnceLock::<Result<SeqLock<PerfMetadataHeader, PerfMetadataValue>, Errno>>::new());
1026    let cloned_seq_lock = Arc::clone(&seq_lock);
1027    let mut vmo_write_offset = 0;
1028
1029    let closure = async move |_: LockedAndTask<'_>| {
1030        let mut profiler_state: Option<(profiler::SessionProxy, fidl::AsyncSocket)> = None;
1031
1032        // This loop will wait for messages from the sender.
1033        while let Some((command, profiling_complete_receiver)) = receiver.next().await {
1034            match command {
1035                IoctlOp::Enable => {
1036                    match set_up_profiler(zx_sample_period).await {
1037                        Ok((session_proxy, client)) => {
1038                            let start_request = profiler::SessionStartRequest {
1039                                buffer_results: Some(true),
1040                                buffer_size_mb: Some(8 as u64),
1041                                ..Default::default()
1042                            };
1043                            if let Err(e) = session_proxy.start(&start_request).await {
1044                                log_warn!("Failed to start profiling: {}", e);
1045                            } else {
1046                                profiler_state = Some((session_proxy, client));
1047                            }
1048                        }
1049                        Err(e) => {
1050                            log_warn!("Failed to profile: {}", e);
1051                        }
1052                    };
1053                    // Send notification anyway to unblock the ioctl caller.
1054                    let _ = profiling_complete_receiver.send(());
1055                }
1056                IoctlOp::Disable => {
1057                    if let Some((session_proxy, client)) = profiler_state.take() {
1058                        let handle = vmo_handle_copy
1059                            .as_mut()
1060                            .expect("Failed to get VMO handle")
1061                            .as_handle_ref()
1062                            .duplicate(zx::Rights::SAME_RIGHTS)
1063                            .unwrap();
1064
1065                        if let Err(e) = stop_and_collect_samples(
1066                            session_proxy,
1067                            client,
1068                            &cloned_seq_lock,
1069                            &zx::Vmo::from(handle),
1070                            perf_event_file.sample_type,
1071                            perf_event_file.sample_id,
1072                            sample_period_in_ticks,
1073                            &mut vmo_write_offset,
1074                        )
1075                        .await
1076                        {
1077                            log_warn!("Failed to collect sample: {:?}", e);
1078                        }
1079                    }
1080                    // Send notification anyway to unblock the ioctl caller.
1081                    let _ = profiling_complete_receiver.send(());
1082                }
1083            }
1084        }
1085        ()
1086    };
1087    let req = SpawnRequestBuilder::new()
1088        .with_debug_name("perf-event-sampler")
1089        .with_async_closure(closure)
1090        .build();
1091    current_task.kernel().kthreads.spawner().spawn_from_request(req);
1092
1093    let file = Box::new(PerfEventFile {
1094        _tid: tid,
1095        _cpu: cpu,
1096        perf_event_file: RwLock::new(perf_event_file),
1097        security_state: security::perf_event_alloc(current_task),
1098        seq_lock: seq_lock,
1099    });
1100    // TODO: https://fxbug.dev/404739824 - Confirm whether to handle this as a "private" node.
1101    let file_handle =
1102        Anon::new_private_file(locked, current_task, file, OpenFlags::RDWR, "[perf_event]");
1103    let file_object_id = file_handle.id;
1104    let file_descriptor: Result<FdNumber, Errno> =
1105        current_task.add_file(locked, file_handle, FdFlags::empty());
1106
1107    match file_descriptor {
1108        Ok(fd) => {
1109            if group_fd.raw() == -1 {
1110                let perf_state = get_perf_state(&current_task.kernel);
1111                let mut events = perf_state.format_id_lookup_table.lock();
1112                events.insert(file_object_id, event_id);
1113            }
1114            Ok(fd.into())
1115        }
1116        Err(_) => {
1117            track_stub!(
1118                TODO("https://fxbug.dev/402453955"),
1119                "[perf_event_open] implement remaining error handling"
1120            );
1121            error!(EMFILE)
1122        }
1123    }
1124}
// Syscalls for arch32 usage.
// On aarch64 only, re-export `sys_perf_event_open` under the
// `sys_arch32_perf_event_open` name so 32-bit callers share the same
// implementation.
#[cfg(target_arch = "aarch64")]
mod arch32 {
    pub use super::sys_perf_event_open as sys_arch32_perf_event_open;
}

#[cfg(target_arch = "aarch64")]
pub use arch32::*;
1133
1134use crate::mm::memory::MemoryObject;
1135use crate::mm::{MemoryAccessorExt, ProtectionFlags};
1136use crate::task::CurrentTask;
1137use crate::vfs::{
1138    Anon, FdFlags, FdNumber, FileObject, FileObjectId, FileObjectState, FileOps, InputBuffer,
1139    OutputBuffer,
1140};
1141use crate::{fileops_impl_nonseekable, fileops_impl_noop_sync};