Skip to main content

starnix_core/perf/
mod.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::task::dynamic_thread_spawner::SpawnRequestBuilder;
6use anyhow::Context;
7use fidl_fuchsia_cpu_profiler as profiler;
8use fuchsia_component::client::connect_to_protocol;
9use fuchsia_runtime;
10use futures::StreamExt;
11use futures::channel::mpsc as future_mpsc;
12use regex_lite::Regex;
13use std::collections::HashMap;
14use std::error::Error;
15use std::sync::atomic::{AtomicU64, Ordering};
16use std::sync::{Arc, OnceLock, mpsc as sync_mpsc};
17use zerocopy::{Immutable, IntoBytes};
18
19use futures::io::{AsyncReadExt, Cursor};
20use fxt::TraceRecord;
21use fxt::profiler::ProfilerRecord;
22use fxt::session::SessionParser;
23use seq_lock::{SeqLock, SeqLockable, WriteSize};
24use starnix_logging::{log_info, log_warn, track_stub};
25use starnix_sync::{FileOpsCore, Locked, Mutex, RwLock, Unlocked};
26use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
27use starnix_uapi::arch32::{
28    PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE, PERF_EVENT_IOC_ID,
29    PERF_EVENT_IOC_MODIFY_ATTRIBUTES, PERF_EVENT_IOC_PAUSE_OUTPUT, PERF_EVENT_IOC_PERIOD,
30    PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_REFRESH, PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_SET_BPF,
31    PERF_EVENT_IOC_SET_FILTER, PERF_EVENT_IOC_SET_OUTPUT, PERF_RECORD_MISC_KERNEL,
32    perf_event_sample_format_PERF_SAMPLE_CALLCHAIN, perf_event_sample_format_PERF_SAMPLE_ID,
33    perf_event_sample_format_PERF_SAMPLE_IDENTIFIER, perf_event_sample_format_PERF_SAMPLE_IP,
34    perf_event_sample_format_PERF_SAMPLE_PERIOD, perf_event_sample_format_PERF_SAMPLE_TID,
35    perf_event_type_PERF_RECORD_SAMPLE,
36};
37use starnix_uapi::errors::Errno;
38use starnix_uapi::open_flags::OpenFlags;
39use starnix_uapi::user_address::UserRef;
40use starnix_uapi::{
41    errno, error, from_status_like_fdio, perf_event_attr, perf_event_header,
42    perf_event_mmap_page__bindgen_ty_1, perf_event_read_format_PERF_FORMAT_GROUP,
43    perf_event_read_format_PERF_FORMAT_ID, perf_event_read_format_PERF_FORMAT_LOST,
44    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED,
45    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING, tid_t, uapi,
46};
47
48use crate::security::{self, TargetTaskType};
49use crate::task::{Kernel, LockedAndTask};
50
// Process-wide monotonically constructed counter, starting at 0.
// NOTE(review): presumably used to mint the unique read_format IDs (`rf_id`) handed out to
// perf events; its users are not visible in this part of the file - confirm.
static READ_FORMAT_ID_GENERATOR: AtomicU64 = AtomicU64::new(0);
// Default size (in bytes) of the temporary buffer used to read sampling data from the socket.
const DEFAULT_CHUNK_SIZE: usize = 4096;
// 4096 * 10, page size * 10.
// If tests flake due to running out of buffer space, or if the profiling duration is
// significantly increased, this buffer size may need further adjustment (expansion).
const ESTIMATED_MMAP_BUFFER_SIZE: u64 = 40960;
// FXT magic bytes (little endian). When the first 8 bytes read from the profiler socket equal
// this value the stream is parsed as FXT; otherwise it is treated as text.
const FXT_MAGIC_BYTES: [u8; 8] = [0x10, 0x00, 0x04, 0x46, 0x78, 0x54, 0x16, 0x00];
60
61mod event;
62pub use event::{TraceEvent, TraceEventQueue, TraceEventQueueList};
63
64pub mod lockless_ring_buffer;
65
// Header half of the metadata published through the `SeqLock` held by `PerfEventFile`.
// NOTE(review): `version` / `compat_version` presumably mirror the leading fields of Linux's
// `perf_event_mmap_page` - confirm against the uapi definition.
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataHeader {
    version: u32,
    compat_version: u32,
}
72
// Value half of the metadata published through the `SeqLock` held by `PerfEventFile`.
// The sampling code updates `data_head` after each record it writes to the VMO.
// NOTE(review): the field names and order appear to follow Linux's `perf_event_mmap_page`
// (everything after its version/compat_version header) - confirm against the uapi definition
// before reordering or resizing anything here.
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataValue {
    // Sequence word; see the `SeqLockable` impl (HAS_INLINE_SEQUENCE).
    lock: u32,
    index: u32,
    offset: i64,
    time_enabled: u64,
    time_running: u64,
    __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1,
    pmc_width: u16,
    time_shift: u16,
    time_mult: u32,
    time_offset: u64,
    time_zero: u64,
    size: u32,
    __reserved_1: u32,
    time_cycles: u64,
    time_mask: u64,
    // Padding before the ring-buffer bookkeeping fields below.
    __reserved: [u8; 928usize],
    // Offset of the end of the most recently written record within the data area.
    data_head: u64,
    data_tail: u64,
    data_offset: u64,
    data_size: u64,
    aux_head: u64,
    aux_tail: u64,
    aux_offset: u64,
    aux_size: u64,
}
101
// SAFETY: `PerfMetadataValue` can be safely written to shared memory in 8-byte chunks.
// The struct is `#[repr(C)]` and starts with two u32s (`lock`, `index`). Its other
// sub-8-byte fields come in groups that add up to 8 bytes (`pmc_width` + `time_shift` +
// `time_mult`, and `size` + `__reserved_1`), `__reserved` is 928 bytes (a multiple of 8), and
// every remaining named integer field is 64 bits wide.
// NOTE(review): this assumes `perf_event_mmap_page__bindgen_ty_1` is 8 bytes wide - confirm.
// The first u32 is the `lock` field, which is why HAS_INLINE_SEQUENCE is true.
unsafe impl SeqLockable for PerfMetadataValue {
    const WRITE_SIZE: WriteSize = WriteSize::Eight;
    const HAS_INLINE_SEQUENCE: bool = true;
    const VMO_NAME: &'static [u8] = b"starnix:perf_event";
}
110
// Kernel-wide perf bookkeeping. A single instance is created lazily and stored in the
// kernel's `expando` (see `get_perf_state`).
struct PerfState {
    // This table maps a group leader's file object id to its unique u64 "format ID".
    //
    // When a sample is generated for any event in a group, we use this
    // "format ID" from the group leader as the value for *both* the
    // `PERF_SAMPLE_ID` and `PERF_SAMPLE_IDENTIFIER` fields.
    //
    // Entries are removed when the corresponding file is closed.
    format_id_lookup_table: Mutex<HashMap<FileObjectId, u64>>,
}
119
120impl Default for PerfState {
121    fn default() -> Self {
122        Self { format_id_lookup_table: Mutex::new(HashMap::new()) }
123    }
124}
125
126fn get_perf_state(kernel: &Arc<Kernel>) -> Arc<PerfState> {
127    kernel.expando.get_or_init(PerfState::default)
128}
129
// Runs `check_arch_independent_layout!` over the fields of `perf_event_attr` listed below.
// NOTE(review): presumably a compile-time assertion that the struct has the same layout on
// every supported architecture (this file reads the ioctl constants from `arch32`) - confirm
// against the macro definition. Keep the field list in sync with the uapi struct.
uapi::check_arch_independent_layout! {
    perf_event_attr {
        type_, // "type" is a reserved keyword so add a trailing underscore.
        size,
        config,
        __bindgen_anon_1,
        sample_type,
        read_format,
        _bitfield_1,
        __bindgen_anon_2,
        bp_type,
        __bindgen_anon_3,
        __bindgen_anon_4,
        branch_sample_type,
        sample_regs_user,
        sample_stack_user,
        clockid,
        sample_regs_intr,
        aux_watermark,
        sample_max_stack,
        __reserved_2,
        aux_sample_size,
        __reserved_3,
        sig_data,
        config3,
    }
}
157
// Commands sent over `PerfEventFileState::ioctl_sender` when an ENABLE or DISABLE ioctl is
// received for a sampling event, to start or stop sampling.
#[derive(Clone, Copy, Debug, PartialEq)]
enum IoctlOp {
    // Sent on PERF_EVENT_IOC_ENABLE.
    Enable,
    // Sent on PERF_EVENT_IOC_DISABLE.
    Disable,
}
163
// Mutable state of one perf event file, guarded by the `RwLock` in `PerfEventFile`.
struct PerfEventFileState {
    // The attributes the event was opened with.
    attr: perf_event_attr,
    rf_value: u64, // "count" for the config we passed in for the event.
    // The most recent timestamp (ns) where we changed into an enabled state
    // i.e. the most recent time we got an ENABLE ioctl().
    most_recent_enabled_time: u64,
    // Sum of all previous enablement segment durations (ns). If we are
    // currently in an enabled state, explicitly does NOT include the current
    // segment.
    total_time_running: u64,
    // ID reported by read() when PERF_FORMAT_ID is requested.
    rf_id: u64,
    // NOTE(review): presumably the ID stamped into sample records; it is only initialized
    // (to 0) in the visible part of the file - confirm.
    sample_id: u64,
    _rf_lost: u64,
    // 0 means the event is enabled; any other value means it is disabled.
    disabled: u64,
    // NOTE(review): presumably a copy of the PERF_SAMPLE_* bitmask from `attr` - confirm.
    sample_type: u64,
    // Handle to blob that stores all the perf data that a user may want.
    // At the moment it only stores some metadata and backtraces (bts).
    perf_data_vmo: zx::Vmo,
    // Channel used to send IoctlOps to start/stop sampling.
    ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
}
185
186// Have an implementation for PerfEventFileState because VMO
187// doesn't have Default so we can't derive it.
188impl PerfEventFileState {
189    fn new(
190        attr: perf_event_attr,
191        rf_value: u64,
192        disabled: u64,
193        sample_type: u64,
194        perf_data_vmo: zx::Vmo,
195        ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
196    ) -> PerfEventFileState {
197        PerfEventFileState {
198            attr,
199            rf_value,
200            most_recent_enabled_time: 0,
201            total_time_running: 0,
202            rf_id: 0,
203            sample_id: 0,
204            _rf_lost: 0,
205            disabled,
206            sample_type,
207            perf_data_vmo,
208            ioctl_sender,
209        }
210    }
211}
212
// File object backing a perf event file descriptor (see the `FileOps` impl below).
pub struct PerfEventFile {
    // Currently unused (hence the leading underscore).
    _tid: tid_t,
    // Currently unused (hence the leading underscore).
    _cpu: i32,
    // Mutable per-event state; read(), ioctl() and get_memory() all go through this lock.
    perf_event_file: RwLock<PerfEventFileState>,
    // The security state for this PerfEventFile.
    pub security_state: security::PerfEventState,
    // Seq lock used to publish metadata into the perf data VMO. It is initialized lazily by
    // the first get_memory() call; the stored `Result` remembers an initialization failure so
    // that later users can propagate it.
    seq_lock: Arc<OnceLock<Result<SeqLock<PerfMetadataHeader, PerfMetadataValue>, Errno>>>,
}
221
222// PerfEventFile object that implements FileOps.
223// See https://man7.org/linux/man-pages/man2/perf_event_open.2.html for
224// implementation details.
225// This object can be saved as a FileDescriptor.
226impl FileOps for PerfEventFile {
227    // Don't need to implement seek or sync for PerfEventFile.
228    fileops_impl_nonseekable!();
229    fileops_impl_noop_sync!();
230
231    fn close(
232        self: Box<Self>,
233        _locked: &mut Locked<FileOpsCore>,
234        file: &FileObjectState,
235        current_task: &CurrentTask,
236    ) {
237        let perf_state = get_perf_state(&current_task.kernel);
238        let mut events = perf_state.format_id_lookup_table.lock();
239        events.remove(&file.id);
240    }
241
242    // See "Reading results" section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html.
243    fn read(
244        &self,
245        _locked: &mut Locked<FileOpsCore>,
246        _file: &FileObject,
247        current_task: &CurrentTask,
248        _offset: usize,
249        data: &mut dyn OutputBuffer,
250    ) -> Result<usize, Errno> {
251        // Create/calculate and return the ReadFormatData object.
252        // If we create it earlier we might want to change it and it's immutable once created.
253        let read_format_data = {
254            // Once we get the `value` or count from kernel, we can change this to a read()
255            // call instead of write().
256            let mut perf_event_file = self.perf_event_file.write();
257
258            security::check_perf_event_read_access(current_task, &self)?;
259
260            let mut total_time_running_including_curr = perf_event_file.total_time_running;
261
262            // Only update values if enabled (either by perf_event_attr or ioctl ENABLE call).
263            if perf_event_file.disabled == 0 {
264                // Calculate the value or "count" of the config we're interested in.
265                // This value should reflect the value we are counting (defined in the config).
266                // E.g. for PERF_COUNT_SW_CPU_CLOCK it would return the value from the CPU clock.
267                // For now we just return rf_value + 1.
268                track_stub!(
269                    TODO("https://fxbug.dev/402938671"),
270                    "[perf_event_open] implement read_format value"
271                );
272                perf_event_file.rf_value += 1;
273
274                // Update time duration.
275                let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
276                total_time_running_including_curr +=
277                    curr_time - perf_event_file.most_recent_enabled_time;
278            }
279
280            let mut output = Vec::<u8>::new();
281            let value = perf_event_file.rf_value.to_ne_bytes();
282            output.extend(value);
283
284            let read_format = perf_event_file.attr.read_format;
285
286            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 {
287                // Total time (ns) event was enabled and running (currently same as TIME_RUNNING).
288                output.extend(total_time_running_including_curr.to_ne_bytes());
289            }
290            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 {
291                // Total time (ns) event was enabled and running (currently same as TIME_ENABLED).
292                output.extend(total_time_running_including_curr.to_ne_bytes());
293            }
294            if (read_format & perf_event_read_format_PERF_FORMAT_ID as u64) != 0 {
295                // Adds a 64-bit unique value that corresponds to the event group.
296                output.extend(perf_event_file.rf_id.to_ne_bytes());
297            }
298
299            output
300        };
301
302        // The regular read() call allows the case where the bytes-we-want-to-read-in won't
303        // fit in the output buffer. However, for perf_event_open's read(), "If you attempt to read
304        // into a buffer that is not big enough to hold the data, the error ENOSPC results."
305        if data.available() < read_format_data.len() {
306            return error!(ENOSPC);
307        }
308        track_stub!(
309            TODO("https://fxbug.dev/402453955"),
310            "[perf_event_open] implement remaining error handling"
311        );
312
313        data.write(&read_format_data)
314    }
315
316    fn ioctl(
317        &self,
318        _locked: &mut Locked<Unlocked>,
319        _file: &FileObject,
320        current_task: &CurrentTask,
321        op: u32,
322        _arg: SyscallArg,
323    ) -> Result<SyscallResult, Errno> {
324        track_stub!(
325            TODO("https://fxbug.dev/405463320"),
326            "[perf_event_open] implement PERF_IOC_FLAG_GROUP"
327        );
328        security::check_perf_event_write_access(current_task, &self)?;
329        let mut perf_event_file = self.perf_event_file.write();
330        match op {
331            PERF_EVENT_IOC_ENABLE => {
332                if perf_event_file.disabled != 0 {
333                    perf_event_file.disabled = 0; // 0 = false.
334                    perf_event_file.most_recent_enabled_time =
335                        zx::MonotonicInstant::get().into_nanos() as u64;
336                }
337
338                // If we are sampling, invoke the profiler and collect a sample.
339                // Currently this is an example sample collection.
340                track_stub!(
341                    TODO("https://fxbug.dev/398914921"),
342                    "[perf_event_open] implement full sampling features"
343                );
344                if perf_event_file.attr.freq() == 0
345                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
346                // This is always sound regardless of the union's tag.
347                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
348                {
349                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Enable);
350                }
351                return Ok(SUCCESS);
352            }
353            PERF_EVENT_IOC_DISABLE => {
354                if perf_event_file.disabled == 0 {
355                    perf_event_file.disabled = 1; // 1 = true.
356
357                    // Update total_time_running now that the segment has ended.
358                    let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
359                    perf_event_file.total_time_running +=
360                        curr_time - perf_event_file.most_recent_enabled_time;
361                }
362                if perf_event_file.attr.freq() == 0
363                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
364                // This is always sound regardless of the union's tag.
365                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
366                {
367                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Disable);
368                }
369                return Ok(SUCCESS);
370            }
371            PERF_EVENT_IOC_RESET => {
372                perf_event_file.rf_value = 0;
373                return Ok(SUCCESS);
374            }
375            PERF_EVENT_IOC_REFRESH
376            | PERF_EVENT_IOC_PERIOD
377            | PERF_EVENT_IOC_SET_OUTPUT
378            | PERF_EVENT_IOC_SET_FILTER
379            | PERF_EVENT_IOC_ID
380            | PERF_EVENT_IOC_SET_BPF
381            | PERF_EVENT_IOC_PAUSE_OUTPUT
382            | PERF_EVENT_IOC_MODIFY_ATTRIBUTES
383            | PERF_EVENT_IOC_QUERY_BPF => {
384                track_stub!(
385                    TODO("https://fxbug.dev/404941053"),
386                    "[perf_event_open] implement remaining ioctl() calls"
387                );
388                return error!(ENOSYS);
389            }
390            _ => error!(ENOTTY),
391        }
392    }
393
394    // TODO(https://fxbug.dev/460245383) match behavior when mmap() is called multiple times.
395    // Gets called when mmap() is called.
396    // Immediately before sampling, this should get called by the user (e.g. the test
397    // or Perfetto). We will then write the metadata to the VMO and return the pointer to it.
398    fn get_memory(
399        &self,
400        _locked: &mut Locked<FileOpsCore>,
401        _file: &FileObject,
402        current_task: &CurrentTask,
403        length: Option<usize>,
404        _prot: ProtectionFlags,
405    ) -> Result<Arc<MemoryObject>, Errno> {
406        let buffer_size: u64 = length.unwrap_or(0) as u64;
407        if buffer_size == 0 {
408            return error!(EINVAL);
409        }
410
411        self.seq_lock
412            .get_or_init(|| {
413                let perf_event_file = self.perf_event_file.read();
414                let vmo_copy = perf_event_file
415                    .perf_data_vmo
416                    .as_handle_ref()
417                    .duplicate_handle(zx::Rights::SAME_RIGHTS)
418                    .map_err(|status| from_status_like_fdio!(status))?;
419                // SAFETY: See safety requirements on `create_seq_lock`.
420                Ok(unsafe { create_seq_lock(&vmo_copy, buffer_size) })
421            })
422            .as_ref()
423            .map_err(|e| e.clone())?;
424
425        // Write to a MemoryObject and return it (expected return type for get_memory()).
426        security::check_perf_event_read_access(current_task, &self)?;
427        let perf_event_file = self.perf_event_file.read();
428        match perf_event_file
429            .perf_data_vmo
430            .as_handle_ref()
431            .duplicate_handle(zx::Rights::SAME_RIGHTS)
432        {
433            Ok(vmo) => {
434                let vmo: zx::Vmo = vmo.into();
435                let memory = MemoryObject::from(vmo);
436                return Ok(Arc::new(memory));
437            }
438            Err(_) => {
439                track_stub!(
440                    TODO("https://fxbug.dev/416323134"),
441                    "[perf_event_open] handle get_memory() errors"
442                );
443                return error!(EINVAL);
444            }
445        };
446    }
447
    // write() is not supported on perf event files yet: every call records the stub and fails
    // with ENOSYS, ignoring all of its arguments.
    fn write(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        _offset: usize,
        _data: &mut dyn InputBuffer,
    ) -> Result<usize, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/394960158"),
            "[perf_event_open] implement perf event functions"
        );
        error!(ENOSYS)
    }
462}
463
464// Given a PerfRecordSample struct, write it via the correct output format
465// (per https://man7.org/linux/man-pages/man2/perf_event_open.2.html) to the VMO.
466// We don't currently support all the sample_types listed in the docs.
467// Input:
468//    PerfRecordSample { pid: 5, tid: 10, nr: 3, ips[nr]: [111, 222, 333] }
469// Human-understandable output:
470//    9 1 40 111 5 10 3 111 222 333
471// Actual output (no spaces or \n in real output, just making it more readable):
472//    0x0000 0x0009                 <-- starts at `offset` bytes
473//    0x0001
474//    0x0040
475//    0x0000 0x0000 0x0000 0x006F   <-- starts at `offset` + 8 bytes
476//    0x0000 0x0000 0x0000 0x0005
477//    0x0000 0x0000 0x0000 0x0010
478//    0x0000 0x0000 0x0000 0x0003
479//    0x0000 0x0000 0x0000 0x006F
480//    0x0000 0x0000 0x0000 0x00DE
481//    0x0000 0x0000 0x0000 0x014D
482//
483//    Returns the length of bytes written. In above case, 8 + 28 = 36.
484//    This information is used to increment the global offset.
485fn write_record_to_vmo(
486    perf_record_sample: PerfRecordSample,
487    perf_data_vmo: &zx::Vmo,
488    sample_type: u64,
489    sample_id: u64,
490    sample_period: u64,
491    offset: u64,
492) -> u64 {
493    // First, build record to determine its size (so that we can fill out `size` in header).
494    let mut sample = Vec::<u8>::new();
495    // sample_id
496    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IDENTIFIER as u64) != 0 {
497        sample.extend(sample_id.to_ne_bytes());
498    }
499    // ip
500    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IP as u64) != 0 {
501        sample.extend(perf_record_sample.ips[0].to_ne_bytes());
502    }
503
504    if (sample_type & perf_event_sample_format_PERF_SAMPLE_TID as u64) != 0 {
505        // pid
506        sample.extend(perf_record_sample.pid.expect("missing pid").to_ne_bytes());
507        // tid
508        sample.extend(perf_record_sample.tid.expect("missing tid").to_ne_bytes());
509    }
510
511    // id
512    if (sample_type & perf_event_sample_format_PERF_SAMPLE_ID as u64) != 0 {
513        sample.extend(sample_id.to_ne_bytes());
514    }
515
516    // sample period
517    if (sample_type & perf_event_sample_format_PERF_SAMPLE_PERIOD as u64) != 0 {
518        sample.extend(sample_period.to_ne_bytes());
519    }
520
521    if (sample_type & perf_event_sample_format_PERF_SAMPLE_CALLCHAIN as u64) != 0 {
522        // nr
523        sample.extend(perf_record_sample.ips.len().to_ne_bytes());
524
525        // ips[nr] - list of ips, u64 per ip.
526        for i in perf_record_sample.ips {
527            sample.extend(i.to_ne_bytes());
528        }
529    }
530    // The remaining data are not defined for now.
531
532    // Now that we know the sample size, we can calculate the record size.
533    // record_size = perf_event_header_size + sample_size.
534    // perf_event_header is defined to be 8 bytes.
535    let record_size: u64 = (std::mem::size_of::<perf_event_header>() + sample.len()) as u64;
536
537    track_stub!(
538        TODO("https://fxbug.dev/432501467"),
539        "[perf_event_open] determines whether the record is KERNEL or USER"
540    );
541    let perf_event_header = perf_event_header {
542        type_: perf_event_type_PERF_RECORD_SAMPLE,
543        misc: PERF_RECORD_MISC_KERNEL as u16,
544        size: record_size as u16,
545    };
546
547    // Total data offset. This is where the record should start getting written.
548    // The first page is reserved for metadata, so we need to add the page size.
549    // Example:
550    //  You're writing the first record (size 100). Start writing at 0 + 4096.
551    //  You're writing the second record. Start writing at 100 + 4096.
552    let data_offset = offset + (zx::system_get_page_size() as u64);
553
554    // Write header to memory.
555    match perf_data_vmo.write(&perf_event_header.as_bytes(), data_offset) {
556        Ok(_) => (),
557        Err(e) => log_warn!("Failed to write perf_event_header: {}", e),
558    }
559
560    // Write sample to memory immediately after the header.
561    match perf_data_vmo
562        .write(&sample, data_offset + (std::mem::size_of::<perf_event_header>() as u64))
563    {
564        Ok(_) => {
565            // Return the total size we wrote (header + sample) so that we can
566            // increment offset counter.
567            return record_size;
568        }
569        Err(e) => {
570            log_warn!("Failed to write PerfRecordSample to VMO due to: {}", e);
571            // Failed to write. Don't increment offset counter.
572            return 0;
573        }
574    }
575}
576
// One profiler sample, in the form needed to emit a PERF_RECORD_SAMPLE record.
#[derive(Debug, Clone)]
struct PerfRecordSample {
    // Process id of the sampled thread. Required when PERF_SAMPLE_TID is requested.
    pid: Option<u32>,
    // Thread id of the sampled thread. Required when PERF_SAMPLE_TID is requested.
    tid: Option<u32>,
    // Instruction pointers (currently this is the address). First one is `ip` param.
    ips: Vec<u64>,
}
584
585// Parses a backtrace (bt) to obtain the params for a PerfRecordSample. Example:
586//
587// 1234                     pid
588// 5555                     tid
589// {{{bt:0:0x1111:pc}}}    {{{bt:frame_number:address:type}}}
590// {{{bt:1:0x2222:ra}}}
591// {{{bt:2:0x3333:ra}}}
592//
593// Results in:
594// PerfRecordSample { pid: 1234, tid: 5555, nr: 3, ips: [0x1111, 0x2222, 0x3333] }
595
596fn parse_perf_record_sample_format(backtrace: &str) -> Option<PerfRecordSample> {
597    let mut pid: Option<u32> = None;
598    let mut tid: Option<u32> = None;
599    let mut ips: Vec<u64> = Vec::new();
600    let mut numbers_found = 0;
601    track_stub!(TODO("https://fxbug.dev/437171287"), "[perf_event_open] handle regex nuances");
602    let backtrace_regex =
603        Regex::new(r"^\s*\{\{\{bt:\d+:((0x[0-9a-fA-F]+)):(?:pc|ra)\}\}\}\s*$").unwrap();
604
605    for line in backtrace.lines() {
606        let trimmed_line = line.trim();
607        // Try to parse as a raw number (for PID/TID).
608        if numbers_found < 2 {
609            if let Ok(num) = trimmed_line.parse::<u32>() {
610                if numbers_found == 0 {
611                    pid = Some(num);
612                } else {
613                    tid = Some(num);
614                }
615                numbers_found += 1;
616                continue;
617            }
618        }
619
620        // Try to parse as a backtrace line.
621        if let Some(parsed_bt) = backtrace_regex.captures(trimmed_line) {
622            let address_str = parsed_bt.get(1).unwrap().as_str();
623            if let Ok(ip_addr) = u64::from_str_radix(address_str.trim_start_matches("0x"), 16) {
624                ips.push(ip_addr);
625            }
626        }
627    }
628
629    if pid == None || tid == None || ips.is_empty() {
630        // This data chunk might've been an {{{mmap}}} chunk, and not a {{{bt}}}.
631        log_info!("No ips while getting PerfRecordSample");
632        None
633    } else {
634        Some(PerfRecordSample { pid: pid, tid: tid, ips: ips })
635    }
636}
637
638async fn set_up_profiler(
639    sample_period: zx::MonotonicDuration,
640) -> Result<(profiler::SessionProxy, fidl::AsyncSocket), Errno> {
641    // Configuration for how we want to sample.
642    let sample = profiler::Sample {
643        callgraph: Some(profiler::CallgraphConfig {
644            strategy: Some(profiler::CallgraphStrategy::FramePointer),
645            ..Default::default()
646        }),
647        ..Default::default()
648    };
649
650    let sampling_config = profiler::SamplingConfig {
651        period: Some(sample_period.into_nanos() as u64),
652        timebase: Some(profiler::Counter::PlatformIndependent(profiler::CounterId::Nanoseconds)),
653        sample: Some(sample),
654        ..Default::default()
655    };
656
657    track_stub!(
658        TODO("https://fxbug.dev/398914921"),
659        "[perf_event_open] allow for profiling system-wide not during tests"
660    );
661    let job = fuchsia_runtime::job_default();
662    let koid = job.koid().map_err(|e| errno!(EINVAL, e.to_string()))?;
663    let tasks = vec![
664        // Should return ~1300 samples for 1000 millis.
665        profiler::Task::Job(koid.raw_koid()),
666    ];
667    let targets = profiler::TargetConfig::Tasks(tasks);
668    let config = profiler::Config {
669        configs: Some(vec![sampling_config]),
670        target: Some(targets),
671        ..Default::default()
672    };
673    let (client, server) = fidl::Socket::create_stream();
674    let configure = profiler::SessionConfigureRequest {
675        output: Some(server),
676        config: Some(config),
677        ..Default::default()
678    };
679
680    let proxy = connect_to_protocol::<profiler::SessionMarker>()
681        .context("Error connecting to Profiler protocol");
682    let session_proxy: profiler::SessionProxy = match proxy {
683        Ok(p) => p.clone(),
684        Err(e) => return error!(EINVAL, e),
685    };
686
687    // Must configure before sampling start().
688    let config_request = session_proxy.configure(configure).await;
689    match config_request {
690        Ok(_) => Ok((session_proxy, fidl::AsyncSocket::from_socket(client))),
691        Err(e) => return error!(EINVAL, e),
692    }
693}
694
695// Collects samples and puts backtrace in VMO.
696// - Reads in the buffer from the socket for that duration in chunks.
697// - Parses the buffer backtraces into PERF_RECORD_SAMPLE format.
698// - Writes the PERF_RECORD_SAMPLE into VMO.
699async fn stop_and_collect_samples(
700    session_proxy: profiler::SessionProxy,
701    mut client: fidl::AsyncSocket,
702    seq_lock: &OnceLock<Result<SeqLock<PerfMetadataHeader, PerfMetadataValue>, Errno>>,
703    perf_data_vmo: &zx::Vmo,
704    sample_type: u64,
705    sample_id: u64,
706    sample_period: u64,
707    vmo_write_offset: &mut u64,
708) -> Result<(), Errno> {
709    let stats = session_proxy.stop().await;
710
711    let seq_lock_wrapper = match seq_lock.get() {
712        Some(Ok(l)) => l,
713        // Initialization failed in a previous mmap() call. Propagate the error.
714        Some(Err(e)) => return Err(e.clone()),
715        // Not initialized yet (i.e. mmap() hasn't been called). Skip updating metadata.
716        None => return Ok(()),
717    };
718
719    let samples_collected = match stats {
720        Ok(stats) => stats.samples_collected.unwrap(),
721        Err(e) => return error!(EINVAL, e),
722    };
723
724    track_stub!(
725        TODO("https://fxbug.dev/422502681"),
726        "[perf_event_open] symbolize sample output and delete the below log_info"
727    );
728    log_info!("profiler samples_collected: {:?}", samples_collected);
729
730    // Peek at the first 8 bytes to determine if it's FXT or text.
731    let mut header = [0; 8];
732    let mut bytes_read = 0;
733    while bytes_read < 8 {
734        match client.read(&mut header[bytes_read..]).await {
735            Ok(0) => {
736                // Peer closed the socket. This is the normal end of the stream.
737                log_info!("[perf_event_open] Finished reading fxt record from socket.");
738                break;
739            }
740            Ok(n) => bytes_read += n,
741            Err(e) => {
742                log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
743                break;
744            }
745        }
746    }
747
748    if bytes_read > 0 {
749        if bytes_read == 8 && header == FXT_MAGIC_BYTES {
750            // FXT format.
751            let header_cursor = Cursor::new(header);
752            let reader = header_cursor.chain(client);
753            let (mut stream, _task) = SessionParser::new_async(reader);
754            while let Some(record_result) = stream.next().await {
755                match record_result {
756                    Ok(TraceRecord::Profiler(ProfilerRecord::Backtrace(backtrace))) => {
757                        let ips: Vec<u64> = backtrace.data;
758                        let pid = Some(backtrace.process.0 as u32);
759                        let tid = Some(backtrace.thread.0 as u32);
760                        let perf_record_sample = PerfRecordSample { pid, tid, ips };
761                        let bytes_written = write_record_to_vmo(
762                            perf_record_sample,
763                            perf_data_vmo,
764                            sample_type,
765                            sample_id,
766                            sample_period,
767                            *vmo_write_offset,
768                        );
769                        // Update data_head after writing sample.
770                        if bytes_written > 0 {
771                            *vmo_write_offset += bytes_written;
772                            let mut metadata = seq_lock_wrapper.get();
773                            metadata.data_head = *vmo_write_offset;
774                            seq_lock_wrapper.set_value(metadata);
775                        }
776                    }
777                    Ok(_) => {
778                        // Ignore other records.
779                    }
780                    Err(e) => {
781                        log_warn!("[perf_event_open] Error parsing FXT: {:?}", e);
782                        break;
783                    }
784                }
785            }
786        } else {
787            // Text format.
788            // Read chunks of sampling data from socket in this buffer temporarily. We will parse
789            // the data and write it into the output VMO (the one mmap points to).
790            let mut buffer = vec![0; DEFAULT_CHUNK_SIZE];
791
792            loop {
793                // Attempt to read data. This awaits until data is available, EOF, or error.
794                // Ignore the first 8 bytes as it's the {{{reset}}} marker.
795                let socket_data = client.read(&mut buffer).await;
796
797                match socket_data {
798                    Ok(0) => {
799                        // Peer closed the socket. This is the normal end of the stream.
800                        log_info!("[perf_event_open] Finished reading from socket.");
801                        break;
802                    }
803                    Ok(bytes_read) => {
804                        // Receive data in format {{{...}}}.
805                        let received_data = match std::str::from_utf8(&buffer[..bytes_read]) {
806                            Ok(data) => data,
807                            Err(e) => return error!(EINVAL, e),
808                        };
809                        // Parse data to PerfRecordSample struct.
810                        if let Some(perf_record_sample) =
811                            parse_perf_record_sample_format(received_data)
812                        {
813                            let bytes_written = write_record_to_vmo(
814                                perf_record_sample,
815                                perf_data_vmo,
816                                sample_type,
817                                sample_id,
818                                sample_period,
819                                *vmo_write_offset,
820                            );
821                            // Update data_head after writing sample.
822                            if bytes_written > 0 {
823                                *vmo_write_offset += bytes_written;
824                                let mut metadata = seq_lock_wrapper.get();
825                                metadata.data_head = *vmo_write_offset;
826                                seq_lock_wrapper.set_value(metadata);
827                            }
828                        }
829                    }
830                    Err(e) => {
831                        log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
832                        break;
833                    }
834                }
835            }
836        }
837    }
838
839    let reset_status = session_proxy.reset().await;
840    return match reset_status {
841        Ok(_) => Ok(()),
842        Err(e) => error!(EINVAL, e),
843    };
844}
845
846// Notifies other thread that we should start/stop sampling.
847// Once sampling is complete, that profiler session is no longer needed.
848// At that point, send back notification so that this is no longer blocking
849// (e.g. so that other profiler sessions can start).
850fn ping_receiver(
851    mut ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
852    command: IoctlOp,
853) {
854    log_info!("[perf_event_open] Received sampling command: {:?}", command);
855    let (profiling_complete_sender, profiling_complete_receiver) = sync_mpsc::channel::<()>();
856    match ioctl_sender.try_send((command, profiling_complete_sender)) {
857        Ok(_) => (),
858        Err(e) => {
859            if e.is_full() {
860                log_warn!("[perf_event_open] Failed to send {:?}: Channel full", command);
861            } else if e.is_disconnected() {
862                log_warn!("[perf_event_open] Failed to send {:?}: Receiver disconnected", command);
863            } else {
864                log_warn!("[perf_event_open] Failed to send {:?} due to {:?}", command, e.source());
865            }
866        }
867    };
868    // Block on / wait until profiling is complete before returning.
869    // This notifies that the profiler is free to be used for another session.
870    let _ = profiling_complete_receiver.recv().unwrap();
871}
872
873// Creates a seq lock for the given VMO. Initializes the seq lock with
874// known initial values (unknown values default to 0).
875// Does NOT actually save this as a memory object until mmap() is called.
876//
877// # Safety
878//
879// The caller must ensure that the kernel maintains exclusive write access to this VMO and
880// there are only atomic accesses to this memory (see seq_lock lib.rs for details).
881unsafe fn create_seq_lock(
882    vmo_handle_ref: &zx::NullableHandle,
883    buffer_size: u64,
884) -> SeqLock<PerfMetadataHeader, PerfMetadataValue> {
885    // Currently we hardcode everything just to get something E2E working.
886    let metadata_header = PerfMetadataHeader { version: 1, compat_version: 2 };
887    let page_size = zx::system_get_page_size() as u64;
888    let metadata_value = PerfMetadataValue {
889        lock: 0,
890        index: 3,
891        offset: 19337,
892        time_enabled: 0,
893        time_running: 0,
894        __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1 { capabilities: 30 },
895        pmc_width: 0,
896        time_shift: 0,
897        time_mult: 0,
898        time_offset: 0,
899        time_zero: 0,
900        size: 0,
901        __reserved_1: 0,
902        time_cycles: 0,
903        time_mask: 0,
904        __reserved: [0; 928usize],
905        // This first page (metadata) has finished writing. Start data_head at 0.
906        data_head: 0,
907        // Start reading from 0; it is the user's responsibility to increment on their end.
908        data_tail: 0,
909        // We know the data will start after 1 page size so we can set this now.
910        data_offset: page_size,
911        data_size: buffer_size - page_size,
912        aux_head: 0,
913        aux_tail: 0,
914        aux_offset: 0,
915        aux_size: 0,
916    };
917    let vmo = zx::Vmo::from(vmo_handle_ref.duplicate_handle(zx::Rights::SAME_RIGHTS).unwrap());
918
919    // Create a SeqLock and safely initialize the `header` and `value` for it.
920    // SeqLock is formatted thusly:
921    //   header_struct : any size, params `version` and `compat_version` should not change
922    //   sequence_counter : u32, this is the lock and should increment
923    //   value_struct : any size, each param can change
924    //
925    // SAFETY: See safety requirements on `create_seq_lock`.
926    unsafe {
927        SeqLock::new_from_vmo(metadata_header, metadata_value, vmo)
928            .expect("failed to create seq_lock for perf metadata")
929    }
930}
931
932pub fn sys_perf_event_open(
933    locked: &mut Locked<Unlocked>,
934    current_task: &CurrentTask,
935    attr: UserRef<perf_event_attr>,
936    // Note that this is pid in Linux docs.
937    tid: tid_t,
938    cpu: i32,
939    group_fd: FdNumber,
940    _flags: u64,
941) -> Result<SyscallResult, Errno> {
942    // So far, the implementation only sets the read_data_format according to the "Reading results"
943    // section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html for a single event.
944    // Other features will be added in the future (see below track_stubs).
945    let perf_event_attrs: perf_event_attr = current_task.read_object(attr)?;
946
947    if tid == -1 && cpu == -1 {
948        return error!(EINVAL);
949    }
950
951    let target_task_type = match tid {
952        -1 => TargetTaskType::AllTasks,
953        0 => TargetTaskType::CurrentTask,
954        _ => {
955            track_stub!(TODO("https://fxbug.dev/409621963"), "[perf_event_open] implement tid > 0");
956            return error!(ENOSYS);
957        }
958    };
959    security::check_perf_event_open_access(
960        current_task,
961        target_task_type,
962        &perf_event_attrs,
963        perf_event_attrs.type_.try_into()?,
964    )?;
965
966    // Channel used to send info between notifier and spawned task thread.
967    // We somewhat arbitrarily picked 8 for now in case we get a bunch of ioctls that are in
968    // quick succession (instead of something lower).
969    let (sender, mut receiver) = future_mpsc::channel(8);
970
971    let mut perf_event_file = PerfEventFileState::new(
972        perf_event_attrs,
973        0,
974        perf_event_attrs.disabled(),
975        perf_event_attrs.sample_type,
976        zx::Vmo::create(ESTIMATED_MMAP_BUFFER_SIZE).unwrap(),
977        sender,
978    );
979
980    let read_format = perf_event_attrs.read_format;
981
982    if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0
983        || (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0
984    {
985        // Only keep track of most_recent_enabled_time if we are currently in ENABLED state,
986        // as otherwise this param shouldn't be used for calculating anything.
987        if perf_event_file.disabled == 0 {
988            perf_event_file.most_recent_enabled_time =
989                zx::MonotonicInstant::get().into_nanos() as u64;
990        }
991        // Initialize this to 0 as we will need to return a time duration later during read().
992        perf_event_file.total_time_running = 0;
993    }
994
995    let event_id = READ_FORMAT_ID_GENERATOR.fetch_add(1, Ordering::Relaxed);
996    perf_event_file.rf_id = event_id;
997
998    if group_fd.raw() == -1 {
999        perf_event_file.sample_id = event_id;
1000    } else {
1001        let group_file = current_task.get_file(group_fd)?;
1002        let group_file_object_id = group_file.id;
1003        let perf_state = get_perf_state(&current_task.kernel);
1004        let events = perf_state.format_id_lookup_table.lock();
1005        if let Some(rf_id) = events.get(&group_file_object_id) {
1006            perf_event_file.sample_id = *rf_id;
1007        } else {
1008            return error!(EINVAL);
1009        }
1010    }
1011
1012    if (read_format & perf_event_read_format_PERF_FORMAT_GROUP as u64) != 0 {
1013        track_stub!(
1014            TODO("https://fxbug.dev/402238049"),
1015            "[perf_event_open] implement read_format group"
1016        );
1017        return error!(ENOSYS);
1018    }
1019    if (read_format & perf_event_read_format_PERF_FORMAT_LOST as u64) != 0 {
1020        track_stub!(
1021            TODO("https://fxbug.dev/402260383"),
1022            "[perf_event_open] implement read_format lost"
1023        );
1024    }
1025
1026    // Set up notifier for handling ioctl calls to enable/disable sampling.
1027    let mut vmo_handle_copy =
1028        perf_event_file.perf_data_vmo.as_handle_ref().duplicate_handle(zx::Rights::SAME_RIGHTS);
1029
1030    // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
1031    // This is always sound regardless of the union's tag.
1032    let sample_period_in_ticks = unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period };
1033    // The sample period from the PERF_COUNT_SW_CPU_CLOCK is
1034    // 1 nanosecond per tick. Convert this duration into zx::duration.
1035    let zx_sample_period = zx::MonotonicDuration::from_nanos(sample_period_in_ticks as i64);
1036
1037    // SeqLock does not get instantiated with metadata values until mmap() is called.
1038    let seq_lock =
1039        Arc::new(OnceLock::<Result<SeqLock<PerfMetadataHeader, PerfMetadataValue>, Errno>>::new());
1040    let cloned_seq_lock = Arc::clone(&seq_lock);
1041    let mut vmo_write_offset = 0;
1042
1043    let closure = async move |_: LockedAndTask<'_>| {
1044        let mut profiler_state: Option<(profiler::SessionProxy, fidl::AsyncSocket)> = None;
1045
1046        // This loop will wait for messages from the sender.
1047        while let Some((command, profiling_complete_receiver)) = receiver.next().await {
1048            match command {
1049                IoctlOp::Enable => {
1050                    match set_up_profiler(zx_sample_period).await {
1051                        Ok((session_proxy, client)) => {
1052                            let start_request = profiler::SessionStartRequest {
1053                                buffer_results: Some(true),
1054                                buffer_size_mb: Some(8 as u64),
1055                                ..Default::default()
1056                            };
1057                            if let Err(e) = session_proxy.start(&start_request).await {
1058                                log_warn!("Failed to start profiling: {}", e);
1059                            } else {
1060                                profiler_state = Some((session_proxy, client));
1061                            }
1062                        }
1063                        Err(e) => {
1064                            log_warn!("Failed to profile: {}", e);
1065                        }
1066                    };
1067                    // Send notification anyway to unblock the ioctl caller.
1068                    let _ = profiling_complete_receiver.send(());
1069                }
1070                IoctlOp::Disable => {
1071                    if let Some((session_proxy, client)) = profiler_state.take() {
1072                        let handle = vmo_handle_copy
1073                            .as_mut()
1074                            .expect("Failed to get VMO handle")
1075                            .as_handle_ref()
1076                            .duplicate_handle(zx::Rights::SAME_RIGHTS)
1077                            .unwrap();
1078
1079                        if let Err(e) = stop_and_collect_samples(
1080                            session_proxy,
1081                            client,
1082                            &cloned_seq_lock,
1083                            &zx::Vmo::from(handle),
1084                            perf_event_file.sample_type,
1085                            perf_event_file.sample_id,
1086                            sample_period_in_ticks,
1087                            &mut vmo_write_offset,
1088                        )
1089                        .await
1090                        {
1091                            log_warn!("Failed to collect sample: {:?}", e);
1092                        }
1093                    }
1094                    // Send notification anyway to unblock the ioctl caller.
1095                    let _ = profiling_complete_receiver.send(());
1096                }
1097            }
1098        }
1099        ()
1100    };
1101    let req = SpawnRequestBuilder::new()
1102        .with_debug_name("perf-event-sampler")
1103        .with_async_closure(closure)
1104        .build();
1105    current_task.kernel().kthreads.spawner().spawn_from_request(req);
1106
1107    let file = Box::new(PerfEventFile {
1108        _tid: tid,
1109        _cpu: cpu,
1110        perf_event_file: RwLock::new(perf_event_file),
1111        security_state: security::perf_event_alloc(current_task),
1112        seq_lock: seq_lock,
1113    });
1114    // TODO: https://fxbug.dev/404739824 - Confirm whether to handle this as a "private" node.
1115    let file_handle =
1116        Anon::new_private_file(locked, current_task, file, OpenFlags::RDWR, "[perf_event]");
1117    let file_object_id = file_handle.id;
1118    let file_descriptor: Result<FdNumber, Errno> =
1119        current_task.add_file(locked, file_handle, FdFlags::empty());
1120
1121    match file_descriptor {
1122        Ok(fd) => {
1123            if group_fd.raw() == -1 {
1124                let perf_state = get_perf_state(&current_task.kernel);
1125                let mut events = perf_state.format_id_lookup_table.lock();
1126                events.insert(file_object_id, event_id);
1127            }
1128            Ok(fd.into())
1129        }
1130        Err(_) => {
1131            track_stub!(
1132                TODO("https://fxbug.dev/402453955"),
1133                "[perf_event_open] implement remaining error handling"
1134            );
1135            error!(EMFILE)
1136        }
1137    }
1138}
// On aarch64 builds, expose the perf_event_open syscall under its arch32 name as well.
// The arch32 entry point is the very same function, only re-exported under another name.
#[cfg(target_arch = "aarch64")]
mod arch32 {
    pub use super::sys_perf_event_open as sys_arch32_perf_event_open;
}

// Re-export everything from `arch32` from this module.
#[cfg(target_arch = "aarch64")]
pub use arch32::*;
1147
1148use crate::mm::memory::MemoryObject;
1149use crate::mm::{MemoryAccessorExt, ProtectionFlags};
1150use crate::task::CurrentTask;
1151use crate::vfs::{
1152    Anon, FdFlags, FdNumber, FileObject, FileObjectId, FileObjectState, FileOps, InputBuffer,
1153    OutputBuffer,
1154};
1155use crate::{fileops_impl_nonseekable, fileops_impl_noop_sync};