Skip to main content

starnix_core/perf/
mod.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::task::dynamic_thread_spawner::SpawnRequestBuilder;
6use anyhow::Context;
7use fidl_fuchsia_cpu_profiler as profiler;
8use fuchsia_component::client::connect_to_protocol;
9use fuchsia_runtime;
10use futures::StreamExt;
11use futures::channel::mpsc as future_mpsc;
12use regex_lite::Regex;
13use std::collections::HashMap;
14use std::error::Error;
15use std::sync::atomic::{AtomicU64, Ordering};
16use std::sync::{Arc, OnceLock, mpsc as sync_mpsc};
17use zerocopy::{Immutable, IntoBytes};
18
19use futures::io::{AsyncReadExt, Cursor};
20use fxt::TraceRecord;
21use fxt::profiler::ProfilerRecord;
22use fxt::session::SessionParser;
23use seq_lock::{SeqLock, SeqLockable, WriteSize};
24use starnix_logging::{log_info, log_warn, track_stub};
25use starnix_sync::{
26    FileOpsCore, LockDepMutex, LockDepRwLock, Locked, PerfEventLevel, PerfFormatIdLookupTableLock,
27    Unlocked,
28};
29use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
30use starnix_uapi::arch32::{
31    PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE, PERF_EVENT_IOC_ID,
32    PERF_EVENT_IOC_MODIFY_ATTRIBUTES, PERF_EVENT_IOC_PAUSE_OUTPUT, PERF_EVENT_IOC_PERIOD,
33    PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_REFRESH, PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_SET_BPF,
34    PERF_EVENT_IOC_SET_FILTER, PERF_EVENT_IOC_SET_OUTPUT, PERF_RECORD_MISC_KERNEL,
35    perf_event_sample_format_PERF_SAMPLE_CALLCHAIN, perf_event_sample_format_PERF_SAMPLE_ID,
36    perf_event_sample_format_PERF_SAMPLE_IDENTIFIER, perf_event_sample_format_PERF_SAMPLE_IP,
37    perf_event_sample_format_PERF_SAMPLE_PERIOD, perf_event_sample_format_PERF_SAMPLE_TID,
38    perf_event_type_PERF_RECORD_SAMPLE,
39};
40use starnix_uapi::errors::Errno;
41use starnix_uapi::open_flags::OpenFlags;
42use starnix_uapi::user_address::UserRef;
43use starnix_uapi::{
44    errno, error, from_status_like_fdio, perf_event_attr, perf_event_header,
45    perf_event_mmap_page__bindgen_ty_1, perf_event_read_format_PERF_FORMAT_GROUP,
46    perf_event_read_format_PERF_FORMAT_ID, perf_event_read_format_PERF_FORMAT_LOST,
47    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED,
48    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING, tid_t, uapi,
49};
50
51use crate::security::{self, TargetTaskType};
52use crate::task::{Kernel, LockedAndTask};
53
// Process-wide counter for handing out `u64` IDs; starts at 0.
// NOTE(review): presumably bumped once per perf event that needs a read-format ID. The call
// site is not in this part of the file — confirm.
static READ_FORMAT_ID_GENERATOR: AtomicU64 = AtomicU64::new(0);
// Default buffer size to read from socket (for sampling data).
// This is the size of the scratch buffer used for each read of text-format profiler output.
const DEFAULT_CHUNK_SIZE: usize = 4096;
// 4096 * 10, page size * 10 (assumes a 4 KiB page).
// If tests flake due to running out of buffer space, or if the profiling duration is
// significantly increased, this buffer size may need further adjustment (expansion).
const ESTIMATED_MMAP_BUFFER_SIZE: u64 = 40960;
// FXT magic bytes (little endian).
// The first 8 bytes read from the profiler socket are compared against this value to decide
// whether the stream is FXT or text.
const FXT_MAGIC_BYTES: [u8; 8] = [0x10, 0x00, 0x04, 0x46, 0x78, 0x54, 0x16, 0x00];
63
64mod event;
65pub use event::{TraceEvent, TraceEventQueue, TraceEventQueueList};
66
67pub mod lockless_ring_buffer;
68
// Header half of the perf metadata that is published through a `SeqLock`
// (see the `seq_lock` field of `PerfEventFile`).
// NOTE(review): these look like the leading `version` / `compat_version` fields of Linux's
// `perf_event_mmap_page` — confirm against the uapi definition.
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataHeader {
    version: u32,
    compat_version: u32,
}
75
// Value half of the perf metadata that is published through a `SeqLock`, following
// `PerfMetadataHeader`.
//
// The field order and widths are part of the shared-memory layout; do not reorder.
// NOTE(review): the fields appear to mirror the remainder of Linux's `perf_event_mmap_page`
// (everything after `compat_version`) — confirm against the uapi definition.
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataValue {
    // Sequence word; this is the inline sequence that `HAS_INLINE_SEQUENCE` refers to.
    lock: u32,
    index: u32,
    offset: i64,
    time_enabled: u64,
    time_running: u64,
    __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1,
    pmc_width: u16,
    time_shift: u16,
    time_mult: u32,
    time_offset: u64,
    time_zero: u64,
    size: u32,
    __reserved_1: u32,
    time_cycles: u64,
    time_mask: u64,
    __reserved: [u8; 928usize],
    // Set to the running write offset after each sample record is written to the data VMO.
    data_head: u64,
    data_tail: u64,
    data_offset: u64,
    data_size: u64,
    aux_head: u64,
    aux_tail: u64,
    aux_offset: u64,
    aux_size: u64,
}
104
// SAFETY: `PerfMetadataValue` can be safely written to shared memory in 8-byte chunks.
// It is `#[repr(C)]` and its fields pack into consecutive 8-byte units: `lock` + `index` fill
// the first unit, `pmc_width` + `time_shift` + `time_mult` fill one unit, `size` +
// `__reserved_1` fill one unit, `__reserved` is 928 (= 116 * 8) bytes, and every other scalar
// field is 64 bits wide. This assumes the bindgen union `__bindgen_anon_1` is exactly 8 bytes
// — TODO confirm.
// The first u32 is the `lock` field, which is why HAS_INLINE_SEQUENCE is true.
unsafe impl SeqLockable for PerfMetadataValue {
    const WRITE_SIZE: WriteSize = WriteSize::Eight;
    const HAS_INLINE_SEQUENCE: bool = true;
    const VMO_NAME: &'static [u8] = b"starnix:perf_event";
}
113
// Kernel-wide perf bookkeeping. One instance is stored in the kernel's expando and retrieved
// with `get_perf_state`.
struct PerfState {
    // This table maps a group leader's file object id to its unique u64 "format ID".
    //
    // When a sample is generated for any event in a group, we use this
    // "format ID" from the group leader as the value for *both* the
    // `PERF_SAMPLE_ID` and `PERF_SAMPLE_IDENTIFIER` fields.
    //
    // An entry is removed when the corresponding `PerfEventFile` is closed.
    format_id_lookup_table: LockDepMutex<HashMap<FileObjectId, u64>, PerfFormatIdLookupTableLock>,
}
122
123impl Default for PerfState {
124    fn default() -> Self {
125        Self { format_id_lookup_table: LockDepMutex::new(HashMap::new()) }
126    }
127}
128
129fn get_perf_state(kernel: &Arc<Kernel>) -> Arc<PerfState> {
130    kernel.expando.get_or_init(PerfState::default)
131}
132
// Passes every `perf_event_attr` field listed below to `uapi::check_arch_independent_layout!`.
// NOTE(review): presumably this is a compile-time assertion that the struct has the same
// layout on every supported architecture (e.g. the `arch32` variant) — confirm against the
// macro definition. Keep the list in sync with the struct's fields.
uapi::check_arch_independent_layout! {
    perf_event_attr {
        type_, // "type" is a reserved keyword so add a trailing underscore.
        size,
        config,
        __bindgen_anon_1,
        sample_type,
        read_format,
        _bitfield_1,
        __bindgen_anon_2,
        bp_type,
        __bindgen_anon_3,
        __bindgen_anon_4,
        branch_sample_type,
        sample_regs_user,
        sample_stack_user,
        clockid,
        sample_regs_intr,
        aux_watermark,
        sample_max_stack,
        __reserved_2,
        aux_sample_size,
        __reserved_3,
        sig_data,
        config3,
    }
}
160
// Command sent over `PerfEventFileState::ioctl_sender` when an ENABLE or DISABLE ioctl is
// received for an event that samples with a non-zero period.
#[derive(Clone, Copy, Debug, PartialEq)]
enum IoctlOp {
    // Sent for `PERF_EVENT_IOC_ENABLE`.
    Enable,
    // Sent for `PERF_EVENT_IOC_DISABLE`.
    Disable,
}
166
// Mutable state of one perf event file. It lives behind the `perf_event_file` lock of
// `PerfEventFile`.
struct PerfEventFileState {
    // The attributes supplied for this event; `read_format`, `freq()` and the sample
    // period/frequency union are consulted by `read()` and `ioctl()`.
    attr: perf_event_attr,
    rf_value: u64, // "count" for the config we passed in for the event.
    // The most recent timestamp (ns) where we changed into an enabled state
    // i.e. the most recent time we got an ENABLE ioctl().
    most_recent_enabled_time: u64,
    // Sum of all previous enablement segment durations (ns). If we are
    // currently in an enabled state, explicitly does NOT include the current
    // segment.
    total_time_running: u64,
    // Value reported by `read()` when `PERF_FORMAT_ID` is set in `attr.read_format`.
    rf_id: u64,
    // NOTE(review): presumably the ID stamped into sample records; its users are not in this
    // part of the file — confirm.
    sample_id: u64,
    _rf_lost: u64,
    // 0 means the event is enabled; any non-zero value means it is disabled.
    disabled: u64,
    // NOTE(review): presumably the `PERF_SAMPLE_*` bitmask that selects which fields are
    // written into each sample record — confirm against the constructor's caller.
    sample_type: u64,
    // Handle to blob that stores all the perf data that a user may want.
    // At the moment it only stores some metadata and backtraces (bts).
    perf_data_vmo: zx::Vmo,
    // Channel used to send IoctlOps to start/stop sampling.
    ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
}
188
189// Have an implementation for PerfEventFileState because VMO
190// doesn't have Default so we can't derive it.
191impl PerfEventFileState {
192    fn new(
193        attr: perf_event_attr,
194        rf_value: u64,
195        disabled: u64,
196        sample_type: u64,
197        perf_data_vmo: zx::Vmo,
198        ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
199    ) -> PerfEventFileState {
200        PerfEventFileState {
201            attr,
202            rf_value,
203            most_recent_enabled_time: 0,
204            total_time_running: 0,
205            rf_id: 0,
206            sample_id: 0,
207            _rf_lost: 0,
208            disabled,
209            sample_type,
210            perf_data_vmo,
211            ioctl_sender,
212        }
213    }
214}
215
// File object backing a perf event; it implements `FileOps`.
pub struct PerfEventFile {
    // NOTE(review): presumably the `pid`/`cpu` arguments the event was opened with; both are
    // currently unused (hence the leading underscore) — confirm.
    _tid: tid_t,
    _cpu: i32,
    // Mutable per-event state, guarded by a reader/writer lock.
    perf_event_file: LockDepRwLock<PerfEventFileState, PerfEventLevel>,
    // The security state for this PerfEventFile.
    pub security_state: security::PerfEventState,
    // Lazily created seq lock over the metadata page. It is initialised at most once, by the
    // first `get_memory()` call, and the stored `Err` records a failed initialisation.
    seq_lock: Arc<OnceLock<Result<SeqLock<PerfMetadataHeader, PerfMetadataValue>, Errno>>>,
}
224
225// PerfEventFile object that implements FileOps.
226// See https://man7.org/linux/man-pages/man2/perf_event_open.2.html for
227// implementation details.
228// This object can be saved as a FileDescriptor.
229impl FileOps for PerfEventFile {
230    // Don't need to implement seek or sync for PerfEventFile.
231    fileops_impl_nonseekable!();
232    fileops_impl_noop_sync!();
233
234    fn close(
235        self: Box<Self>,
236        _locked: &mut Locked<FileOpsCore>,
237        file: &FileObjectState,
238        current_task: &CurrentTask,
239    ) {
240        let perf_state = get_perf_state(&current_task.kernel);
241        let mut events = perf_state.format_id_lookup_table.lock();
242        events.remove(&file.id);
243    }
244
245    // See "Reading results" section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html.
246    fn read(
247        &self,
248        _locked: &mut Locked<FileOpsCore>,
249        _file: &FileObject,
250        current_task: &CurrentTask,
251        _offset: usize,
252        data: &mut dyn OutputBuffer,
253    ) -> Result<usize, Errno> {
254        // Create/calculate and return the ReadFormatData object.
255        // If we create it earlier we might want to change it and it's immutable once created.
256        let read_format_data = {
257            // Once we get the `value` or count from kernel, we can change this to a read()
258            // call instead of write().
259            let mut perf_event_file = self.perf_event_file.write();
260
261            security::check_perf_event_read_access(current_task, &self)?;
262
263            let mut total_time_running_including_curr = perf_event_file.total_time_running;
264
265            // Only update values if enabled (either by perf_event_attr or ioctl ENABLE call).
266            if perf_event_file.disabled == 0 {
267                // Calculate the value or "count" of the config we're interested in.
268                // This value should reflect the value we are counting (defined in the config).
269                // E.g. for PERF_COUNT_SW_CPU_CLOCK it would return the value from the CPU clock.
270                // For now we just return rf_value + 1.
271                track_stub!(
272                    TODO("https://fxbug.dev/402938671"),
273                    "[perf_event_open] implement read_format value"
274                );
275                perf_event_file.rf_value += 1;
276
277                // Update time duration.
278                let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
279                total_time_running_including_curr +=
280                    curr_time - perf_event_file.most_recent_enabled_time;
281            }
282
283            let mut output = Vec::<u8>::new();
284            let value = perf_event_file.rf_value.to_ne_bytes();
285            output.extend(value);
286
287            let read_format = perf_event_file.attr.read_format;
288
289            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 {
290                // Total time (ns) event was enabled and running (currently same as TIME_RUNNING).
291                output.extend(total_time_running_including_curr.to_ne_bytes());
292            }
293            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 {
294                // Total time (ns) event was enabled and running (currently same as TIME_ENABLED).
295                output.extend(total_time_running_including_curr.to_ne_bytes());
296            }
297            if (read_format & perf_event_read_format_PERF_FORMAT_ID as u64) != 0 {
298                // Adds a 64-bit unique value that corresponds to the event group.
299                output.extend(perf_event_file.rf_id.to_ne_bytes());
300            }
301
302            output
303        };
304
305        // The regular read() call allows the case where the bytes-we-want-to-read-in won't
306        // fit in the output buffer. However, for perf_event_open's read(), "If you attempt to read
307        // into a buffer that is not big enough to hold the data, the error ENOSPC results."
308        if data.available() < read_format_data.len() {
309            return error!(ENOSPC);
310        }
311        track_stub!(
312            TODO("https://fxbug.dev/402453955"),
313            "[perf_event_open] implement remaining error handling"
314        );
315
316        data.write(&read_format_data)
317    }
318
319    fn ioctl(
320        &self,
321        _locked: &mut Locked<Unlocked>,
322        _file: &FileObject,
323        current_task: &CurrentTask,
324        op: u32,
325        _arg: SyscallArg,
326    ) -> Result<SyscallResult, Errno> {
327        track_stub!(
328            TODO("https://fxbug.dev/405463320"),
329            "[perf_event_open] implement PERF_IOC_FLAG_GROUP"
330        );
331        security::check_perf_event_write_access(current_task, &self)?;
332        let mut perf_event_file = self.perf_event_file.write();
333        match op {
334            PERF_EVENT_IOC_ENABLE => {
335                if perf_event_file.disabled != 0 {
336                    perf_event_file.disabled = 0; // 0 = false.
337                    perf_event_file.most_recent_enabled_time =
338                        zx::MonotonicInstant::get().into_nanos() as u64;
339                }
340
341                // If we are sampling, invoke the profiler and collect a sample.
342                // Currently this is an example sample collection.
343                track_stub!(
344                    TODO("https://fxbug.dev/398914921"),
345                    "[perf_event_open] implement full sampling features"
346                );
347                if perf_event_file.attr.freq() == 0
348                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
349                // This is always sound regardless of the union's tag.
350                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
351                {
352                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Enable);
353                }
354                return Ok(SUCCESS);
355            }
356            PERF_EVENT_IOC_DISABLE => {
357                if perf_event_file.disabled == 0 {
358                    perf_event_file.disabled = 1; // 1 = true.
359
360                    // Update total_time_running now that the segment has ended.
361                    let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
362                    perf_event_file.total_time_running +=
363                        curr_time - perf_event_file.most_recent_enabled_time;
364                }
365                if perf_event_file.attr.freq() == 0
366                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
367                // This is always sound regardless of the union's tag.
368                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
369                {
370                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Disable);
371                }
372                return Ok(SUCCESS);
373            }
374            PERF_EVENT_IOC_RESET => {
375                perf_event_file.rf_value = 0;
376                return Ok(SUCCESS);
377            }
378            PERF_EVENT_IOC_REFRESH
379            | PERF_EVENT_IOC_PERIOD
380            | PERF_EVENT_IOC_SET_OUTPUT
381            | PERF_EVENT_IOC_SET_FILTER
382            | PERF_EVENT_IOC_ID
383            | PERF_EVENT_IOC_SET_BPF
384            | PERF_EVENT_IOC_PAUSE_OUTPUT
385            | PERF_EVENT_IOC_MODIFY_ATTRIBUTES
386            | PERF_EVENT_IOC_QUERY_BPF => {
387                track_stub!(
388                    TODO("https://fxbug.dev/404941053"),
389                    "[perf_event_open] implement remaining ioctl() calls"
390                );
391                return error!(ENOSYS);
392            }
393            _ => error!(ENOTTY),
394        }
395    }
396
397    // TODO(https://fxbug.dev/460245383) match behavior when mmap() is called multiple times.
398    // Gets called when mmap() is called.
399    // Immediately before sampling, this should get called by the user (e.g. the test
400    // or Perfetto). We will then write the metadata to the VMO and return the pointer to it.
401    fn get_memory(
402        &self,
403        _locked: &mut Locked<FileOpsCore>,
404        _file: &FileObject,
405        current_task: &CurrentTask,
406        length: Option<usize>,
407        _prot: ProtectionFlags,
408    ) -> Result<Arc<MemoryObject>, Errno> {
409        let buffer_size: u64 = length.unwrap_or(0) as u64;
410        if buffer_size == 0 {
411            return error!(EINVAL);
412        }
413
414        self.seq_lock
415            .get_or_init(|| {
416                let perf_event_file = self.perf_event_file.read();
417                let vmo_copy = perf_event_file
418                    .perf_data_vmo
419                    .as_handle_ref()
420                    .duplicate_handle(zx::Rights::SAME_RIGHTS)
421                    .map_err(|status| from_status_like_fdio!(status))?;
422                // SAFETY: See safety requirements on `create_seq_lock`.
423                Ok(unsafe { create_seq_lock(&vmo_copy, buffer_size) })
424            })
425            .as_ref()
426            .map_err(|e| e.clone())?;
427
428        // Write to a MemoryObject and return it (expected return type for get_memory()).
429        security::check_perf_event_read_access(current_task, &self)?;
430        let perf_event_file = self.perf_event_file.read();
431        match perf_event_file
432            .perf_data_vmo
433            .as_handle_ref()
434            .duplicate_handle(zx::Rights::SAME_RIGHTS)
435        {
436            Ok(vmo) => {
437                let vmo: zx::Vmo = vmo.into();
438                let memory = MemoryObject::from(vmo);
439                return Ok(Arc::new(memory));
440            }
441            Err(_) => {
442                track_stub!(
443                    TODO("https://fxbug.dev/416323134"),
444                    "[perf_event_open] handle get_memory() errors"
445                );
446                return error!(EINVAL);
447            }
448        };
449    }
450
451    fn write(
452        &self,
453        _locked: &mut Locked<FileOpsCore>,
454        _file: &FileObject,
455        _current_task: &CurrentTask,
456        _offset: usize,
457        _data: &mut dyn InputBuffer,
458    ) -> Result<usize, Errno> {
459        track_stub!(
460            TODO("https://fxbug.dev/394960158"),
461            "[perf_event_open] implement perf event functions"
462        );
463        error!(ENOSYS)
464    }
465}
466
467// Given a PerfRecordSample struct, write it via the correct output format
468// (per https://man7.org/linux/man-pages/man2/perf_event_open.2.html) to the VMO.
469// We don't currently support all the sample_types listed in the docs.
470// Input:
471//    PerfRecordSample { pid: 5, tid: 10, nr: 3, ips[nr]: [111, 222, 333] }
472// Human-understandable output:
473//    9 1 40 111 5 10 3 111 222 333
474// Actual output (no spaces or \n in real output, just making it more readable):
475//    0x0000 0x0009                 <-- starts at `offset` bytes
476//    0x0001
477//    0x0040
478//    0x0000 0x0000 0x0000 0x006F   <-- starts at `offset` + 8 bytes
479//    0x0000 0x0000 0x0000 0x0005
480//    0x0000 0x0000 0x0000 0x0010
481//    0x0000 0x0000 0x0000 0x0003
482//    0x0000 0x0000 0x0000 0x006F
483//    0x0000 0x0000 0x0000 0x00DE
484//    0x0000 0x0000 0x0000 0x014D
485//
486//    Returns the length of bytes written. In above case, 8 + 28 = 36.
487//    This information is used to increment the global offset.
488fn write_record_to_vmo(
489    perf_record_sample: PerfRecordSample,
490    perf_data_vmo: &zx::Vmo,
491    sample_type: u64,
492    sample_id: u64,
493    sample_period: u64,
494    offset: u64,
495) -> u64 {
496    // First, build record to determine its size (so that we can fill out `size` in header).
497    let mut sample = Vec::<u8>::new();
498    // sample_id
499    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IDENTIFIER as u64) != 0 {
500        sample.extend(sample_id.to_ne_bytes());
501    }
502    // ip
503    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IP as u64) != 0 {
504        sample.extend(perf_record_sample.ips[0].to_ne_bytes());
505    }
506
507    if (sample_type & perf_event_sample_format_PERF_SAMPLE_TID as u64) != 0 {
508        // pid
509        sample.extend(perf_record_sample.pid.expect("missing pid").to_ne_bytes());
510        // tid
511        sample.extend(perf_record_sample.tid.expect("missing tid").to_ne_bytes());
512    }
513
514    // id
515    if (sample_type & perf_event_sample_format_PERF_SAMPLE_ID as u64) != 0 {
516        sample.extend(sample_id.to_ne_bytes());
517    }
518
519    // sample period
520    if (sample_type & perf_event_sample_format_PERF_SAMPLE_PERIOD as u64) != 0 {
521        sample.extend(sample_period.to_ne_bytes());
522    }
523
524    if (sample_type & perf_event_sample_format_PERF_SAMPLE_CALLCHAIN as u64) != 0 {
525        // nr
526        sample.extend(perf_record_sample.ips.len().to_ne_bytes());
527
528        // ips[nr] - list of ips, u64 per ip.
529        for i in perf_record_sample.ips {
530            sample.extend(i.to_ne_bytes());
531        }
532    }
533    // The remaining data are not defined for now.
534
535    // Now that we know the sample size, we can calculate the record size.
536    // record_size = perf_event_header_size + sample_size.
537    // perf_event_header is defined to be 8 bytes.
538    let record_size: u64 = (std::mem::size_of::<perf_event_header>() + sample.len()) as u64;
539
540    track_stub!(
541        TODO("https://fxbug.dev/432501467"),
542        "[perf_event_open] determines whether the record is KERNEL or USER"
543    );
544    let perf_event_header = perf_event_header {
545        type_: perf_event_type_PERF_RECORD_SAMPLE,
546        misc: PERF_RECORD_MISC_KERNEL as u16,
547        size: record_size as u16,
548    };
549
550    // Total data offset. This is where the record should start getting written.
551    // The first page is reserved for metadata, so we need to add the page size.
552    // Example:
553    //  You're writing the first record (size 100). Start writing at 0 + 4096.
554    //  You're writing the second record. Start writing at 100 + 4096.
555    let data_offset = offset + (zx::system_get_page_size() as u64);
556
557    // Write header to memory.
558    match perf_data_vmo.write(&perf_event_header.as_bytes(), data_offset) {
559        Ok(_) => (),
560        Err(e) => log_warn!("Failed to write perf_event_header: {}", e),
561    }
562
563    // Write sample to memory immediately after the header.
564    match perf_data_vmo
565        .write(&sample, data_offset + (std::mem::size_of::<perf_event_header>() as u64))
566    {
567        Ok(_) => {
568            // Return the total size we wrote (header + sample) so that we can
569            // increment offset counter.
570            return record_size;
571        }
572        Err(e) => {
573            log_warn!("Failed to write PerfRecordSample to VMO due to: {}", e);
574            // Failed to write. Don't increment offset counter.
575            return 0;
576        }
577    }
578}
579
// One profiler sample, in the form that `write_record_to_vmo` serialises into a
// PERF_RECORD_SAMPLE record.
#[derive(Debug, Clone)]
struct PerfRecordSample {
    // Process id; written (with `tid`) when PERF_SAMPLE_TID is requested.
    pid: Option<u32>,
    // Thread id; written (with `pid`) when PERF_SAMPLE_TID is requested.
    tid: Option<u32>,
    // Instruction pointers (currently this is the address). First one is `ip` param.
    // The full list is written as the callchain when PERF_SAMPLE_CALLCHAIN is requested.
    ips: Vec<u64>,
}
587
588// Parses a backtrace (bt) to obtain the params for a PerfRecordSample. Example:
589//
590// 1234                     pid
591// 5555                     tid
592// {{{bt:0:0x1111:pc}}}    {{{bt:frame_number:address:type}}}
593// {{{bt:1:0x2222:ra}}}
594// {{{bt:2:0x3333:ra}}}
595//
596// Results in:
597// PerfRecordSample { pid: 1234, tid: 5555, nr: 3, ips: [0x1111, 0x2222, 0x3333] }
598
599fn parse_perf_record_sample_format(backtrace: &str) -> Option<PerfRecordSample> {
600    let mut pid: Option<u32> = None;
601    let mut tid: Option<u32> = None;
602    let mut ips: Vec<u64> = Vec::new();
603    let mut numbers_found = 0;
604    track_stub!(TODO("https://fxbug.dev/437171287"), "[perf_event_open] handle regex nuances");
605    let backtrace_regex =
606        Regex::new(r"^\s*\{\{\{bt:\d+:((0x[0-9a-fA-F]+)):(?:pc|ra)\}\}\}\s*$").unwrap();
607
608    for line in backtrace.lines() {
609        let trimmed_line = line.trim();
610        // Try to parse as a raw number (for PID/TID).
611        if numbers_found < 2 {
612            if let Ok(num) = trimmed_line.parse::<u32>() {
613                if numbers_found == 0 {
614                    pid = Some(num);
615                } else {
616                    tid = Some(num);
617                }
618                numbers_found += 1;
619                continue;
620            }
621        }
622
623        // Try to parse as a backtrace line.
624        if let Some(parsed_bt) = backtrace_regex.captures(trimmed_line) {
625            let address_str = parsed_bt.get(1).unwrap().as_str();
626            if let Ok(ip_addr) = u64::from_str_radix(address_str.trim_start_matches("0x"), 16) {
627                ips.push(ip_addr);
628            }
629        }
630    }
631
632    if pid == None || tid == None || ips.is_empty() {
633        // This data chunk might've been an {{{mmap}}} chunk, and not a {{{bt}}}.
634        log_info!("No ips while getting PerfRecordSample");
635        None
636    } else {
637        Some(PerfRecordSample { pid: pid, tid: tid, ips: ips })
638    }
639}
640
641async fn set_up_profiler(
642    sample_period: zx::MonotonicDuration,
643) -> Result<(profiler::SessionProxy, fidl::AsyncSocket), Errno> {
644    // Configuration for how we want to sample.
645    let sample = profiler::Sample {
646        callgraph: Some(profiler::CallgraphConfig {
647            strategy: Some(profiler::CallgraphStrategy::FramePointer),
648            ..Default::default()
649        }),
650        ..Default::default()
651    };
652
653    let sampling_config = profiler::SamplingConfig {
654        period: Some(sample_period.into_nanos() as u64),
655        timebase: Some(profiler::Counter::PlatformIndependent(profiler::CounterId::Nanoseconds)),
656        sample: Some(sample),
657        ..Default::default()
658    };
659
660    track_stub!(
661        TODO("https://fxbug.dev/398914921"),
662        "[perf_event_open] allow for profiling system-wide not during tests"
663    );
664    let job = fuchsia_runtime::job_default();
665    let koid = job.koid().map_err(|e| errno!(EINVAL, e.to_string()))?;
666    let tasks = vec![
667        // Should return ~1300 samples for 1000 millis.
668        profiler::Task::Job(koid.raw_koid()),
669    ];
670    let targets = profiler::TargetConfig::Tasks(tasks);
671    let config = profiler::Config {
672        configs: Some(vec![sampling_config]),
673        target: Some(targets),
674        ..Default::default()
675    };
676    let (client, server) = fidl::Socket::create_stream();
677    let configure = profiler::SessionConfigureRequest {
678        output: Some(server),
679        config: Some(config),
680        ..Default::default()
681    };
682
683    let proxy = connect_to_protocol::<profiler::SessionMarker>()
684        .context("Error connecting to Profiler protocol");
685    let session_proxy: profiler::SessionProxy = match proxy {
686        Ok(p) => p.clone(),
687        Err(e) => return error!(EINVAL, e),
688    };
689
690    // Must configure before sampling start().
691    let config_request = session_proxy.configure(configure).await;
692    match config_request {
693        Ok(_) => Ok((session_proxy, fidl::AsyncSocket::from_socket(client))),
694        Err(e) => return error!(EINVAL, e),
695    }
696}
697
698// Collects samples and puts backtrace in VMO.
699// - Reads in the buffer from the socket for that duration in chunks.
700// - Parses the buffer backtraces into PERF_RECORD_SAMPLE format.
701// - Writes the PERF_RECORD_SAMPLE into VMO.
702async fn stop_and_collect_samples(
703    session_proxy: profiler::SessionProxy,
704    mut client: fidl::AsyncSocket,
705    seq_lock: &OnceLock<Result<SeqLock<PerfMetadataHeader, PerfMetadataValue>, Errno>>,
706    perf_data_vmo: &zx::Vmo,
707    sample_type: u64,
708    sample_id: u64,
709    sample_period: u64,
710    vmo_write_offset: &mut u64,
711) -> Result<(), Errno> {
712    let stats = session_proxy.stop().await;
713
714    let seq_lock_wrapper = match seq_lock.get() {
715        Some(Ok(l)) => l,
716        // Initialization failed in a previous mmap() call. Propagate the error.
717        Some(Err(e)) => return Err(e.clone()),
718        // Not initialized yet (i.e. mmap() hasn't been called). Skip updating metadata.
719        None => return Ok(()),
720    };
721
722    let samples_collected = match stats {
723        Ok(stats) => stats.samples_collected.unwrap(),
724        Err(e) => return error!(EINVAL, e),
725    };
726
727    track_stub!(
728        TODO("https://fxbug.dev/422502681"),
729        "[perf_event_open] symbolize sample output and delete the below log_info"
730    );
731    log_info!("profiler samples_collected: {:?}", samples_collected);
732
733    // Peek at the first 8 bytes to determine if it's FXT or text.
734    let mut header = [0; 8];
735    let mut bytes_read = 0;
736    while bytes_read < 8 {
737        match client.read(&mut header[bytes_read..]).await {
738            Ok(0) => {
739                // Peer closed the socket. This is the normal end of the stream.
740                log_info!("[perf_event_open] Finished reading fxt record from socket.");
741                break;
742            }
743            Ok(n) => bytes_read += n,
744            Err(e) => {
745                log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
746                break;
747            }
748        }
749    }
750
751    if bytes_read > 0 {
752        if bytes_read == 8 && header == FXT_MAGIC_BYTES {
753            // FXT format.
754            let header_cursor = Cursor::new(header);
755            let reader = header_cursor.chain(client);
756            let (mut stream, _task) = SessionParser::new_async(reader);
757            while let Some(record_result) = stream.next().await {
758                match record_result {
759                    Ok(TraceRecord::Profiler(ProfilerRecord::Backtrace(backtrace))) => {
760                        let ips: Vec<u64> = backtrace.data;
761                        let pid = Some(backtrace.process.0 as u32);
762                        let tid = Some(backtrace.thread.0 as u32);
763                        let perf_record_sample = PerfRecordSample { pid, tid, ips };
764                        let bytes_written = write_record_to_vmo(
765                            perf_record_sample,
766                            perf_data_vmo,
767                            sample_type,
768                            sample_id,
769                            sample_period,
770                            *vmo_write_offset,
771                        );
772                        // Update data_head after writing sample.
773                        if bytes_written > 0 {
774                            *vmo_write_offset += bytes_written;
775                            let mut metadata = seq_lock_wrapper.get();
776                            metadata.data_head = *vmo_write_offset;
777                            seq_lock_wrapper.set_value(metadata);
778                        }
779                    }
780                    Ok(_) => {
781                        // Ignore other records.
782                    }
783                    Err(e) => {
784                        log_warn!("[perf_event_open] Error parsing FXT: {:?}", e);
785                        break;
786                    }
787                }
788            }
789        } else {
790            // Text format.
791            // Read chunks of sampling data from socket in this buffer temporarily. We will parse
792            // the data and write it into the output VMO (the one mmap points to).
793            let mut buffer = vec![0; DEFAULT_CHUNK_SIZE];
794
795            loop {
796                // Attempt to read data. This awaits until data is available, EOF, or error.
797                // Ignore the first 8 bytes as it's the {{{reset}}} marker.
798                let socket_data = client.read(&mut buffer).await;
799
800                match socket_data {
801                    Ok(0) => {
802                        // Peer closed the socket. This is the normal end of the stream.
803                        log_info!("[perf_event_open] Finished reading from socket.");
804                        break;
805                    }
806                    Ok(bytes_read) => {
807                        // Receive data in format {{{...}}}.
808                        let received_data = match std::str::from_utf8(&buffer[..bytes_read]) {
809                            Ok(data) => data,
810                            Err(e) => return error!(EINVAL, e),
811                        };
812                        // Parse data to PerfRecordSample struct.
813                        if let Some(perf_record_sample) =
814                            parse_perf_record_sample_format(received_data)
815                        {
816                            let bytes_written = write_record_to_vmo(
817                                perf_record_sample,
818                                perf_data_vmo,
819                                sample_type,
820                                sample_id,
821                                sample_period,
822                                *vmo_write_offset,
823                            );
824                            // Update data_head after writing sample.
825                            if bytes_written > 0 {
826                                *vmo_write_offset += bytes_written;
827                                let mut metadata = seq_lock_wrapper.get();
828                                metadata.data_head = *vmo_write_offset;
829                                seq_lock_wrapper.set_value(metadata);
830                            }
831                        }
832                    }
833                    Err(e) => {
834                        log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
835                        break;
836                    }
837                }
838            }
839        }
840    }
841
842    let reset_status = session_proxy.reset().await;
843    return match reset_status {
844        Ok(_) => Ok(()),
845        Err(e) => error!(EINVAL, e),
846    };
847}
848
849// Notifies other thread that we should start/stop sampling.
850// Once sampling is complete, that profiler session is no longer needed.
851// At that point, send back notification so that this is no longer blocking
852// (e.g. so that other profiler sessions can start).
853fn ping_receiver(
854    mut ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
855    command: IoctlOp,
856) {
857    log_info!("[perf_event_open] Received sampling command: {:?}", command);
858    let (profiling_complete_sender, profiling_complete_receiver) = sync_mpsc::channel::<()>();
859    match ioctl_sender.try_send((command, profiling_complete_sender)) {
860        Ok(_) => (),
861        Err(e) => {
862            if e.is_full() {
863                log_warn!("[perf_event_open] Failed to send {:?}: Channel full", command);
864            } else if e.is_disconnected() {
865                log_warn!("[perf_event_open] Failed to send {:?}: Receiver disconnected", command);
866            } else {
867                log_warn!("[perf_event_open] Failed to send {:?} due to {:?}", command, e.source());
868            }
869        }
870    };
871    // Block on / wait until profiling is complete before returning.
872    // This notifies that the profiler is free to be used for another session.
873    let _ = profiling_complete_receiver.recv().unwrap();
874}
875
876// Creates a seq lock for the given VMO. Initializes the seq lock with
877// known initial values (unknown values default to 0).
878// Does NOT actually save this as a memory object until mmap() is called.
879//
880// # Safety
881//
882// The caller must ensure that the kernel maintains exclusive write access to this VMO and
883// there are only atomic accesses to this memory (see seq_lock lib.rs for details).
884unsafe fn create_seq_lock(
885    vmo_handle_ref: &zx::NullableHandle,
886    buffer_size: u64,
887) -> SeqLock<PerfMetadataHeader, PerfMetadataValue> {
888    // Currently we hardcode everything just to get something E2E working.
889    let metadata_header = PerfMetadataHeader { version: 1, compat_version: 2 };
890    let page_size = zx::system_get_page_size() as u64;
891    let metadata_value = PerfMetadataValue {
892        lock: 0,
893        index: 3,
894        offset: 19337,
895        time_enabled: 0,
896        time_running: 0,
897        __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1 { capabilities: 30 },
898        pmc_width: 0,
899        time_shift: 0,
900        time_mult: 0,
901        time_offset: 0,
902        time_zero: 0,
903        size: 0,
904        __reserved_1: 0,
905        time_cycles: 0,
906        time_mask: 0,
907        __reserved: [0; 928usize],
908        // This first page (metadata) has finished writing. Start data_head at 0.
909        data_head: 0,
910        // Start reading from 0; it is the user's responsibility to increment on their end.
911        data_tail: 0,
912        // We know the data will start after 1 page size so we can set this now.
913        data_offset: page_size,
914        data_size: buffer_size - page_size,
915        aux_head: 0,
916        aux_tail: 0,
917        aux_offset: 0,
918        aux_size: 0,
919    };
920    let vmo = zx::Vmo::from(vmo_handle_ref.duplicate_handle(zx::Rights::SAME_RIGHTS).unwrap());
921
922    // Create a SeqLock and safely initialize the `header` and `value` for it.
923    // SeqLock is formatted thusly:
924    //   header_struct : any size, params `version` and `compat_version` should not change
925    //   sequence_counter : u32, this is the lock and should increment
926    //   value_struct : any size, each param can change
927    //
928    // SAFETY: See safety requirements on `create_seq_lock`.
929    unsafe {
930        SeqLock::new_from_vmo(metadata_header, metadata_value, vmo)
931            .expect("failed to create seq_lock for perf metadata")
932    }
933}
934
935pub fn sys_perf_event_open(
936    locked: &mut Locked<Unlocked>,
937    current_task: &CurrentTask,
938    attr: UserRef<perf_event_attr>,
939    // Note that this is pid in Linux docs.
940    tid: tid_t,
941    cpu: i32,
942    group_fd: FdNumber,
943    _flags: u64,
944) -> Result<SyscallResult, Errno> {
945    // So far, the implementation only sets the read_data_format according to the "Reading results"
946    // section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html for a single event.
947    // Other features will be added in the future (see below track_stubs).
948    let perf_event_attrs: perf_event_attr = current_task.read_object(attr)?;
949
950    if tid == -1 && cpu == -1 {
951        return error!(EINVAL);
952    }
953
954    let target_task_type = match tid {
955        -1 => TargetTaskType::AllTasks,
956        0 => TargetTaskType::CurrentTask,
957        _ => {
958            track_stub!(TODO("https://fxbug.dev/409621963"), "[perf_event_open] implement tid > 0");
959            return error!(ENOSYS);
960        }
961    };
962    security::check_perf_event_open_access(
963        current_task,
964        target_task_type,
965        &perf_event_attrs,
966        perf_event_attrs.type_.try_into()?,
967    )?;
968
969    // Channel used to send info between notifier and spawned task thread.
970    // We somewhat arbitrarily picked 8 for now in case we get a bunch of ioctls that are in
971    // quick succession (instead of something lower).
972    let (sender, mut receiver) = future_mpsc::channel(8);
973
974    let mut perf_event_file = PerfEventFileState::new(
975        perf_event_attrs,
976        0,
977        perf_event_attrs.disabled(),
978        perf_event_attrs.sample_type,
979        zx::Vmo::create(ESTIMATED_MMAP_BUFFER_SIZE).unwrap(),
980        sender,
981    );
982
983    let read_format = perf_event_attrs.read_format;
984
985    if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0
986        || (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0
987    {
988        // Only keep track of most_recent_enabled_time if we are currently in ENABLED state,
989        // as otherwise this param shouldn't be used for calculating anything.
990        if perf_event_file.disabled == 0 {
991            perf_event_file.most_recent_enabled_time =
992                zx::MonotonicInstant::get().into_nanos() as u64;
993        }
994        // Initialize this to 0 as we will need to return a time duration later during read().
995        perf_event_file.total_time_running = 0;
996    }
997
998    let event_id = READ_FORMAT_ID_GENERATOR.fetch_add(1, Ordering::Relaxed);
999    perf_event_file.rf_id = event_id;
1000
1001    if group_fd.raw() == -1 {
1002        perf_event_file.sample_id = event_id;
1003    } else {
1004        let group_file = current_task.get_file(group_fd)?;
1005        let group_file_object_id = group_file.id;
1006        let perf_state = get_perf_state(&current_task.kernel);
1007        let events = perf_state.format_id_lookup_table.lock();
1008        if let Some(rf_id) = events.get(&group_file_object_id) {
1009            perf_event_file.sample_id = *rf_id;
1010        } else {
1011            return error!(EINVAL);
1012        }
1013    }
1014
1015    if (read_format & perf_event_read_format_PERF_FORMAT_GROUP as u64) != 0 {
1016        track_stub!(
1017            TODO("https://fxbug.dev/402238049"),
1018            "[perf_event_open] implement read_format group"
1019        );
1020        return error!(ENOSYS);
1021    }
1022    if (read_format & perf_event_read_format_PERF_FORMAT_LOST as u64) != 0 {
1023        track_stub!(
1024            TODO("https://fxbug.dev/402260383"),
1025            "[perf_event_open] implement read_format lost"
1026        );
1027    }
1028
1029    // Set up notifier for handling ioctl calls to enable/disable sampling.
1030    let mut vmo_handle_copy =
1031        perf_event_file.perf_data_vmo.as_handle_ref().duplicate_handle(zx::Rights::SAME_RIGHTS);
1032
1033    // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
1034    // This is always sound regardless of the union's tag.
1035    let sample_period_in_ticks = unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period };
1036    // The sample period from the PERF_COUNT_SW_CPU_CLOCK is
1037    // 1 nanosecond per tick. Convert this duration into zx::duration.
1038    let zx_sample_period = zx::MonotonicDuration::from_nanos(sample_period_in_ticks as i64);
1039
1040    // SeqLock does not get instantiated with metadata values until mmap() is called.
1041    let seq_lock =
1042        Arc::new(OnceLock::<Result<SeqLock<PerfMetadataHeader, PerfMetadataValue>, Errno>>::new());
1043    let cloned_seq_lock = Arc::clone(&seq_lock);
1044    let mut vmo_write_offset = 0;
1045
1046    let closure = async move |_: LockedAndTask<'_>| {
1047        let mut profiler_state: Option<(profiler::SessionProxy, fidl::AsyncSocket)> = None;
1048
1049        // This loop will wait for messages from the sender.
1050        while let Some((command, profiling_complete_receiver)) = receiver.next().await {
1051            match command {
1052                IoctlOp::Enable => {
1053                    match set_up_profiler(zx_sample_period).await {
1054                        Ok((session_proxy, client)) => {
1055                            let start_request = profiler::SessionStartRequest {
1056                                buffer_results: Some(true),
1057                                buffer_size_mb: Some(8 as u64),
1058                                ..Default::default()
1059                            };
1060                            if let Err(e) = session_proxy.start(&start_request).await {
1061                                log_warn!("Failed to start profiling: {}", e);
1062                            } else {
1063                                profiler_state = Some((session_proxy, client));
1064                            }
1065                        }
1066                        Err(e) => {
1067                            log_warn!("Failed to profile: {}", e);
1068                        }
1069                    };
1070                    // Send notification anyway to unblock the ioctl caller.
1071                    let _ = profiling_complete_receiver.send(());
1072                }
1073                IoctlOp::Disable => {
1074                    if let Some((session_proxy, client)) = profiler_state.take() {
1075                        let handle = vmo_handle_copy
1076                            .as_mut()
1077                            .expect("Failed to get VMO handle")
1078                            .as_handle_ref()
1079                            .duplicate_handle(zx::Rights::SAME_RIGHTS)
1080                            .unwrap();
1081
1082                        if let Err(e) = stop_and_collect_samples(
1083                            session_proxy,
1084                            client,
1085                            &cloned_seq_lock,
1086                            &zx::Vmo::from(handle),
1087                            perf_event_file.sample_type,
1088                            perf_event_file.sample_id,
1089                            sample_period_in_ticks,
1090                            &mut vmo_write_offset,
1091                        )
1092                        .await
1093                        {
1094                            log_warn!("Failed to collect sample: {:?}", e);
1095                        }
1096                    }
1097                    // Send notification anyway to unblock the ioctl caller.
1098                    let _ = profiling_complete_receiver.send(());
1099                }
1100            }
1101        }
1102        ()
1103    };
1104    let req = SpawnRequestBuilder::new()
1105        .with_debug_name("perf-event-sampler")
1106        .with_async_closure(closure)
1107        .build();
1108    current_task.kernel().kthreads.spawner().spawn_from_request(req);
1109
1110    let file = Box::new(PerfEventFile {
1111        _tid: tid,
1112        _cpu: cpu,
1113        perf_event_file: LockDepRwLock::new(perf_event_file),
1114        security_state: security::perf_event_alloc(current_task),
1115        seq_lock: seq_lock,
1116    });
1117    // TODO: https://fxbug.dev/404739824 - Confirm whether to handle this as a "private" node.
1118    let file_handle =
1119        Anon::new_private_file(locked, current_task, file, OpenFlags::RDWR, "[perf_event]");
1120    let file_object_id = file_handle.id;
1121    let file_descriptor: Result<FdNumber, Errno> =
1122        current_task.add_file(locked, file_handle, FdFlags::empty());
1123
1124    match file_descriptor {
1125        Ok(fd) => {
1126            if group_fd.raw() == -1 {
1127                let perf_state = get_perf_state(&current_task.kernel);
1128                let mut events = perf_state.format_id_lookup_table.lock();
1129                events.insert(file_object_id, event_id);
1130            }
1131            Ok(fd.into())
1132        }
1133        Err(_) => {
1134            track_stub!(
1135                TODO("https://fxbug.dev/402453955"),
1136                "[perf_event_open] implement remaining error handling"
1137            );
1138            error!(EMFILE)
1139        }
1140    }
1141}
// Syscall entry points for arch32 usage. These are only compiled for aarch64 targets.
// The arch32 variant of perf_event_open is the very same function, re-exported under the
// `sys_arch32_` name.
#[cfg(target_arch = "aarch64")]
mod arch32 {
    pub use super::sys_perf_event_open as sys_arch32_perf_event_open;
}

// Surface everything from `arch32` at this module's level.
#[cfg(target_arch = "aarch64")]
pub use self::arch32::*;
1150
1151use crate::mm::memory::MemoryObject;
1152use crate::mm::{MemoryAccessorExt, ProtectionFlags};
1153use crate::task::CurrentTask;
1154use crate::vfs::{
1155    Anon, FdFlags, FdNumber, FileObject, FileObjectId, FileObjectState, FileOps, InputBuffer,
1156    OutputBuffer,
1157};
1158use crate::{fileops_impl_nonseekable, fileops_impl_noop_sync};