Skip to main content

starnix_core/perf/
mod.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::task::dynamic_thread_spawner::SpawnRequestBuilder;
6use anyhow::Context;
7use fidl_fuchsia_cpu_profiler as profiler;
8use fuchsia_component::client::connect_to_protocol;
9use futures::StreamExt;
10use futures::channel::mpsc as future_mpsc;
11use regex_lite::Regex;
12use std::collections::HashMap;
13use std::error::Error;
14use std::sync::atomic::{AtomicPtr, AtomicU64, Ordering};
15use std::sync::{Arc, mpsc as sync_mpsc};
16use zerocopy::{Immutable, IntoBytes};
17use zx::HandleBased;
18
19use futures::io::{AsyncReadExt, Cursor};
20use fxt::TraceRecord;
21use fxt::profiler::ProfilerRecord;
22use fxt::session::SessionParser;
23use seq_lock::{SeqLock, SeqLockable, WriteSize};
24use starnix_logging::{log_info, log_warn, track_stub};
25use starnix_sync::{FileOpsCore, Locked, Mutex, RwLock, Unlocked};
26use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
27use starnix_uapi::arch32::{
28    PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE, PERF_EVENT_IOC_ID,
29    PERF_EVENT_IOC_MODIFY_ATTRIBUTES, PERF_EVENT_IOC_PAUSE_OUTPUT, PERF_EVENT_IOC_PERIOD,
30    PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_REFRESH, PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_SET_BPF,
31    PERF_EVENT_IOC_SET_FILTER, PERF_EVENT_IOC_SET_OUTPUT, PERF_RECORD_MISC_KERNEL,
32    perf_event_sample_format_PERF_SAMPLE_CALLCHAIN, perf_event_sample_format_PERF_SAMPLE_ID,
33    perf_event_sample_format_PERF_SAMPLE_IDENTIFIER, perf_event_sample_format_PERF_SAMPLE_IP,
34    perf_event_sample_format_PERF_SAMPLE_PERIOD, perf_event_sample_format_PERF_SAMPLE_TID,
35    perf_event_type_PERF_RECORD_SAMPLE,
36};
37use starnix_uapi::errors::Errno;
38use starnix_uapi::open_flags::OpenFlags;
39use starnix_uapi::user_address::UserRef;
40use starnix_uapi::{
41    error, perf_event_attr, perf_event_header, perf_event_mmap_page__bindgen_ty_1,
42    perf_event_read_format_PERF_FORMAT_GROUP, perf_event_read_format_PERF_FORMAT_ID,
43    perf_event_read_format_PERF_FORMAT_LOST, perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED,
44    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING, tid_t, uapi,
45};
46
47use crate::security::{self, TargetTaskType};
48use crate::task::{Kernel, LockedAndTask};
49
// Monotonic counter presumably used to mint unique read-format IDs for event
// group leaders (see `PerfState::format_id_lookup_table`) — TODO confirm the
// allocation site, which is outside this chunk.
static READ_FORMAT_ID_GENERATOR: AtomicU64 = AtomicU64::new(0);
// Default buffer size to read from socket (for sampling data).
const DEFAULT_CHUNK_SIZE: usize = 4096;
// 4096 * 10, page size * 10.
// If tests flake due to running out of buffer space, or if the profiling duration is
// significantly increased, this buffer size may need further adjustment (expansion).
const ESTIMATED_MMAP_BUFFER_SIZE: u64 = 40960;
// perf_event_header field widths: 32 + 16 + 16 = 64 bits = 8 bytes total.
const PERF_EVENT_HEADER_SIZE: u16 = 8;
// FXT magic bytes (little endian).
const FXT_MAGIC_BYTES: [u8; 8] = [0x10, 0x00, 0x04, 0x46, 0x78, 0x54, 0x16, 0x00];
61
62mod event;
63pub use event::{TraceEvent, TraceEventQueue};
64
// Versioning header used as the header type parameter of the `SeqLock` held
// by `PerfEventFile`, written ahead of `PerfMetadataValue` in shared memory.
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataHeader {
    // Layout version of the metadata page.
    version: u32,
    // Oldest layout version a reader may treat this page as.
    compat_version: u32,
}
71
// Value half of the seqlock-protected metadata page shared with userspace.
// The field list follows the uapi `perf_event_mmap_page` struct (note the
// embedded `perf_event_mmap_page__bindgen_ty_1` union); `lock` doubles as the
// inline sequence counter (see the `SeqLockable` impl below).
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataValue {
    // Seqlock sequence counter (must stay the first field).
    lock: u32,
    index: u32,
    offset: i64,
    time_enabled: u64,
    time_running: u64,
    __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1,
    pmc_width: u16,
    time_shift: u16,
    time_mult: u32,
    time_offset: u64,
    time_zero: u64,
    size: u32,
    __reserved_1: u32,
    time_cycles: u64,
    time_mask: u64,
    __reserved: [u8; 928usize],
    // Ring-buffer head; set to one page in `get_memory()` once the metadata
    // page is considered written.
    data_head: u64,
    data_tail: u64,
    data_offset: u64,
    // Set in `get_memory()` to the mmap length minus the metadata page.
    data_size: u64,
    aux_head: u64,
    aux_tail: u64,
    aux_offset: u64,
    aux_size: u64,
}
100
// SAFETY: `PerfMetadataValue` can be safely written to shared memory in 8-byte chunks.
// This is because it is composed of two u32s followed by only u64s.
// The first u32 is the `lock` field, which is why HAS_INLINE_SEQUENCE is true.
// NOTE(review): the struct also contains u16/u32 fields (`pmc_width`,
// `time_shift`, `time_mult`, `size`, `__reserved_1`) packed into 8-byte
// groups — confirm the SAFETY justification above is meant to cover them.
unsafe impl SeqLockable for PerfMetadataValue {
    // Shared-memory writes happen in 8-byte units.
    const WRITE_SIZE: WriteSize = WriteSize::Eight;
    // The `lock` field at offset 0 is the sequence counter.
    const HAS_INLINE_SEQUENCE: bool = true;
    // Diagnostic name assigned to the backing VMO.
    const VMO_NAME: &'static [u8] = b"starnix:perf_event";
}
109
// Kernel-wide perf_event bookkeeping, lazily created and stored in the
// kernel's expando (see `get_perf_state`).
struct PerfState {
    // This table maps a group leader's file object id to its unique u64 "format ID".
    //
    // When a sample is generated for any event in a group, we use this
    // "format ID" from the group leader as the value for *both* the
    // `PERF_SAMPLE_ID` and `PERF_SAMPLE_IDENTIFIER` fields.
    format_id_lookup_table: Mutex<HashMap<FileObjectId, u64>>,
}
118
119impl Default for PerfState {
120    fn default() -> Self {
121        Self { format_id_lookup_table: Mutex::new(HashMap::new()) }
122    }
123}
124
// Returns the kernel-wide `PerfState`, creating it on first use via the
// kernel's expando so all perf event files share one format-ID table.
fn get_perf_state(kernel: &Arc<Kernel>) -> Arc<PerfState> {
    kernel.expando.get_or_init(PerfState::default)
}
128
// Compile-time check (per the uapi macro's name) that `perf_event_attr` has
// the same layout across supported architectures; the field list must match
// the uapi struct.
uapi::check_arch_independent_layout! {
    perf_event_attr {
        type_, // "type" is a reserved keyword so add a trailing underscore.
        size,
        config,
        __bindgen_anon_1,
        sample_type,
        read_format,
        _bitfield_1,
        __bindgen_anon_2,
        bp_type,
        __bindgen_anon_3,
        __bindgen_anon_4,
        branch_sample_type,
        sample_regs_user,
        sample_stack_user,
        clockid,
        sample_regs_intr,
        aux_watermark,
        sample_max_stack,
        __reserved_2,
        aux_sample_size,
        __reserved_3,
        sig_data,
        config3,
    }
}
156
// Commands forwarded over `ioctl_sender` to the sampling task in response to
// PERF_EVENT_IOC_ENABLE / PERF_EVENT_IOC_DISABLE (see `PerfEventFile::ioctl`).
#[derive(Clone, Copy, Debug, PartialEq)]
enum IoctlOp {
    Enable,
    Disable,
}
162
// Mutable per-event state, held behind the `RwLock` in `PerfEventFile`.
struct PerfEventFileState {
    // Attributes passed to perf_event_open(2) for this event.
    attr: perf_event_attr,
    rf_value: u64, // "count" for the config we passed in for the event.
    // The most recent timestamp (ns) where we changed into an enabled state
    // i.e. the most recent time we got an ENABLE ioctl().
    most_recent_enabled_time: u64,
    // Sum of all previous enablement segment durations (ns). If we are
    // currently in an enabled state, explicitly does NOT include the current
    // segment.
    total_time_running: u64,
    // Unique value emitted for PERF_FORMAT_ID reads (see `read()`).
    rf_id: u64,
    // Value written for PERF_SAMPLE_ID / PERF_SAMPLE_IDENTIFIER sample fields.
    sample_id: u64,
    // Presumably reserved for PERF_FORMAT_LOST reporting; unused here.
    _rf_lost: u64,
    // Nonzero means the event is currently disabled (0 = enabled); toggled by
    // the ENABLE/DISABLE ioctls.
    disabled: u64,
    // Bitmask of perf_event_sample_format flags selecting sample fields.
    sample_type: u64,
    // Handle to blob that stores all the perf data that a user may want.
    // At the moment it only stores some metadata and backtraces (bts).
    perf_data_vmo: zx::Vmo,
    // Remember to increment this offset as the number of pages increases.
    // Currently we just have a bound of 1 page_size of information.
    vmo_write_offset: u64,
    // Channel used to send IoctlOps to start/stop sampling.
    ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
}
187
188// Have an implementation for PerfEventFileState because VMO
189// doesn't have Default so we can't derive it.
190impl PerfEventFileState {
191    fn new(
192        attr: perf_event_attr,
193        rf_value: u64,
194        disabled: u64,
195        sample_type: u64,
196        perf_data_vmo: zx::Vmo,
197        vmo_write_offset: u64,
198        ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
199    ) -> PerfEventFileState {
200        PerfEventFileState {
201            attr,
202            rf_value,
203            most_recent_enabled_time: 0,
204            total_time_running: 0,
205            rf_id: 0,
206            sample_id: 0,
207            _rf_lost: 0,
208            disabled,
209            sample_type,
210            perf_data_vmo,
211            vmo_write_offset,
212            ioctl_sender,
213        }
214    }
215}
216
// A single perf event file descriptor's backing object.
pub struct PerfEventFile {
    // Presumably the tid argument passed to perf_event_open(2); currently
    // unused — TODO confirm against the open path.
    _tid: tid_t,
    // Presumably the cpu argument passed to perf_event_open(2); currently
    // unused — TODO confirm against the open path.
    _cpu: i32,
    // Mutable per-event state, guarded by a reader/writer lock.
    perf_event_file: RwLock<PerfEventFileState>,
    // The security state for this PerfEventFile.
    pub security_state: security::PerfEventState,
    // Pointer to the perf_event_mmap_page metadata's data_head.
    // TODO(https://fxbug.dev/460203776) Remove Arc after figuring out
    // "borrowed value does not live long enough" issue.
    _data_head_pointer: Arc<AtomicPtr<u64>>,
    // Seqlock-protected metadata page shared with userspace via mmap.
    seq_lock: SeqLock<PerfMetadataHeader, PerfMetadataValue>,
}
229
230// PerfEventFile object that implements FileOps.
231// See https://man7.org/linux/man-pages/man2/perf_event_open.2.html for
232// implementation details.
233// This object can be saved as a FileDescriptor.
impl FileOps for PerfEventFile {
    // Don't need to implement seek or sync for PerfEventFile.
    fileops_impl_nonseekable!();
    fileops_impl_noop_sync!();

    // Removes this event's format-ID mapping from the kernel-wide table so
    // the table does not accumulate entries for closed files.
    fn close(
        self: Box<Self>,
        _locked: &mut Locked<FileOpsCore>,
        file: &FileObjectState,
        current_task: &CurrentTask,
    ) {
        let perf_state = get_perf_state(&current_task.kernel);
        let mut events = perf_state.format_id_lookup_table.lock();
        events.remove(&file.id);
    }

    // See "Reading results" section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html.
    // Serializes the event "count" followed by any fields selected in
    // `attr.read_format`, in the order the checks below appear.
    fn read(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        _offset: usize,
        data: &mut dyn OutputBuffer,
    ) -> Result<usize, Errno> {
        // Create/calculate and return the ReadFormatData object.
        // If we create it earlier we might want to change it and it's immutable once created.
        let read_format_data = {
            // Once we get the `value` or count from kernel, we can change this to a read()
            // call instead of write().
            let mut perf_event_file = self.perf_event_file.write();

            security::check_perf_event_read_access(current_task, &self)?;

            let mut total_time_running_including_curr = perf_event_file.total_time_running;

            // Only update values if enabled (either by perf_event_attr or ioctl ENABLE call).
            if perf_event_file.disabled == 0 {
                // Calculate the value or "count" of the config we're interested in.
                // This value should reflect the value we are counting (defined in the config).
                // E.g. for PERF_COUNT_SW_CPU_CLOCK it would return the value from the CPU clock.
                // For now we just return rf_value + 1.
                track_stub!(
                    TODO("https://fxbug.dev/402938671"),
                    "[perf_event_open] implement read_format value"
                );
                perf_event_file.rf_value += 1;

                // Update time duration.
                let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
                total_time_running_including_curr +=
                    curr_time - perf_event_file.most_recent_enabled_time;
            }

            let mut output = Vec::<u8>::new();
            let value = perf_event_file.rf_value.to_ne_bytes();
            output.extend(value);

            let read_format = perf_event_file.attr.read_format;

            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 {
                // Total time (ns) event was enabled and running (currently same as TIME_RUNNING).
                output.extend(total_time_running_including_curr.to_ne_bytes());
            }
            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 {
                // Total time (ns) event was enabled and running (currently same as TIME_ENABLED).
                output.extend(total_time_running_including_curr.to_ne_bytes());
            }
            if (read_format & perf_event_read_format_PERF_FORMAT_ID as u64) != 0 {
                // Adds a 64-bit unique value that corresponds to the event group.
                output.extend(perf_event_file.rf_id.to_ne_bytes());
            }

            output
        };

        // The regular read() call allows the case where the bytes-we-want-to-read-in won't
        // fit in the output buffer. However, for perf_event_open's read(), "If you attempt to read
        // into a buffer that is not big enough to hold the data, the error ENOSPC results."
        if data.available() < read_format_data.len() {
            return error!(ENOSPC);
        }
        track_stub!(
            TODO("https://fxbug.dev/402453955"),
            "[perf_event_open] implement remaining error handling"
        );

        data.write(&read_format_data)
    }

    // Handles PERF_EVENT_IOC_* requests. ENABLE/DISABLE flip the `disabled`
    // flag, maintain the enabled-time accounting, and — for period-based
    // sampling events — notify the sampling task via `ioctl_sender`.
    // `_arg` is currently unused (PERF_IOC_FLAG_GROUP is stubbed).
    fn ioctl(
        &self,
        _locked: &mut Locked<Unlocked>,
        _file: &FileObject,
        current_task: &CurrentTask,
        op: u32,
        _arg: SyscallArg,
    ) -> Result<SyscallResult, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/405463320"),
            "[perf_event_open] implement PERF_IOC_FLAG_GROUP"
        );
        security::check_perf_event_write_access(current_task, &self)?;
        let mut perf_event_file = self.perf_event_file.write();
        match op {
            PERF_EVENT_IOC_ENABLE => {
                if perf_event_file.disabled != 0 {
                    perf_event_file.disabled = 0; // 0 = false.
                    perf_event_file.most_recent_enabled_time =
                        zx::MonotonicInstant::get().into_nanos() as u64;
                }

                // If we are sampling, invoke the profiler and collect a sample.
                // Currently this is an example sample collection.
                track_stub!(
                    TODO("https://fxbug.dev/398914921"),
                    "[perf_event_open] implement full sampling features"
                );
                // Only notify the sampler for period-based (not frequency-based)
                // sampling events.
                if perf_event_file.attr.freq() == 0
                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
                // This is always sound regardless of the union's tag.
                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
                {
                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Enable);
                }
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_DISABLE => {
                if perf_event_file.disabled == 0 {
                    perf_event_file.disabled = 1; // 1 = true.

                    // Update total_time_running now that the segment has ended.
                    let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
                    perf_event_file.total_time_running +=
                        curr_time - perf_event_file.most_recent_enabled_time;
                }
                if perf_event_file.attr.freq() == 0
                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
                // This is always sound regardless of the union's tag.
                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
                {
                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Disable);
                }
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_RESET => {
                perf_event_file.rf_value = 0;
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_REFRESH
            | PERF_EVENT_IOC_PERIOD
            | PERF_EVENT_IOC_SET_OUTPUT
            | PERF_EVENT_IOC_SET_FILTER
            | PERF_EVENT_IOC_ID
            | PERF_EVENT_IOC_SET_BPF
            | PERF_EVENT_IOC_PAUSE_OUTPUT
            | PERF_EVENT_IOC_MODIFY_ATTRIBUTES
            | PERF_EVENT_IOC_QUERY_BPF => {
                track_stub!(
                    TODO("https://fxbug.dev/404941053"),
                    "[perf_event_open] implement remaining ioctl() calls"
                );
                return error!(ENOSYS);
            }
            _ => error!(ENOTTY),
        }
    }

    // TODO(https://fxbug.dev/460245383) match behavior when mmap() is called multiple times.
    // Gets called when mmap() is called.
    // Immediately before sampling, this should get called by the user (e.g. the test
    // or Perfetto). We will then write the metadata to the VMO and return the pointer to it.
    fn get_memory(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        length: Option<usize>,
        _prot: ProtectionFlags,
    ) -> Result<Arc<MemoryObject>, Errno> {
        let buffer_size: u64 = length.unwrap_or(0) as u64;
        if buffer_size == 0 {
            return error!(EINVAL);
        }

        // Update metadata page now that we have new information.
        // Also update `data_head` to indicate that this first page has finished writing.
        // NOTE(review): a `buffer_size` smaller than one page would underflow
        // `data_size` below — confirm callers always mmap at least one page.
        let mut metadata_value: PerfMetadataValue = self.seq_lock.get();
        let page_size = zx::system_get_page_size() as u64;
        metadata_value.data_head = page_size;
        metadata_value.data_size = buffer_size - page_size;
        // Write directly to memory location (not MemoryObject).
        self.seq_lock.set_value(metadata_value);

        // Write to a MemoryObject and return it (expected return type for get_memory()).
        security::check_perf_event_read_access(current_task, &self)?;
        let perf_event_file = self.perf_event_file.read();
        match perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS) {
            Ok(vmo) => {
                let memory = MemoryObject::Vmo(vmo.into());
                return Ok(Arc::new(memory));
            }
            Err(_) => {
                track_stub!(
                    TODO("https://fxbug.dev/416323134"),
                    "[perf_event_open] handle get_memory() errors"
                );
                return error!(EINVAL);
            }
        };
    }

    // Writing to a perf event fd is not supported; always returns ENOSYS.
    fn write(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        _offset: usize,
        _data: &mut dyn InputBuffer,
    ) -> Result<usize, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/394960158"),
            "[perf_event_open] implement perf event functions"
        );
        error!(ENOSYS)
    }
}
461
462// Given a PerfRecordSample struct, write it via the correct output format
463// (per https://man7.org/linux/man-pages/man2/perf_event_open.2.html) to the VMO.
464// We don't currently support all the sample_types listed in the docs.
465// Input:
466//    PerfRecordSample { pid: 5, tid: 10, nr: 3, ips[nr]: [111, 222, 333] }
467// Human-understandable output:
468//    9 1 40 111 5 10 3 111 222 333
469// Actual output (no spaces or \n in real output, just making it more readable):
470//    0x0000 0x0009                 <-- starts at `offset` bytes
471//    0x0001
472//    0x0040
473//    0x0000 0x0000 0x0000 0x006F   <-- starts at `offset` + 8 bytes
474//    0x0000 0x0000 0x0000 0x0005
475//    0x0000 0x0000 0x0000 0x0010
476//    0x0000 0x0000 0x0000 0x0003
477//    0x0000 0x0000 0x0000 0x006F
478//    0x0000 0x0000 0x0000 0x00DE
479//    0x0000 0x0000 0x0000 0x014D
480//
481//    Returns the length of bytes written. In above case, 8 + 28 = 36.
482//    This information is used to increment the global offset.
483fn write_record_to_vmo(
484    perf_record_sample: PerfRecordSample,
485    perf_data_vmo: &zx::Vmo,
486    _data_head_pointer: &AtomicPtr<u64>,
487    sample_type: u64,
488    sample_id: u64,
489    sample_period: u64,
490    offset: u64,
491) -> u64 {
492    // Write header.
493    track_stub!(
494        TODO("https://fxbug.dev/432501467"),
495        "[perf_event_open] determines whether the record is KERNEL or USER"
496    );
497    let perf_event_header = perf_event_header {
498        type_: perf_event_type_PERF_RECORD_SAMPLE,
499        misc: PERF_RECORD_MISC_KERNEL as u16,
500        size: PERF_EVENT_HEADER_SIZE,
501    };
502
503    match perf_data_vmo.write(&perf_event_header.as_bytes(), offset) {
504        Ok(_) => (),
505        Err(e) => log_warn!("Failed to write perf_event_header: {}", e),
506    }
507
508    // Write sample.
509    let mut sample = Vec::<u8>::new();
510    // sample_id
511    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IDENTIFIER as u64) != 0 {
512        sample.extend(sample_id.to_ne_bytes());
513    }
514    // ip
515    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IP as u64) != 0 {
516        sample.extend(perf_record_sample.ips[0].to_ne_bytes());
517    }
518
519    if (sample_type & perf_event_sample_format_PERF_SAMPLE_TID as u64) != 0 {
520        // pid
521        sample.extend(perf_record_sample.pid.expect("missing pid").to_ne_bytes());
522        // tid
523        sample.extend(perf_record_sample.tid.expect("missing tid").to_ne_bytes());
524    }
525
526    // id
527    if (sample_type & perf_event_sample_format_PERF_SAMPLE_ID as u64) != 0 {
528        sample.extend(sample_id.to_ne_bytes());
529    }
530
531    // sample period
532    if (sample_type & perf_event_sample_format_PERF_SAMPLE_PERIOD as u64) != 0 {
533        sample.extend(sample_period.to_ne_bytes());
534    }
535
536    if (sample_type & perf_event_sample_format_PERF_SAMPLE_CALLCHAIN as u64) != 0 {
537        // nr
538        sample.extend(perf_record_sample.ips.len().to_ne_bytes());
539
540        // ips[nr] - list of ips, u64 per ip.
541        for i in perf_record_sample.ips {
542            sample.extend(i.to_ne_bytes());
543        }
544    }
545    // The remaining data are not defined for now.
546
547    match perf_data_vmo.write(&sample, offset + (std::mem::size_of::<perf_event_header>() as u64)) {
548        Ok(_) => {
549            let bytes_written: u64 =
550                (std::mem::size_of::<perf_event_header>() + sample.len()) as u64;
551
552            // TODO(http://fuchsia.dev/460203776) implement this better before enabling
553            // any setting of data_head value.
554            // Update data_head because we have now written to the VMO.
555            // Ordering::Release pushes update that this (and, transitively, the sample
556            // too) has updated.
557            // data_head_pointer.fetch_add(bytes_written, Ordering::Release);
558
559            // Return the total size we wrote (header + sample) so that we can
560            // increment offset counter.
561            return bytes_written;
562        }
563        Err(e) => {
564            log_warn!("Failed to write PerfRecordSample to VMO due to: {}", e);
565            // Failed to write. Don't increment offset counter.
566            return 0;
567        }
568    }
569}
570
// In-memory representation of one PERF_RECORD_SAMPLE before it is serialized
// into the VMO by `write_record_to_vmo`.
#[derive(Debug, Clone)]
struct PerfRecordSample {
    pid: Option<u32>,
    tid: Option<u32>,
    // Instruction pointers (currently this is the address). First one is `ip` param.
    ips: Vec<u64>,
}
578
579// Parses a backtrace (bt) to obtain the params for a PerfRecordSample. Example:
580//
581// 1234                     pid
582// 5555                     tid
583// {{{bt:0:0x1111:pc}}}    {{{bt:frame_number:address:type}}}
584// {{{bt:1:0x2222:ra}}}
585// {{{bt:2:0x3333:ra}}}
586//
587// Results in:
588// PerfRecordSample { pid: 1234, tid: 5555, nr: 3, ips: [0x1111, 0x2222, 0x3333] }
589
590fn parse_perf_record_sample_format(backtrace: &str) -> Option<PerfRecordSample> {
591    let mut pid: Option<u32> = None;
592    let mut tid: Option<u32> = None;
593    let mut ips: Vec<u64> = Vec::new();
594    let mut numbers_found = 0;
595    track_stub!(TODO("https://fxbug.dev/437171287"), "[perf_event_open] handle regex nuances");
596    let backtrace_regex =
597        Regex::new(r"^\s*\{\{\{bt:\d+:((0x[0-9a-fA-F]+)):(?:pc|ra)\}\}\}\s*$").unwrap();
598
599    for line in backtrace.lines() {
600        let trimmed_line = line.trim();
601        // Try to parse as a raw number (for PID/TID).
602        if numbers_found < 2 {
603            if let Ok(num) = trimmed_line.parse::<u32>() {
604                if numbers_found == 0 {
605                    pid = Some(num);
606                } else {
607                    tid = Some(num);
608                }
609                numbers_found += 1;
610                continue;
611            }
612        }
613
614        // Try to parse as a backtrace line.
615        if let Some(parsed_bt) = backtrace_regex.captures(trimmed_line) {
616            let address_str = parsed_bt.get(1).unwrap().as_str();
617            if let Ok(ip_addr) = u64::from_str_radix(address_str.trim_start_matches("0x"), 16) {
618                ips.push(ip_addr);
619            }
620        }
621    }
622
623    if pid == None || tid == None || ips.is_empty() {
624        // This data chunk might've been an {{{mmap}}} chunk, and not a {{{bt}}}.
625        log_info!("No ips while getting PerfRecordSample");
626        None
627    } else {
628        Some(PerfRecordSample { pid: pid, tid: tid, ips: ips })
629    }
630}
631
632async fn set_up_profiler(
633    sample_period: zx::MonotonicDuration,
634) -> Result<(profiler::SessionProxy, fidl::AsyncSocket), Errno> {
635    // Configuration for how we want to sample.
636    let sample = profiler::Sample {
637        callgraph: Some(profiler::CallgraphConfig {
638            strategy: Some(profiler::CallgraphStrategy::FramePointer),
639            ..Default::default()
640        }),
641        ..Default::default()
642    };
643
644    let sampling_config = profiler::SamplingConfig {
645        period: Some(sample_period.into_nanos() as u64),
646        timebase: Some(profiler::Counter::PlatformIndependent(profiler::CounterId::Nanoseconds)),
647        sample: Some(sample),
648        ..Default::default()
649    };
650
651    let tasks = vec![
652        // Should return ~300 samples for 100 millis.
653        profiler::Task::SystemWide(profiler::SystemWide {}),
654    ];
655    let targets = profiler::TargetConfig::Tasks(tasks);
656    let config = profiler::Config {
657        configs: Some(vec![sampling_config]),
658        target: Some(targets),
659        ..Default::default()
660    };
661    let (client, server) = fidl::Socket::create_stream();
662    let configure = profiler::SessionConfigureRequest {
663        output: Some(server),
664        config: Some(config),
665        ..Default::default()
666    };
667
668    let proxy = connect_to_protocol::<profiler::SessionMarker>()
669        .context("Error connecting to Profiler protocol");
670    let session_proxy: profiler::SessionProxy = match proxy {
671        Ok(p) => p.clone(),
672        Err(e) => return error!(EINVAL, e),
673    };
674
675    // Must configure before sampling start().
676    let config_request = session_proxy.configure(configure).await;
677    match config_request {
678        Ok(_) => Ok((session_proxy, fidl::AsyncSocket::from_socket(client))),
679        Err(e) => return error!(EINVAL, e),
680    }
681}
682
683// Collects samples and puts backtrace in VMO.
684// - Reads in the buffer from the socket for that duration in chunks.
685// - Parses the buffer backtraces into PERF_RECORD_SAMPLE format.
686// - Writes the PERF_RECORD_SAMPLE into VMO.
687async fn stop_and_collect_samples(
688    session_proxy: profiler::SessionProxy,
689    mut client: fidl::AsyncSocket,
690    perf_data_vmo: &zx::Vmo,
691    data_head_pointer: &AtomicPtr<u64>,
692    sample_type: u64,
693    sample_id: u64,
694    sample_period: u64,
695    vmo_write_offset: u64,
696) -> Result<(), Errno> {
697    let stats = session_proxy.stop().await;
698    let samples_collected = match stats {
699        Ok(stats) => stats.samples_collected.unwrap(),
700        Err(e) => return error!(EINVAL, e),
701    };
702
703    track_stub!(
704        TODO("https://fxbug.dev/422502681"),
705        "[perf_event_open] symbolize sample output and delete the below log_info"
706    );
707    log_info!("profiler samples_collected: {:?}", samples_collected);
708
709    // Peek at the first 8 bytes to determine if it's FXT or text.
710    let mut header = [0; 8];
711    let mut bytes_read = 0;
712    while bytes_read < 8 {
713        match client.read(&mut header[bytes_read..]).await {
714            Ok(0) => {
715                // Peer closed the socket. This is the normal end of the stream.
716                log_info!("[perf_event_open] Finished reading fxt record from socket.");
717                break;
718            }
719            Ok(n) => bytes_read += n,
720            Err(e) => {
721                log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
722                break;
723            }
724        }
725    }
726
727    if bytes_read > 0 {
728        if bytes_read == 8 && header == FXT_MAGIC_BYTES {
729            // FXT format.
730            let header_cursor = Cursor::new(header);
731            let reader = header_cursor.chain(client);
732            let (mut stream, _task) = SessionParser::new_async(reader);
733            while let Some(record_result) = stream.next().await {
734                match record_result {
735                    Ok(TraceRecord::Profiler(ProfilerRecord::Backtrace(backtrace))) => {
736                        let ips: Vec<u64> = backtrace.data;
737                        let pid = Some(backtrace.process.0 as u32);
738                        let tid = Some(backtrace.thread.0 as u32);
739                        let perf_record_sample = PerfRecordSample { pid, tid, ips };
740                        write_record_to_vmo(
741                            perf_record_sample,
742                            perf_data_vmo,
743                            data_head_pointer,
744                            sample_type,
745                            sample_id,
746                            sample_period,
747                            vmo_write_offset,
748                        );
749                    }
750                    Ok(_) => {
751                        // Ignore other records.
752                    }
753                    Err(e) => {
754                        log_warn!("[perf_event_open] Error parsing FXT: {:?}", e);
755                        break;
756                    }
757                }
758            }
759        } else {
760            // Text format.
761            // Read chunks of sampling data from socket in this buffer temporarily. We will parse
762            // the data and write it into the output VMO (the one mmap points to).
763            let mut buffer = vec![0; DEFAULT_CHUNK_SIZE];
764
765            loop {
766                // Attempt to read data. This awaits until data is available, EOF, or error.
767                // Ignore the first 8 bytes as it's the {{{reset}}} marker.
768                let socket_data = client.read(&mut buffer).await;
769
770                match socket_data {
771                    Ok(0) => {
772                        // Peer closed the socket. This is the normal end of the stream.
773                        log_info!("[perf_event_open] Finished reading from socket.");
774                        break;
775                    }
776                    Ok(bytes_read) => {
777                        // Receive data in format {{{...}}}.
778                        let received_data = match std::str::from_utf8(&buffer[..bytes_read]) {
779                            Ok(data) => data,
780                            Err(e) => return error!(EINVAL, e),
781                        };
782                        // Parse data to PerfRecordSample struct.
783                        if let Some(perf_record_sample) =
784                            parse_perf_record_sample_format(received_data)
785                        {
786                            write_record_to_vmo(
787                                perf_record_sample,
788                                perf_data_vmo,
789                                data_head_pointer,
790                                sample_type,
791                                sample_id,
792                                sample_period,
793                                vmo_write_offset,
794                            );
795                        }
796                    }
797                    Err(e) => {
798                        log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
799                        break;
800                    }
801                }
802            }
803        }
804    }
805
806    let reset_status = session_proxy.reset().await;
807    return match reset_status {
808        Ok(_) => Ok(()),
809        Err(e) => error!(EINVAL, e),
810    };
811}
812
813// Notifies other thread that we should start/stop sampling.
814// Once sampling is complete, that profiler session is no longer needed.
815// At that point, send back notification so that this is no longer blocking
816// (e.g. so that other profiler sessions can start).
817fn ping_receiver(
818    mut ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
819    command: IoctlOp,
820) {
821    log_info!("[perf_event_open] Received sampling command: {:?}", command);
822    let (profiling_complete_sender, profiling_complete_receiver) = sync_mpsc::channel::<()>();
823    match ioctl_sender.try_send((command, profiling_complete_sender)) {
824        Ok(_) => (),
825        Err(e) => {
826            if e.is_full() {
827                log_warn!("[perf_event_open] Failed to send {:?}: Channel full", command);
828            } else if e.is_disconnected() {
829                log_warn!("[perf_event_open] Failed to send {:?}: Receiver disconnected", command);
830            } else {
831                log_warn!("[perf_event_open] Failed to send {:?} due to {:?}", command, e.source());
832            }
833        }
834    };
835    // Block on / wait until profiling is complete before returning.
836    // This notifies that the profiler is free to be used for another session.
837    let _ = profiling_complete_receiver.recv().unwrap();
838}
839
840// Creates a seq lock for the given VMO. Initializes the seq lock with
841// known initial values (unknown values default to 0).
842// Does NOT actually save this as a memory object until mmap() is called.
843//
844// # Safety
845//
846// The caller must ensure that the kernel maintains exclusive write access to this VMO and
847// there are only atomic accesses to this memory (see seq_lock lib.rs for details).
848unsafe fn create_seq_lock(
849    vmo_handle_ref: &zx::NullableHandle,
850) -> SeqLock<PerfMetadataHeader, PerfMetadataValue> {
851    // Currently we hardcode everything just to get something E2E working.
852    let metadata_header = PerfMetadataHeader { version: 1, compat_version: 2 };
853    let metadata_value = PerfMetadataValue {
854        lock: 0,
855        index: 3,
856        offset: 19337,
857        time_enabled: 0,
858        time_running: 0,
859        __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1 { capabilities: 30 },
860        pmc_width: 0,
861        time_shift: 0,
862        time_mult: 0,
863        time_offset: 0,
864        time_zero: 0,
865        size: 0,
866        __reserved_1: 0,
867        time_cycles: 0,
868        time_mask: 0,
869        __reserved: [0; 928usize],
870        data_head: 0,
871        // Start reading from 0; it is the user's responsibility to increment on their end.
872        data_tail: 0,
873        // We know the data will start after 1 page size so we can set this now.
874        data_offset: zx::system_get_page_size() as u64,
875        // We can only calculate this value when mmap() is called. Initialize to 0.
876        data_size: 0,
877        aux_head: 0,
878        aux_tail: 0,
879        aux_offset: 0,
880        aux_size: 0,
881    };
882    let vmo = zx::Vmo::from(vmo_handle_ref.duplicate_handle(zx::Rights::SAME_RIGHTS).unwrap());
883
884    // Create a SeqLock and safely initialize the `header` and `value` for it.
885    // SeqLock is formatted thusly:
886    //   header_struct : any size, params `version` and `compat_version` should not change
887    //   sequence_counter : u32, this is the lock and should increment
888    //   value_struct : any size, each param can change
889    //
890    // SAFETY: See safety requirements on `create_seq_lock`.
891    unsafe {
892        SeqLock::new_from_vmo(metadata_header, metadata_value, vmo)
893            .expect("failed to create seq_lock for perf metadata")
894    }
895}
896
897pub fn sys_perf_event_open(
898    locked: &mut Locked<Unlocked>,
899    current_task: &CurrentTask,
900    attr: UserRef<perf_event_attr>,
901    // Note that this is pid in Linux docs.
902    tid: tid_t,
903    cpu: i32,
904    group_fd: FdNumber,
905    _flags: u64,
906) -> Result<SyscallResult, Errno> {
907    // So far, the implementation only sets the read_data_format according to the "Reading results"
908    // section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html for a single event.
909    // Other features will be added in the future (see below track_stubs).
910    let perf_event_attrs: perf_event_attr = current_task.read_object(attr)?;
911
912    if tid == -1 && cpu == -1 {
913        return error!(EINVAL);
914    }
915
916    let target_task_type = match tid {
917        -1 => TargetTaskType::AllTasks,
918        0 => TargetTaskType::CurrentTask,
919        _ => {
920            track_stub!(TODO("https://fxbug.dev/409621963"), "[perf_event_open] implement tid > 0");
921            return error!(ENOSYS);
922        }
923    };
924    security::check_perf_event_open_access(
925        current_task,
926        target_task_type,
927        &perf_event_attrs,
928        perf_event_attrs.type_.try_into()?,
929    )?;
930
931    // Channel used to send info between notifier and spawned task thread.
932    // We somewhat arbitrarily picked 8 for now in case we get a bunch of ioctls that are in
933    // quick succession (instead of something lower).
934    let (sender, mut receiver) = future_mpsc::channel(8);
935
936    let page_size = zx::system_get_page_size() as u64;
937    let mut perf_event_file = PerfEventFileState::new(
938        perf_event_attrs,
939        0,
940        perf_event_attrs.disabled(),
941        perf_event_attrs.sample_type,
942        zx::Vmo::create(ESTIMATED_MMAP_BUFFER_SIZE).unwrap(),
943        page_size, // Start with this amount of offset, we can increment as we write.
944        sender,
945    );
946
947    let read_format = perf_event_attrs.read_format;
948
949    if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0
950        || (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0
951    {
952        // Only keep track of most_recent_enabled_time if we are currently in ENABLED state,
953        // as otherwise this param shouldn't be used for calculating anything.
954        if perf_event_file.disabled == 0 {
955            perf_event_file.most_recent_enabled_time =
956                zx::MonotonicInstant::get().into_nanos() as u64;
957        }
958        // Initialize this to 0 as we will need to return a time duration later during read().
959        perf_event_file.total_time_running = 0;
960    }
961
962    let event_id = READ_FORMAT_ID_GENERATOR.fetch_add(1, Ordering::Relaxed);
963    perf_event_file.rf_id = event_id;
964
965    if group_fd.raw() == -1 {
966        perf_event_file.sample_id = event_id;
967    } else {
968        let group_file = current_task.get_file(group_fd)?;
969        let group_file_object_id = group_file.id;
970        let perf_state = get_perf_state(&current_task.kernel);
971        let events = perf_state.format_id_lookup_table.lock();
972        if let Some(rf_id) = events.get(&group_file_object_id) {
973            perf_event_file.sample_id = *rf_id;
974        } else {
975            return error!(EINVAL);
976        }
977    }
978
979    if (read_format & perf_event_read_format_PERF_FORMAT_GROUP as u64) != 0 {
980        track_stub!(
981            TODO("https://fxbug.dev/402238049"),
982            "[perf_event_open] implement read_format group"
983        );
984        return error!(ENOSYS);
985    }
986    if (read_format & perf_event_read_format_PERF_FORMAT_LOST as u64) != 0 {
987        track_stub!(
988            TODO("https://fxbug.dev/402260383"),
989            "[perf_event_open] implement read_format lost"
990        );
991    }
992
993    // Set up notifier for handling ioctl calls to enable/disable sampling.
994    let mut vmo_handle_copy =
995        perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS);
996
997    // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
998    // This is always sound regardless of the union's tag.
999    let sample_period_in_ticks = unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period };
1000    // The sample period from the PERF_COUNT_SW_CPU_CLOCK is
1001    // 1 nanosecond per tick. Convert this duration into zx::duration.
1002    let zx_sample_period = zx::MonotonicDuration::from_nanos(sample_period_in_ticks as i64);
1003
1004    let data_head_pointer = Arc::new(AtomicPtr::new(std::ptr::null_mut::<u64>()));
1005    // Pass cloned into the thread.
1006    let cloned_data_head_pointer = Arc::clone(&data_head_pointer);
1007
1008    // SAFETY: This is safe because the kernel maintains exclusive write access to this VMO and
1009    // there are only atomic accesses to this memory (see seq_lock lib.rs for details).
1010    let seq_lock = unsafe { create_seq_lock(vmo_handle_copy.as_ref().unwrap()) };
1011
1012    let closure = async move |_: LockedAndTask<'_>| {
1013        let mut profiler_state: Option<(profiler::SessionProxy, fidl::AsyncSocket)> = None;
1014
1015        // This loop will wait for messages from the sender.
1016        while let Some((command, profiling_complete_receiver)) = receiver.next().await {
1017            match command {
1018                IoctlOp::Enable => {
1019                    match set_up_profiler(zx_sample_period).await {
1020                        Ok((session_proxy, client)) => {
1021                            let start_request = profiler::SessionStartRequest {
1022                                buffer_results: Some(true),
1023                                buffer_size_mb: Some(8 as u64),
1024                                ..Default::default()
1025                            };
1026                            if let Err(e) = session_proxy.start(&start_request).await {
1027                                log_warn!("Failed to start profiling: {}", e);
1028                            } else {
1029                                profiler_state = Some((session_proxy, client));
1030                            }
1031                        }
1032                        Err(e) => {
1033                            log_warn!("Failed to profile: {}", e);
1034                        }
1035                    };
1036                    // Send notification anyway to unblock the ioctl caller.
1037                    let _ = profiling_complete_receiver.send(());
1038                }
1039                IoctlOp::Disable => {
1040                    if let Some((session_proxy, client)) = profiler_state.take() {
1041                        let handle = vmo_handle_copy
1042                            .as_mut()
1043                            .expect("Failed to get VMO handle")
1044                            .as_handle_ref()
1045                            .duplicate(zx::Rights::SAME_RIGHTS)
1046                            .unwrap();
1047
1048                        if let Err(e) = stop_and_collect_samples(
1049                            session_proxy,
1050                            client,
1051                            &zx::Vmo::from(handle),
1052                            &*cloned_data_head_pointer,
1053                            perf_event_file.sample_type,
1054                            perf_event_file.sample_id,
1055                            sample_period_in_ticks,
1056                            perf_event_file.vmo_write_offset,
1057                        )
1058                        .await
1059                        {
1060                            log_warn!("Failed to collect sample: {:?}", e);
1061                        }
1062                    }
1063                    // Send notification anyway to unblock the ioctl caller.
1064                    let _ = profiling_complete_receiver.send(());
1065                }
1066            }
1067        }
1068        ()
1069    };
1070    let req = SpawnRequestBuilder::new()
1071        .with_debug_name("perf-event-sampler")
1072        .with_async_closure(closure)
1073        .build();
1074    current_task.kernel().kthreads.spawner().spawn_from_request(req);
1075
1076    let file = Box::new(PerfEventFile {
1077        _tid: tid,
1078        _cpu: cpu,
1079        perf_event_file: RwLock::new(perf_event_file),
1080        security_state: security::perf_event_alloc(current_task),
1081        _data_head_pointer: data_head_pointer,
1082        seq_lock: seq_lock,
1083    });
1084    // TODO: https://fxbug.dev/404739824 - Confirm whether to handle this as a "private" node.
1085    let file_handle =
1086        Anon::new_private_file(locked, current_task, file, OpenFlags::RDWR, "[perf_event]");
1087    let file_object_id = file_handle.id;
1088    let file_descriptor: Result<FdNumber, Errno> =
1089        current_task.add_file(locked, file_handle, FdFlags::empty());
1090
1091    match file_descriptor {
1092        Ok(fd) => {
1093            if group_fd.raw() == -1 {
1094                let perf_state = get_perf_state(&current_task.kernel);
1095                let mut events = perf_state.format_id_lookup_table.lock();
1096                events.insert(file_object_id, event_id);
1097            }
1098            Ok(fd.into())
1099        }
1100        Err(_) => {
1101            track_stub!(
1102                TODO("https://fxbug.dev/402453955"),
1103                "[perf_event_open] implement remaining error handling"
1104            );
1105            error!(EMFILE)
1106        }
1107    }
1108}
// Syscalls for arch32 usage.
// On aarch64 kernels, 32-bit (arch32) tasks reuse the native implementation:
// re-export `sys_perf_event_open` under the `sys_arch32_*` naming convention.
#[cfg(target_arch = "aarch64")]
mod arch32 {
    pub use super::sys_perf_event_open as sys_arch32_perf_event_open;
}

#[cfg(target_arch = "aarch64")]
pub use arch32::*;
1117
1118use crate::mm::memory::MemoryObject;
1119use crate::mm::{MemoryAccessorExt, ProtectionFlags};
1120use crate::task::CurrentTask;
1121use crate::vfs::{
1122    Anon, FdFlags, FdNumber, FileObject, FileObjectId, FileObjectState, FileOps, InputBuffer,
1123    OutputBuffer,
1124};
1125use crate::{fileops_impl_nonseekable, fileops_impl_noop_sync};