starnix_core/perf/
mod.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::task::dynamic_thread_spawner::SpawnRequestBuilder;
6use anyhow::Context;
7use fuchsia_component::client::connect_to_protocol;
8use futures::StreamExt;
9use futures::channel::mpsc as future_mpsc;
10use regex::Regex;
11use std::collections::HashMap;
12use std::error::Error;
13use std::sync::atomic::{AtomicPtr, AtomicU64, Ordering};
14use std::sync::{Arc, mpsc as sync_mpsc};
15use std::time::Duration;
16use zerocopy::{Immutable, IntoBytes};
17use {fidl_fuchsia_cpu_profiler as profiler, fuchsia_async};
18
19use futures::io::{AsyncReadExt, Cursor};
20use fxt::TraceRecord;
21use fxt::profiler::ProfilerRecord;
22use fxt::session::SessionParser;
23use seq_lock::SeqLock;
24use starnix_logging::{log_info, log_warn, track_stub};
25use starnix_sync::{FileOpsCore, Locked, Mutex, RwLock, Unlocked};
26use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
27use starnix_uapi::arch32::{
28    PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE, PERF_EVENT_IOC_ID,
29    PERF_EVENT_IOC_MODIFY_ATTRIBUTES, PERF_EVENT_IOC_PAUSE_OUTPUT, PERF_EVENT_IOC_PERIOD,
30    PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_REFRESH, PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_SET_BPF,
31    PERF_EVENT_IOC_SET_FILTER, PERF_EVENT_IOC_SET_OUTPUT, PERF_RECORD_MISC_KERNEL,
32    perf_event_sample_format_PERF_SAMPLE_CALLCHAIN, perf_event_sample_format_PERF_SAMPLE_ID,
33    perf_event_sample_format_PERF_SAMPLE_IDENTIFIER, perf_event_sample_format_PERF_SAMPLE_IP,
34    perf_event_sample_format_PERF_SAMPLE_PERIOD, perf_event_sample_format_PERF_SAMPLE_TID,
35    perf_event_type_PERF_RECORD_SAMPLE,
36};
37use starnix_uapi::errors::Errno;
38use starnix_uapi::open_flags::OpenFlags;
39use starnix_uapi::user_address::UserRef;
40use starnix_uapi::{
41    error, perf_event_attr, perf_event_header, perf_event_mmap_page__bindgen_ty_1,
42    perf_event_read_format_PERF_FORMAT_GROUP, perf_event_read_format_PERF_FORMAT_ID,
43    perf_event_read_format_PERF_FORMAT_LOST, perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED,
44    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING, tid_t, uapi,
45};
46
47use crate::security::{self, TargetTaskType};
48use crate::task::{Kernel, LockedAndTask};
49
// Process-wide counter starting at 0.
// NOTE(review): presumably the source of the unique IDs reported for PERF_FORMAT_ID; its
// users are outside this section — confirm.
static READ_FORMAT_ID_GENERATOR: AtomicU64 = AtomicU64::new(0);
// Default buffer size to read from socket (for sampling data).
const DEFAULT_CHUNK_SIZE: usize = 4096;
const ESTIMATED_MMAP_BUFFER_SIZE: u64 = 40960; // 4096 * 10, page size * 10.
// perf_event_header struct size: 32 + 16 + 16 bits = 64 bits = 8 bytes.
const PERF_EVENT_HEADER_SIZE: u16 = 8;
// FXT magic bytes (little endian). Compared against the first 8 bytes read from the
// profiler socket to tell FXT output apart from text output.
const FXT_MAGIC_BYTES: [u8; 8] = [0x10, 0x00, 0x04, 0x46, 0x78, 0x54, 0x16, 0x00];
58
59mod event;
60pub use event::{TraceEvent, TraceEventQueue};
61
62#[repr(C)]
63#[derive(Copy, Clone, IntoBytes, Immutable)]
64struct PerfMetadataHeader {
65    version: u32,
66    compat_version: u32,
67}
68
69#[repr(C, packed)]
70#[derive(Copy, Clone, IntoBytes, Immutable)]
71struct PerfMetadataValue {
72    index: u32,
73    offset: i64,
74    time_enabled: u64,
75    time_running: u64,
76    __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1,
77    pmc_width: u16,
78    time_shift: u16,
79    time_mult: u32,
80    time_offset: u64,
81    time_zero: u64,
82    size: u32,
83    __reserved_1: u32,
84    time_cycles: u64,
85    time_mask: u64,
86    __reserved: [u8; 928usize],
87    data_head: u64,
88    data_tail: u64,
89    data_offset: u64,
90    data_size: u64,
91    aux_head: u64,
92    aux_tail: u64,
93    aux_offset: u64,
94    aux_size: u64,
95}
96
// Kernel-wide perf bookkeeping, stored in the kernel's `expando` (see `get_perf_state`).
struct PerfState {
    // This table maps a group leader's file object id to its unique u64 "format ID".
    //
    // When a sample is generated for any event in a group, we use this
    // "format ID" from the group leader as the value for *both* the
    // `PERF_SAMPLE_ID` and `PERF_SAMPLE_IDENTIFIER` fields.
    //
    // Entries are removed when the corresponding file is closed.
    format_id_lookup_table: Mutex<HashMap<FileObjectId, u64>>,
}
105
106impl Default for PerfState {
107    fn default() -> Self {
108        Self { format_id_lookup_table: Mutex::new(HashMap::new()) }
109    }
110}
111
112fn get_perf_state(kernel: &Arc<Kernel>) -> Arc<PerfState> {
113    kernel.expando.get_or_init(PerfState::default)
114}
115
// Runs `uapi::check_arch_independent_layout!` over the listed `perf_event_attr` fields.
// NOTE(review): presumably a compile-time check that the struct layout is identical
// across the supported architectures — confirm against the macro definition.
uapi::check_arch_independent_layout! {
    perf_event_attr {
        type_, // "type" is a reserved keyword so add a trailing underscore.
        size,
        config,
        __bindgen_anon_1,
        sample_type,
        read_format,
        _bitfield_1,
        __bindgen_anon_2,
        bp_type,
        __bindgen_anon_3,
        __bindgen_anon_4,
        branch_sample_type,
        sample_regs_user,
        sample_stack_user,
        clockid,
        sample_regs_intr,
        aux_watermark,
        sample_max_stack,
        __reserved_2,
        aux_sample_size,
        __reserved_3,
        sig_data,
        config3,
    }
}
143
// Operations forwarded from `ioctl()` over the `ioctl_sender` channel to whatever is
// listening on the receiving end. Only `Enable` exists for now.
#[derive(Debug, Clone, Copy, PartialEq)]
enum IoctlOp {
    // Sent when PERF_EVENT_IOC_ENABLE arrives for an event with a non-zero sample period.
    Enable,
}
148
// Mutable per-event state, guarded by the `RwLock` in `PerfEventFile`.
struct PerfEventFileState {
    // The attributes the event was opened with.
    attr: perf_event_attr,
    rf_value: u64, // "count" for the config we passed in for the event.
    // The most recent timestamp (ns) where we changed into an enabled state
    // i.e. the most recent time we got an ENABLE ioctl().
    most_recent_enabled_time: u64,
    // Sum of all previous enablement segment durations (ns). If we are
    // currently in an enabled state, explicitly does NOT include the current
    // segment.
    total_time_running: u64,
    // Value reported by read() when PERF_FORMAT_ID is set in `attr.read_format`.
    rf_id: u64,
    // NOTE(review): presumably the ID written into samples; it is not read in this
    // section — confirm.
    sample_id: u64,
    _rf_lost: u64,
    // 0 means the event is enabled; any non-zero value means it is disabled.
    disabled: u64,
    sample_type: u64,
    // Handle to blob that stores all the perf data that a user may want.
    // At the moment it only stores some metadata and backtraces (bts).
    perf_data_vmo: zx::Vmo,
    // Remember to increment this offset as the number of pages increases.
    // Currently we just have a bound of 1 page_size of information.
    vmo_write_offset: u64,
    // Channel used to send IoctlOps to start/stop sampling.
    ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
}
173
174// Have an implementation for PerfEventFileState because VMO
175// doesn't have Default so we can't derive it.
176impl PerfEventFileState {
177    fn new(
178        attr: perf_event_attr,
179        rf_value: u64,
180        disabled: u64,
181        sample_type: u64,
182        perf_data_vmo: zx::Vmo,
183        vmo_write_offset: u64,
184        ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
185    ) -> PerfEventFileState {
186        PerfEventFileState {
187            attr,
188            rf_value,
189            most_recent_enabled_time: 0,
190            total_time_running: 0,
191            rf_id: 0,
192            sample_id: 0,
193            _rf_lost: 0,
194            disabled,
195            sample_type,
196            perf_data_vmo,
197            vmo_write_offset,
198            ioctl_sender,
199        }
200    }
201}
202
// File object backing a perf event file descriptor. Its `FileOps` implementation
// provides read(), ioctl() and mmap() support.
pub struct PerfEventFile {
    _tid: tid_t,
    _cpu: i32,
    // Mutable event state (counters, enablement, VMO handle).
    perf_event_file: RwLock<PerfEventFileState>,
    // The security state for this PerfEventFile.
    pub security_state: security::PerfEventState,
    // Pointer to the perf_event_mmap_page metadata's data_head.
    // Stored by get_memory() once the metadata page has been written.
    // TODO(https://fxbug.dev/460203776) Remove Arc after figuring out
    // "borrowed value does not live long enough" issue.
    data_head_pointer: Arc<AtomicPtr<u64>>,
}
214
215// PerfEventFile object that implements FileOps.
216// See https://man7.org/linux/man-pages/man2/perf_event_open.2.html for
217// implementation details.
218// This object can be saved as a FileDescriptor.
219impl FileOps for PerfEventFile {
220    // Don't need to implement seek or sync for PerfEventFile.
221    fileops_impl_nonseekable!();
222    fileops_impl_noop_sync!();
223
224    fn close(
225        self: Box<Self>,
226        _locked: &mut Locked<FileOpsCore>,
227        file: &FileObjectState,
228        current_task: &CurrentTask,
229    ) {
230        let perf_state = get_perf_state(&current_task.kernel);
231        let mut events = perf_state.format_id_lookup_table.lock();
232        events.remove(&file.id);
233    }
234
235    // See "Reading results" section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html.
236    fn read(
237        &self,
238        _locked: &mut Locked<FileOpsCore>,
239        _file: &FileObject,
240        current_task: &CurrentTask,
241        _offset: usize,
242        data: &mut dyn OutputBuffer,
243    ) -> Result<usize, Errno> {
244        // Create/calculate and return the ReadFormatData object.
245        // If we create it earlier we might want to change it and it's immutable once created.
246        let read_format_data = {
247            // Once we get the `value` or count from kernel, we can change this to a read()
248            // call instead of write().
249            let mut perf_event_file = self.perf_event_file.write();
250
251            security::check_perf_event_read_access(current_task, &self)?;
252
253            let mut total_time_running_including_curr = perf_event_file.total_time_running;
254
255            // Only update values if enabled (either by perf_event_attr or ioctl ENABLE call).
256            if perf_event_file.disabled == 0 {
257                // Calculate the value or "count" of the config we're interested in.
258                // This value should reflect the value we are counting (defined in the config).
259                // E.g. for PERF_COUNT_SW_CPU_CLOCK it would return the value from the CPU clock.
260                // For now we just return rf_value + 1.
261                track_stub!(
262                    TODO("https://fxbug.dev/402938671"),
263                    "[perf_event_open] implement read_format value"
264                );
265                perf_event_file.rf_value += 1;
266
267                // Update time duration.
268                let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
269                total_time_running_including_curr +=
270                    curr_time - perf_event_file.most_recent_enabled_time;
271            }
272
273            let mut output = Vec::<u8>::new();
274            let value = perf_event_file.rf_value.to_ne_bytes();
275            output.extend(value);
276
277            let read_format = perf_event_file.attr.read_format;
278
279            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 {
280                // Total time (ns) event was enabled and running (currently same as TIME_RUNNING).
281                output.extend(total_time_running_including_curr.to_ne_bytes());
282            }
283            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 {
284                // Total time (ns) event was enabled and running (currently same as TIME_ENABLED).
285                output.extend(total_time_running_including_curr.to_ne_bytes());
286            }
287            if (read_format & perf_event_read_format_PERF_FORMAT_ID as u64) != 0 {
288                // Adds a 64-bit unique value that corresponds to the event group.
289                output.extend(perf_event_file.rf_id.to_ne_bytes());
290            }
291
292            output
293        };
294
295        // The regular read() call allows the case where the bytes-we-want-to-read-in won't
296        // fit in the output buffer. However, for perf_event_open's read(), "If you attempt to read
297        // into a buffer that is not big enough to hold the data, the error ENOSPC results."
298        if data.available() < read_format_data.len() {
299            return error!(ENOSPC);
300        }
301        track_stub!(
302            TODO("https://fxbug.dev/402453955"),
303            "[perf_event_open] implement remaining error handling"
304        );
305
306        data.write(&read_format_data)
307    }
308
309    fn ioctl(
310        &self,
311        _locked: &mut Locked<Unlocked>,
312        _file: &FileObject,
313        current_task: &CurrentTask,
314        op: u32,
315        _arg: SyscallArg,
316    ) -> Result<SyscallResult, Errno> {
317        track_stub!(
318            TODO("https://fxbug.dev/405463320"),
319            "[perf_event_open] implement PERF_IOC_FLAG_GROUP"
320        );
321        security::check_perf_event_write_access(current_task, &self)?;
322        let mut perf_event_file = self.perf_event_file.write();
323        match op {
324            PERF_EVENT_IOC_ENABLE => {
325                if perf_event_file.disabled != 0 {
326                    perf_event_file.disabled = 0; // 0 = false.
327                    perf_event_file.most_recent_enabled_time =
328                        zx::MonotonicInstant::get().into_nanos() as u64;
329                }
330
331                // If we are sampling, invoke the profiler and collect a sample.
332                // Currently this is an example sample collection.
333                track_stub!(
334                    TODO("https://fxbug.dev/398914921"),
335                    "[perf_event_open] implement full sampling features"
336                );
337                if perf_event_file.attr.freq() == 0
338                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
339                // This is always sound regardless of the union's tag.
340                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
341                {
342                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Enable);
343                }
344                return Ok(SUCCESS);
345            }
346            PERF_EVENT_IOC_DISABLE => {
347                if perf_event_file.disabled == 0 {
348                    perf_event_file.disabled = 1; // 1 = true.
349
350                    // Update total_time_running now that the segment has ended.
351                    let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
352                    perf_event_file.total_time_running +=
353                        curr_time - perf_event_file.most_recent_enabled_time;
354                }
355                track_stub!(
356                    TODO("https://fxbug.dev/422502681"),
357                    "[perf_event_open] implement Disable to not hardcode profiling"
358                );
359                return Ok(SUCCESS);
360            }
361            PERF_EVENT_IOC_RESET => {
362                perf_event_file.rf_value = 0;
363                return Ok(SUCCESS);
364            }
365            PERF_EVENT_IOC_REFRESH
366            | PERF_EVENT_IOC_PERIOD
367            | PERF_EVENT_IOC_SET_OUTPUT
368            | PERF_EVENT_IOC_SET_FILTER
369            | PERF_EVENT_IOC_ID
370            | PERF_EVENT_IOC_SET_BPF
371            | PERF_EVENT_IOC_PAUSE_OUTPUT
372            | PERF_EVENT_IOC_MODIFY_ATTRIBUTES
373            | PERF_EVENT_IOC_QUERY_BPF => {
374                track_stub!(
375                    TODO("https://fxbug.dev/404941053"),
376                    "[perf_event_open] implement remaining ioctl() calls"
377                );
378                return error!(ENOSYS);
379            }
380            _ => error!(ENOTTY),
381        }
382    }
383
384    // TODO(https://fxbug.dev/460245383) match behavior when mmap() is called multiple times.
385    // Gets called when mmap() is called.
386    // Immediately before sampling, this should get called by the user (e.g. the test
387    // or Perfetto). We will then write the metadata to the VMO and return the pointer to it.
388    fn get_memory(
389        &self,
390        _locked: &mut Locked<FileOpsCore>,
391        _file: &FileObject,
392        current_task: &CurrentTask,
393        length: Option<usize>,
394        _prot: ProtectionFlags,
395    ) -> Result<Arc<MemoryObject>, Errno> {
396        let buffer_size: u64 = length.unwrap_or(0) as u64;
397        if buffer_size == 0 {
398            return error!(EINVAL);
399        }
400        let page_size = zx::system_get_page_size() as u64;
401
402        security::check_perf_event_read_access(current_task, &self)?;
403
404        // TODO(https://fxbug.dev/460246292) confirm when to create metadata.
405        // Create metadata structs. Currently we hardcode everything just to get
406        // something E2E working.
407        let metadata_header = PerfMetadataHeader { version: 1, compat_version: 2 };
408        let metadata_value = PerfMetadataValue {
409            index: 2,
410            offset: 19337,
411            time_enabled: 0,
412            time_running: 0,
413            __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1 { capabilities: 30 },
414            pmc_width: 0,
415            time_shift: 0,
416            time_mult: 0,
417            time_offset: 0,
418            time_zero: 0,
419            size: 0,
420            __reserved_1: 0,
421            time_cycles: 0,
422            time_mask: 0,
423            __reserved: [0; 928usize],
424            data_head: page_size,
425            // Start reading from 0; it is the user's responsibility to increment on their end.
426            data_tail: 0,
427            data_offset: page_size,
428            data_size: (buffer_size - page_size) as u64,
429            aux_head: 0,
430            aux_tail: 0,
431            aux_offset: 0,
432            aux_size: 0,
433        };
434
435        // Then, wrap metadata in a SeqLock so that user can be made aware of updates.
436        // SeqLock is formatted thusly:
437        //   header_struct : any size, values should not change
438        //   sequence_counter : u32
439        //   value_struct : any size, needs locking because each value can change
440        // We split our perf_event_mmap_page accordingly. The `version` and `compat_version`
441        // should not change while the params below the `lock` may change.
442        // Sequence counter for `lock` param gets inserted between these via
443        // the `SeqLock` implementation.
444        let perf_event_file = self.perf_event_file.read();
445        // VMO does not implement Copy trait. We duplicate the VMO handle
446        // so that we can pass it to the SeqLock and the MemoryObject.
447        let vmo_handle_copy = match perf_event_file
448            .perf_data_vmo
449            .as_handle_ref()
450            .duplicate(zx::Rights::SAME_RIGHTS)
451        {
452            Ok(h) => h,
453            Err(_) => return error!(EINVAL),
454        };
455
456        // SAFETY: This is ok right now because we are the only reference to this memory.
457        // Once there are multiple references we should update this comment to confirm that
458        // there are only atomic accesses to this memory (see seq_lock lib.rs for details).
459        let mut seq_lock = match unsafe {
460            SeqLock::new_from_vmo(metadata_header, metadata_value, vmo_handle_copy.into())
461        } {
462            Ok(s) => s,
463            Err(_) => return error!(EINVAL),
464        };
465
466        // Now, the perf_data_vmo contains the full metadata page enclosed in a SeqLock.
467        // Save data_head pointer so that we can write atomically to it after profiling.
468        let metadata_struct = seq_lock.get_map_address() as *mut PerfMetadataValue;
469        // SAFETY: This is ok as we previously set the exact format (PerfMetadataValue).
470        let data_head_pointer = unsafe { std::ptr::addr_of_mut!((*metadata_struct).data_head) };
471        self.data_head_pointer.store(data_head_pointer, Ordering::Release);
472
473        match perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS) {
474            Ok(vmo) => {
475                let memory = MemoryObject::Vmo(vmo.into());
476                return Ok(Arc::new(memory));
477            }
478            Err(_) => {
479                track_stub!(
480                    TODO("https://fxbug.dev/416323134"),
481                    "[perf_event_open] handle get_memory() errors"
482                );
483                return error!(EINVAL);
484            }
485        };
486    }
487
    // write() is not implemented for perf event files yet: the call records the stub
    // and always fails with ENOSYS. All arguments are ignored.
    fn write(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        _offset: usize,
        _data: &mut dyn InputBuffer,
    ) -> Result<usize, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/394960158"),
            "[perf_event_open] implement perf event functions"
        );
        error!(ENOSYS)
    }
502}
503
504// Given a PerfRecordSample struct, write it via the correct output format
505// (per https://man7.org/linux/man-pages/man2/perf_event_open.2.html) to the VMO.
506// We don't currently support all the sample_types listed in the docs.
507// Input:
508//    PerfRecordSample { pid: 5, tid: 10, nr: 3, ips[nr]: [111, 222, 333] }
509// Human-understandable output:
510//    9 1 40 111 5 10 3 111 222 333
511// Actual output (no spaces or \n in real output, just making it more readable):
512//    0x0000 0x0009                 <-- starts at `offset` bytes
513//    0x0001
514//    0x0040
515//    0x0000 0x0000 0x0000 0x006F   <-- starts at `offset` + 8 bytes
516//    0x0000 0x0000 0x0000 0x0005
517//    0x0000 0x0000 0x0000 0x0010
518//    0x0000 0x0000 0x0000 0x0003
519//    0x0000 0x0000 0x0000 0x006F
520//    0x0000 0x0000 0x0000 0x00DE
521//    0x0000 0x0000 0x0000 0x014D
522//
523//    Returns the length of bytes written. In above case, 8 + 28 = 36.
524//    This information is used to increment the global offset.
525fn write_record_to_vmo(
526    perf_record_sample: PerfRecordSample,
527    perf_data_vmo: &zx::Vmo,
528    _data_head_pointer: &AtomicPtr<u64>,
529    sample_type: u64,
530    sample_id: u64,
531    sample_period: u64,
532    offset: u64,
533) -> u64 {
534    // Write header.
535    track_stub!(
536        TODO("https://fxbug.dev/432501467"),
537        "[perf_event_open] determines whether the record is KERNEL or USER"
538    );
539    let perf_event_header = perf_event_header {
540        type_: perf_event_type_PERF_RECORD_SAMPLE,
541        misc: PERF_RECORD_MISC_KERNEL as u16,
542        size: PERF_EVENT_HEADER_SIZE,
543    };
544
545    match perf_data_vmo.write(&perf_event_header.as_bytes(), offset) {
546        Ok(_) => (),
547        Err(e) => log_warn!("Failed to write perf_event_header: {}", e),
548    }
549
550    // Write sample.
551    let mut sample = Vec::<u8>::new();
552    // sample_id
553    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IDENTIFIER as u64) != 0 {
554        sample.extend(sample_id.to_ne_bytes());
555    }
556    // ip
557    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IP as u64) != 0 {
558        sample.extend(perf_record_sample.ips[0].to_ne_bytes());
559    }
560
561    if (sample_type & perf_event_sample_format_PERF_SAMPLE_TID as u64) != 0 {
562        // pid
563        sample.extend(perf_record_sample.pid.expect("missing pid").to_ne_bytes());
564        // tid
565        sample.extend(perf_record_sample.tid.expect("missing tid").to_ne_bytes());
566    }
567
568    // id
569    if (sample_type & perf_event_sample_format_PERF_SAMPLE_ID as u64) != 0 {
570        sample.extend(sample_id.to_ne_bytes());
571    }
572
573    // sample period
574    if (sample_type & perf_event_sample_format_PERF_SAMPLE_PERIOD as u64) != 0 {
575        sample.extend(sample_period.to_ne_bytes());
576    }
577
578    if (sample_type & perf_event_sample_format_PERF_SAMPLE_CALLCHAIN as u64) != 0 {
579        // nr
580        sample.extend(perf_record_sample.ips.len().to_ne_bytes());
581
582        // ips[nr] - list of ips, u64 per ip.
583        for i in perf_record_sample.ips {
584            sample.extend(i.to_ne_bytes());
585        }
586    }
587    // The remaining data are not defined for now.
588
589    match perf_data_vmo.write(&sample, offset + (std::mem::size_of::<perf_event_header>() as u64)) {
590        Ok(_) => {
591            let bytes_written: u64 =
592                (std::mem::size_of::<perf_event_header>() + sample.len()) as u64;
593
594            // TODO(http://fuchsia.dev/460203776) implement this better before enabling
595            // any setting of data_head value.
596            // Update data_head because we have now written to the VMO.
597            // Ordering::Release pushes update that this (and, transitively, the sample
598            // too) has updated.
599            // data_head_pointer.fetch_add(bytes_written, Ordering::Release);
600
601            // Return the total size we wrote (header + sample) so that we can
602            // increment offset counter.
603            return bytes_written;
604        }
605        Err(e) => {
606            log_warn!("Failed to write PerfRecordSample to VMO due to: {}", e);
607            // Failed to write. Don't increment offset counter.
608            return 0;
609        }
610    }
611}
612
// One parsed profiler sample, ready to be serialized as a PERF_RECORD_SAMPLE.
#[derive(Clone, Debug)]
struct PerfRecordSample {
    // Process ID; `None` if it was not available.
    pid: Option<u32>,
    // Thread ID; `None` if it was not available.
    tid: Option<u32>,
    // Instruction pointers (currently this is the address). First one is `ip` param.
    ips: Vec<u64>,
}
620
621// Parses a backtrace (bt) to obtain the params for a PerfRecordSample. Example:
622//
623// 1234                     pid
624// 5555                     tid
625// {{{bt:0:0x1111:pc}}}    {{{bt:frame_number:address:type}}}
626// {{{bt:1:0x2222:ra}}}
627// {{{bt:2:0x3333:ra}}}
628//
629// Results in:
630// PerfRecordSample { pid: 1234, tid: 5555, nr: 3, ips: [0x1111, 0x2222, 0x3333] }
631
632fn parse_perf_record_sample_format(backtrace: &str) -> Option<PerfRecordSample> {
633    let mut pid: Option<u32> = None;
634    let mut tid: Option<u32> = None;
635    let mut ips: Vec<u64> = Vec::new();
636    let mut numbers_found = 0;
637    track_stub!(TODO("https://fxbug.dev/437171287"), "[perf_event_open] handle regex nuances");
638    let backtrace_regex =
639        Regex::new(r"^\s*\{\{\{bt:\d+:((0x[0-9a-fA-F]+)):(?:pc|ra)\}\}\}\s*$").unwrap();
640
641    for line in backtrace.lines() {
642        let trimmed_line = line.trim();
643        // Try to parse as a raw number (for PID/TID).
644        if numbers_found < 2 {
645            if let Ok(num) = trimmed_line.parse::<u32>() {
646                if numbers_found == 0 {
647                    pid = Some(num);
648                } else {
649                    tid = Some(num);
650                }
651                numbers_found += 1;
652                continue;
653            }
654        }
655
656        // Try to parse as a backtrace line.
657        if let Some(parsed_bt) = backtrace_regex.captures(trimmed_line) {
658            let address_str = parsed_bt.get(1).unwrap().as_str();
659            if let Ok(ip_addr) = u64::from_str_radix(address_str.trim_start_matches("0x"), 16) {
660                ips.push(ip_addr);
661            }
662        }
663    }
664
665    if pid == None || tid == None || ips.is_empty() {
666        // This data chunk might've been an {{{mmap}}} chunk, and not a {{{bt}}}.
667        log_info!("No ips while getting PerfRecordSample");
668        None
669    } else {
670        Some(PerfRecordSample { pid: pid, tid: tid, ips: ips })
671    }
672}
673
674async fn set_up_profiler(
675    sample_period: zx::MonotonicDuration,
676) -> Result<(profiler::SessionProxy, fidl::AsyncSocket), Errno> {
677    // Configuration for how we want to sample.
678    let sample = profiler::Sample {
679        callgraph: Some(profiler::CallgraphConfig {
680            strategy: Some(profiler::CallgraphStrategy::FramePointer),
681            ..Default::default()
682        }),
683        ..Default::default()
684    };
685
686    let sampling_config = profiler::SamplingConfig {
687        period: Some(sample_period.into_nanos() as u64),
688        timebase: Some(profiler::Counter::PlatformIndependent(profiler::CounterId::Nanoseconds)),
689        sample: Some(sample),
690        ..Default::default()
691    };
692
693    let tasks = vec![
694        // Should return ~300 samples for 100 millis.
695        profiler::Task::SystemWide(profiler::SystemWide {}),
696    ];
697    let targets = profiler::TargetConfig::Tasks(tasks);
698    let config = profiler::Config {
699        configs: Some(vec![sampling_config]),
700        target: Some(targets),
701        ..Default::default()
702    };
703    let (client, server) = fidl::Socket::create_stream();
704    let configure = profiler::SessionConfigureRequest {
705        output: Some(server),
706        config: Some(config),
707        ..Default::default()
708    };
709
710    let proxy = connect_to_protocol::<profiler::SessionMarker>()
711        .context("Error connecting to Profiler protocol");
712    let session_proxy: profiler::SessionProxy = match proxy {
713        Ok(p) => p.clone(),
714        Err(e) => return error!(EINVAL, e),
715    };
716
717    // Must configure before sampling start().
718    let config_request = session_proxy.configure(configure).await;
719    match config_request {
720        Ok(_) => Ok((session_proxy, fidl::AsyncSocket::from_socket(client))),
721        Err(e) => return error!(EINVAL, e),
722    }
723}
724
725// Collects samples and puts backtrace in VMO.
726// - Starts and stops sampling for a duration.
727// - Reads in the buffer from the socket for that duration in chunks.
728// - Parses the buffer backtraces into PERF_RECORD_SAMPLE format.
729// - Writes the PERF_RECORD_SAMPLE into VMO.
730async fn collect_sample(
731    session_proxy: profiler::SessionProxy,
732    mut client: fidl::AsyncSocket,
733    duration: Duration,
734    perf_data_vmo: &zx::Vmo,
735    data_head_pointer: &AtomicPtr<u64>,
736    sample_type: u64,
737    sample_id: u64,
738    sample_period: u64,
739    vmo_write_offset: u64,
740) -> Result<(), Errno> {
741    let start_request = profiler::SessionStartRequest {
742        buffer_results: Some(true),
743        buffer_size_mb: Some(8 as u64),
744        ..Default::default()
745    };
746    let _ = session_proxy.start(&start_request).await.expect("Failed to start profiling");
747
748    // Hardcode a duration so that samples can be collected. This is currently solely used to
749    // demonstrate that an E2E implementation of sample collection works.
750    track_stub!(
751        TODO("https://fxbug.dev/428974888"),
752        "[perf_event_open] don't hardcode sleep; test/user should decide sample duration"
753    );
754    let _ = fuchsia_async::Timer::new(duration).await;
755
756    let stats = session_proxy.stop().await;
757    let samples_collected = match stats {
758        Ok(stats) => stats.samples_collected.unwrap(),
759        Err(e) => return error!(EINVAL, e),
760    };
761
762    track_stub!(
763        TODO("https://fxbug.dev/422502681"),
764        "[perf_event_open] symbolize sample output and delete the below log_info"
765    );
766    log_info!("profiler samples_collected: {:?}", samples_collected);
767
768    // Peek at the first 8 bytes to determine if it's FXT or text.
769    let mut header = [0; 8];
770    let mut bytes_read = 0;
771    while bytes_read < 8 {
772        match client.read(&mut header[bytes_read..]).await {
773            Ok(0) => {
774                // Peer closed the socket. This is the normal end of the stream.
775                log_info!("[perf_event_open] Finished reading fxt record from socket.");
776                break;
777            }
778            Ok(n) => bytes_read += n,
779            Err(e) => {
780                log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
781                break;
782            }
783        }
784    }
785
786    if bytes_read > 0 {
787        if bytes_read == 8 && header == FXT_MAGIC_BYTES {
788            // FXT format.
789            let header_cursor = Cursor::new(header);
790            let reader = header_cursor.chain(client);
791            let (mut stream, _task) = SessionParser::new_async(reader);
792            while let Some(record_result) = stream.next().await {
793                match record_result {
794                    Ok(TraceRecord::Profiler(ProfilerRecord::Backtrace(backtrace))) => {
795                        let ips: Vec<u64> = backtrace.data;
796                        let pid = Some(backtrace.process.0 as u32);
797                        let tid = Some(backtrace.thread.0 as u32);
798                        let perf_record_sample = PerfRecordSample { pid, tid, ips };
799                        write_record_to_vmo(
800                            perf_record_sample,
801                            perf_data_vmo,
802                            data_head_pointer,
803                            sample_type,
804                            sample_id,
805                            sample_period,
806                            vmo_write_offset,
807                        );
808                    }
809                    Ok(_) => {
810                        // Ignore other records.
811                    }
812                    Err(e) => {
813                        log_warn!("[perf_event_open] Error parsing FXT: {:?}", e);
814                        break;
815                    }
816                }
817            }
818        } else {
819            // Text format.
820            // Read chunks of sampling data from socket in this buffer temporarily. We will parse
821            // the data and write it into the output VMO (the one mmap points to).
822            let mut buffer = vec![0; DEFAULT_CHUNK_SIZE];
823
824            loop {
825                // Attempt to read data. This awaits until data is available, EOF, or error.
826                // Ignore the first 8 bytes as it's the {{{reset}}} marker.
827                let socket_data = client.read(&mut buffer).await;
828
829                match socket_data {
830                    Ok(0) => {
831                        // Peer closed the socket. This is the normal end of the stream.
832                        log_info!("[perf_event_open] Finished reading from socket.");
833                        break;
834                    }
835                    Ok(bytes_read) => {
836                        // Receive data in format {{{...}}}.
837                        let received_data = match std::str::from_utf8(&buffer[..bytes_read]) {
838                            Ok(data) => data,
839                            Err(e) => return error!(EINVAL, e),
840                        };
841                        // Parse data to PerfRecordSample struct.
842                        if let Some(perf_record_sample) =
843                            parse_perf_record_sample_format(received_data)
844                        {
845                            write_record_to_vmo(
846                                perf_record_sample,
847                                perf_data_vmo,
848                                data_head_pointer,
849                                sample_type,
850                                sample_id,
851                                sample_period,
852                                vmo_write_offset,
853                            );
854                        }
855                    }
856                    Err(e) => {
857                        log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
858                        break;
859                    }
860                }
861            }
862        }
863    }
864
865    let reset_status = session_proxy.reset().await;
866    return match reset_status {
867        Ok(_) => Ok(()),
868        Err(e) => error!(EINVAL, e),
869    };
870}
871
872// Notifies other thread that we should start/stop sampling.
873// Once sampling is complete, that profiler session is no longer needed.
874// At that point, send back notification so that this is no longer blocking
875// (e.g. so that other profiler sessions can start).
876fn ping_receiver(
877    mut ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
878    command: IoctlOp,
879) {
880    log_info!("[perf_event_open] Received sampling command: {:?}", command);
881    let (profiling_complete_sender, profiling_complete_receiver) = sync_mpsc::channel::<()>();
882    match ioctl_sender.try_send((command, profiling_complete_sender)) {
883        Ok(_) => (),
884        Err(e) => {
885            if e.is_full() {
886                log_warn!("[perf_event_open] Failed to send {:?}: Channel full", command);
887            } else if e.is_disconnected() {
888                log_warn!("[perf_event_open] Failed to send {:?}: Receiver disconnected", command);
889            } else {
890                log_warn!("[perf_event_open] Failed to send {:?} due to {:?}", command, e.source());
891            }
892        }
893    };
894    // Block on / wait until profiling is complete before returning.
895    // This notifies that the profiler is free to be used for another session.
896    let _ = profiling_complete_receiver.recv().unwrap();
897}
898
899pub fn sys_perf_event_open(
900    locked: &mut Locked<Unlocked>,
901    current_task: &CurrentTask,
902    attr: UserRef<perf_event_attr>,
903    // Note that this is pid in Linux docs.
904    tid: tid_t,
905    cpu: i32,
906    group_fd: FdNumber,
907    _flags: u64,
908) -> Result<SyscallResult, Errno> {
909    // So far, the implementation only sets the read_data_format according to the "Reading results"
910    // section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html for a single event.
911    // Other features will be added in the future (see below track_stubs).
912    let perf_event_attrs: perf_event_attr = current_task.read_object(attr)?;
913
914    if tid == -1 && cpu == -1 {
915        return error!(EINVAL);
916    }
917
918    let target_task_type = match tid {
919        -1 => TargetTaskType::AllTasks,
920        0 => TargetTaskType::CurrentTask,
921        _ => {
922            track_stub!(TODO("https://fxbug.dev/409621963"), "[perf_event_open] implement tid > 0");
923            return error!(ENOSYS);
924        }
925    };
926    security::check_perf_event_open_access(
927        current_task,
928        target_task_type,
929        &perf_event_attrs,
930        perf_event_attrs.type_.try_into()?,
931    )?;
932
933    // Channel used to send info between notifier and spawned task thread.
934    // We somewhat arbitrarily picked 8 for now in case we get a bunch of ioctls that are in
935    // quick succession (instead of something lower).
936    let (sender, mut receiver) = future_mpsc::channel(8);
937
938    let page_size = zx::system_get_page_size() as u64;
939    let mut perf_event_file = PerfEventFileState::new(
940        perf_event_attrs,
941        0,
942        perf_event_attrs.disabled(),
943        perf_event_attrs.sample_type,
944        zx::Vmo::create(ESTIMATED_MMAP_BUFFER_SIZE).unwrap(),
945        page_size, // Start with this amount of offset, we can increment as we write.
946        sender,
947    );
948
949    let read_format = perf_event_attrs.read_format;
950
951    if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0
952        || (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0
953    {
954        // Only keep track of most_recent_enabled_time if we are currently in ENABLED state,
955        // as otherwise this param shouldn't be used for calculating anything.
956        if perf_event_file.disabled == 0 {
957            perf_event_file.most_recent_enabled_time =
958                zx::MonotonicInstant::get().into_nanos() as u64;
959        }
960        // Initialize this to 0 as we will need to return a time duration later during read().
961        perf_event_file.total_time_running = 0;
962    }
963
964    let event_id = READ_FORMAT_ID_GENERATOR.fetch_add(1, Ordering::Relaxed);
965    perf_event_file.rf_id = event_id;
966
967    if group_fd.raw() == -1 {
968        perf_event_file.sample_id = event_id;
969    } else {
970        let group_file = current_task.files.get(group_fd)?;
971        let group_file_object_id = group_file.id;
972        let perf_state = get_perf_state(&current_task.kernel);
973        let events = perf_state.format_id_lookup_table.lock();
974        if let Some(rf_id) = events.get(&group_file_object_id) {
975            perf_event_file.sample_id = *rf_id;
976        } else {
977            return error!(EINVAL);
978        }
979    }
980
981    if (read_format & perf_event_read_format_PERF_FORMAT_GROUP as u64) != 0 {
982        track_stub!(
983            TODO("https://fxbug.dev/402238049"),
984            "[perf_event_open] implement read_format group"
985        );
986        return error!(ENOSYS);
987    }
988    if (read_format & perf_event_read_format_PERF_FORMAT_LOST as u64) != 0 {
989        track_stub!(
990            TODO("https://fxbug.dev/402260383"),
991            "[perf_event_open] implement read_format lost"
992        );
993    }
994
995    // Set up notifier for handling ioctl calls to enable/disable sampling.
996    let mut vmo_handle_copy =
997        perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS);
998
999    // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
1000    // This is always sound regardless of the union's tag.
1001    let sample_period_in_ticks = unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period };
1002    // The sample period from the PERF_COUNT_SW_CPU_CLOCK is
1003    // 1 nanosecond per tick. Convert this duration into zx::duration.
1004    let zx_sample_period = zx::MonotonicDuration::from_nanos(sample_period_in_ticks as i64);
1005
1006    let data_head_pointer = Arc::new(AtomicPtr::new(std::ptr::null_mut::<u64>()));
1007    // Pass cloned into the thread.
1008    let cloned_data_head_pointer = Arc::clone(&data_head_pointer);
1009
1010    let closure = async move |_: LockedAndTask<'_>| {
1011        // This loop will wait for messages from the sender.
1012        while let Some((command, profiling_complete_receiver)) = receiver.next().await {
1013            match command {
1014                IoctlOp::Enable => {
1015                    match set_up_profiler(zx_sample_period).await {
1016                        Ok((session_proxy, client)) => {
1017                            track_stub!(
1018                                TODO("https://fxbug.dev/422502681"),
1019                                "[perf_event_open] don't hardcode profiling duration"
1020                            );
1021
1022                            let handle = vmo_handle_copy
1023                                .as_mut()
1024                                .expect("Failed to get VMO handle")
1025                                .as_handle_ref()
1026                                .duplicate(zx::Rights::SAME_RIGHTS)
1027                                .unwrap();
1028
1029                            let _ = collect_sample(
1030                                session_proxy,
1031                                client,
1032                                Duration::from_millis(100),
1033                                &zx::Vmo::from(handle),
1034                                &*cloned_data_head_pointer,
1035                                perf_event_file.sample_type,
1036                                perf_event_file.sample_id,
1037                                sample_period_in_ticks,
1038                                perf_event_file.vmo_write_offset,
1039                            )
1040                            .await;
1041                            // Send notification that profiler session is over.
1042                            let _ = profiling_complete_receiver.send(());
1043                        }
1044                        Err(e) => {
1045                            log_warn!("Failed to profile: {}", e);
1046                        }
1047                    };
1048                }
1049            }
1050        }
1051        ()
1052    };
1053    let req = SpawnRequestBuilder::new()
1054        .with_debug_name("perf-event-sampler")
1055        .with_async_closure(closure)
1056        .build();
1057    current_task.kernel().kthreads.spawner().spawn_from_request(req);
1058
1059    let file = Box::new(PerfEventFile {
1060        _tid: tid,
1061        _cpu: cpu,
1062        perf_event_file: RwLock::new(perf_event_file),
1063        security_state: security::perf_event_alloc(current_task),
1064        data_head_pointer: data_head_pointer,
1065    });
1066    // TODO: https://fxbug.dev/404739824 - Confirm whether to handle this as a "private" node.
1067    let file_handle =
1068        Anon::new_private_file(locked, current_task, file, OpenFlags::RDWR, "[perf_event]");
1069    let file_object_id = file_handle.id;
1070    let file_descriptor: Result<FdNumber, Errno> =
1071        current_task.add_file(locked, file_handle, FdFlags::empty());
1072
1073    match file_descriptor {
1074        Ok(fd) => {
1075            if group_fd.raw() == -1 {
1076                let perf_state = get_perf_state(&current_task.kernel);
1077                let mut events = perf_state.format_id_lookup_table.lock();
1078                events.insert(file_object_id, event_id);
1079            }
1080            Ok(fd.into())
1081        }
1082        Err(_) => {
1083            track_stub!(
1084                TODO("https://fxbug.dev/402453955"),
1085                "[perf_event_open] implement remaining error handling"
1086            );
1087            error!(EMFILE)
1088        }
1089    }
1090}
// arch32 syscall entry points. These are only compiled for aarch64 targets and alias the
// regular implementation under the `sys_arch32_*` name.
#[cfg(target_arch = "aarch64")]
mod arch32 {
    pub use super::sys_perf_event_open as sys_arch32_perf_event_open;
}

// Surface the arch32 aliases at this module's top level.
#[cfg(target_arch = "aarch64")]
pub use self::arch32::*;
1099
1100use crate::mm::memory::MemoryObject;
1101use crate::mm::{MemoryAccessorExt, ProtectionFlags};
1102use crate::task::CurrentTask;
1103use crate::vfs::{
1104    Anon, FdFlags, FdNumber, FileObject, FileObjectId, FileObjectState, FileOps, InputBuffer,
1105    OutputBuffer,
1106};
1107use crate::{fileops_impl_nonseekable, fileops_impl_noop_sync};