starnix_core/perf/
mod.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::task::dynamic_thread_spawner::SpawnRequestBuilder;
6use anyhow::Context;
7use fuchsia_component::client::connect_to_protocol;
8use futures::StreamExt;
9use futures::channel::mpsc as future_mpsc;
10use regex::Regex;
11use std::collections::HashMap;
12use std::error::Error;
13use std::sync::atomic::{AtomicPtr, AtomicU64, Ordering};
14use std::sync::{Arc, mpsc as sync_mpsc};
15use std::time::Duration;
16use zerocopy::{Immutable, IntoBytes};
17use zx::AsHandleRef;
18use {fidl_fuchsia_cpu_profiler as profiler, fuchsia_async};
19
20use futures::io::{AsyncReadExt, Cursor};
21use fxt::TraceRecord;
22use fxt::profiler::ProfilerRecord;
23use fxt::session::SessionParser;
24use seq_lock::SeqLock;
25use starnix_logging::{log_info, log_warn, track_stub};
26use starnix_sync::{FileOpsCore, Locked, Mutex, RwLock, Unlocked};
27use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
28use starnix_uapi::arch32::{
29    PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE, PERF_EVENT_IOC_ID,
30    PERF_EVENT_IOC_MODIFY_ATTRIBUTES, PERF_EVENT_IOC_PAUSE_OUTPUT, PERF_EVENT_IOC_PERIOD,
31    PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_REFRESH, PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_SET_BPF,
32    PERF_EVENT_IOC_SET_FILTER, PERF_EVENT_IOC_SET_OUTPUT, PERF_RECORD_MISC_KERNEL,
33    perf_event_sample_format_PERF_SAMPLE_CALLCHAIN, perf_event_sample_format_PERF_SAMPLE_ID,
34    perf_event_sample_format_PERF_SAMPLE_IDENTIFIER, perf_event_sample_format_PERF_SAMPLE_IP,
35    perf_event_sample_format_PERF_SAMPLE_PERIOD, perf_event_sample_format_PERF_SAMPLE_TID,
36    perf_event_type_PERF_RECORD_SAMPLE,
37};
38use starnix_uapi::errors::Errno;
39use starnix_uapi::open_flags::OpenFlags;
40use starnix_uapi::user_address::UserRef;
41use starnix_uapi::{
42    error, perf_event_attr, perf_event_header, perf_event_mmap_page__bindgen_ty_1,
43    perf_event_read_format_PERF_FORMAT_GROUP, perf_event_read_format_PERF_FORMAT_ID,
44    perf_event_read_format_PERF_FORMAT_LOST, perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED,
45    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING, tid_t, uapi,
46};
47
48use crate::security::{self, TargetTaskType};
49use crate::task::{Kernel, LockedAndTask};
50
// Process-wide atomic counter starting at 0.
// NOTE(review): presumably the source of the unique read_format IDs (`rf_id`) handed out
// to events; its use is outside this section — confirm.
static READ_FORMAT_ID_GENERATOR: AtomicU64 = AtomicU64::new(0);
// Default buffer size to read from socket (for sampling data).
const DEFAULT_CHUNK_SIZE: usize = 4096;
const ESTIMATED_MMAP_BUFFER_SIZE: u64 = 40960; // 4096 * 10, page size * 10.
// perf_event_header struct size: 32 + 16 + 16 bits = 64 bits = 8 bytes.
const PERF_EVENT_HEADER_SIZE: u16 = 8;
// FXT magic bytes (little endian). Bytes 3..6 spell "FxT" in ASCII.
const FXT_MAGIC_BYTES: [u8; 8] = [0x10, 0x00, 0x04, 0x46, 0x78, 0x54, 0x16, 0x00];
59
60mod event;
61pub use event::{TraceEvent, TraceEventQueue};
62
// Leading part of the perf_event_mmap_page metadata: the fields that come before the
// `lock` sequence counter and are not expected to change once written. It is passed to
// SeqLock::new_from_vmo as the header struct.
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataHeader {
    version: u32,
    compat_version: u32,
}
69
// Remainder of the perf_event_mmap_page metadata: the fields that come after the `lock`
// sequence counter and may change over time. It is passed to SeqLock::new_from_vmo as the
// value struct. The layout is packed so the fields sit at fixed offsets with no padding;
// do not reorder or resize fields.
#[repr(C, packed)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataValue {
    index: u32,
    offset: i64,
    time_enabled: u64,
    time_running: u64,
    // Capabilities word (a union generated by bindgen).
    __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1,
    pmc_width: u16,
    time_shift: u16,
    time_mult: u32,
    time_offset: u64,
    time_zero: u64,
    size: u32,
    __reserved_1: u32,
    time_cycles: u64,
    time_mask: u64,
    __reserved: [u8; 928usize],
    // Ring-buffer bookkeeping. get_memory() keeps a raw pointer to `data_head` so that it
    // can be updated atomically after profiling.
    data_head: u64,
    data_tail: u64,
    data_offset: u64,
    data_size: u64,
    aux_head: u64,
    aux_tail: u64,
    aux_offset: u64,
    aux_size: u64,
}
97
// Perf state shared through the kernel's expando (see get_perf_state()).
struct PerfState {
    // This table maps a group leader's file object id to its unique u64 "format ID".
    //
    // When a sample is generated for any event in a group, we use this
    // "format ID" from the group leader as the value for *both* the
    // `PERF_SAMPLE_ID` and `PERF_SAMPLE_IDENTIFIER` fields.
    //
    // Entries are removed when the corresponding PerfEventFile is closed.
    format_id_lookup_table: Mutex<HashMap<FileObjectId, u64>>,
}
106
107impl Default for PerfState {
108    fn default() -> Self {
109        Self { format_id_lookup_table: Mutex::new(HashMap::new()) }
110    }
111}
112
113fn get_perf_state(kernel: &Arc<Kernel>) -> Arc<PerfState> {
114    kernel.expando.get_or_init(PerfState::default)
115}
116
// Layout check for perf_event_attr over the fields listed below.
// NOTE(review): presumably a compile-time assertion that these fields have the same
// layout on every supported architecture (so the struct can be shared between arch32 and
// native callers) — confirm against the macro definition.
uapi::check_arch_independent_layout! {
    perf_event_attr {
        type_, // "type" is a reserved keyword so add a trailing underscore.
        size,
        config,
        __bindgen_anon_1,
        sample_type,
        read_format,
        _bitfield_1,
        __bindgen_anon_2,
        bp_type,
        __bindgen_anon_3,
        __bindgen_anon_4,
        branch_sample_type,
        sample_regs_user,
        sample_stack_user,
        clockid,
        sample_regs_intr,
        aux_watermark,
        sample_max_stack,
        __reserved_2,
        aux_sample_size,
        __reserved_3,
        sig_data,
        config3,
    }
}
144
// Operation forwarded from ioctl() over `ioctl_sender` to whatever is listening on the
// other end of that channel. Only Enable exists so far.
#[derive(Clone, Copy, Debug, PartialEq)]
enum IoctlOp {
    // Sent when PERF_EVENT_IOC_ENABLE arrives for an event with a non-zero sample_period
    // and freq == 0.
    Enable,
}
149
// Mutable per-event state, guarded by the RwLock in PerfEventFile.
struct PerfEventFileState {
    // Attributes the event was created with.
    attr: perf_event_attr,
    rf_value: u64, // "count" for the config we passed in for the event.
    // The most recent timestamp (ns) where we changed into an enabled state
    // i.e. the most recent time we got an ENABLE ioctl().
    most_recent_enabled_time: u64,
    // Sum of all previous enablement segment durations (ns). If we are
    // currently in an enabled state, explicitly does NOT include the current
    // segment.
    total_time_running: u64,
    // Value reported by read() when PERF_FORMAT_ID is set in attr.read_format.
    rf_id: u64,
    // NOTE(review): presumably the ID stamped into PERF_SAMPLE_ID / PERF_SAMPLE_IDENTIFIER
    // fields of generated samples — it is set outside this section, confirm.
    sample_id: u64,
    _rf_lost: u64,
    // Boolean stored as an integer: 0 = enabled, non-zero = disabled.
    disabled: u64,
    // Bitmask of perf_event_sample_format_* flags.
    sample_type: u64,
    // Handle to blob that stores all the perf data that a user may want.
    // At the moment it only stores some metadata and backtraces (bts).
    perf_data_vmo: zx::Vmo,
    // Remember to increment this offset as the number of pages increases.
    // Currently we just have a bound of 1 page_size of information.
    vmo_write_offset: u64,
    // Channel used to send IoctlOps to start/stop sampling.
    // Each message carries a std mpsc Sender<()> alongside the op.
    // NOTE(review): presumably used by the receiver to acknowledge the op — confirm.
    ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
}
174
175// Have an implementation for PerfEventFileState because VMO
176// doesn't have Default so we can't derive it.
177impl PerfEventFileState {
178    fn new(
179        attr: perf_event_attr,
180        rf_value: u64,
181        disabled: u64,
182        sample_type: u64,
183        perf_data_vmo: zx::Vmo,
184        vmo_write_offset: u64,
185        ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
186    ) -> PerfEventFileState {
187        PerfEventFileState {
188            attr,
189            rf_value,
190            most_recent_enabled_time: 0,
191            total_time_running: 0,
192            rf_id: 0,
193            sample_id: 0,
194            _rf_lost: 0,
195            disabled,
196            sample_type,
197            perf_data_vmo,
198            vmo_write_offset,
199            ioctl_sender,
200        }
201    }
202}
203
// File object backing a perf event; implements FileOps.
pub struct PerfEventFile {
    // NOTE(review): presumably the tid and cpu arguments the event was opened with; both
    // are currently unused (underscore-prefixed) — confirm.
    _tid: tid_t,
    _cpu: i32,
    // Mutable event state (counters, enablement, VMO handle, ioctl channel).
    perf_event_file: RwLock<PerfEventFileState>,
    // The security state for this PerfEventFile.
    pub security_state: security::PerfEventState,
    // Pointer to the perf_event_mmap_page metadata's data_head.
    // Stored by get_memory() once the metadata page has been written.
    // TODO(https://fxbug.dev/460203776) Remove Arc after figuring out
    // "borrowed value does not live long enough" issue.
    data_head_pointer: Arc<AtomicPtr<u64>>,
}
215
216// PerfEventFile object that implements FileOps.
217// See https://man7.org/linux/man-pages/man2/perf_event_open.2.html for
218// implementation details.
219// This object can be saved as a FileDescriptor.
220impl FileOps for PerfEventFile {
221    // Don't need to implement seek or sync for PerfEventFile.
222    fileops_impl_nonseekable!();
223    fileops_impl_noop_sync!();
224
225    fn close(
226        self: Box<Self>,
227        _locked: &mut Locked<FileOpsCore>,
228        file: &FileObjectState,
229        current_task: &CurrentTask,
230    ) {
231        let perf_state = get_perf_state(&current_task.kernel);
232        let mut events = perf_state.format_id_lookup_table.lock();
233        events.remove(&file.id);
234    }
235
236    // See "Reading results" section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html.
237    fn read(
238        &self,
239        _locked: &mut Locked<FileOpsCore>,
240        _file: &FileObject,
241        current_task: &CurrentTask,
242        _offset: usize,
243        data: &mut dyn OutputBuffer,
244    ) -> Result<usize, Errno> {
245        // Create/calculate and return the ReadFormatData object.
246        // If we create it earlier we might want to change it and it's immutable once created.
247        let read_format_data = {
248            // Once we get the `value` or count from kernel, we can change this to a read()
249            // call instead of write().
250            let mut perf_event_file = self.perf_event_file.write();
251
252            security::check_perf_event_read_access(current_task, &self)?;
253
254            let mut total_time_running_including_curr = perf_event_file.total_time_running;
255
256            // Only update values if enabled (either by perf_event_attr or ioctl ENABLE call).
257            if perf_event_file.disabled == 0 {
258                // Calculate the value or "count" of the config we're interested in.
259                // This value should reflect the value we are counting (defined in the config).
260                // E.g. for PERF_COUNT_SW_CPU_CLOCK it would return the value from the CPU clock.
261                // For now we just return rf_value + 1.
262                track_stub!(
263                    TODO("https://fxbug.dev/402938671"),
264                    "[perf_event_open] implement read_format value"
265                );
266                perf_event_file.rf_value += 1;
267
268                // Update time duration.
269                let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
270                total_time_running_including_curr +=
271                    curr_time - perf_event_file.most_recent_enabled_time;
272            }
273
274            let mut output = Vec::<u8>::new();
275            let value = perf_event_file.rf_value.to_ne_bytes();
276            output.extend(value);
277
278            let read_format = perf_event_file.attr.read_format;
279
280            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 {
281                // Total time (ns) event was enabled and running (currently same as TIME_RUNNING).
282                output.extend(total_time_running_including_curr.to_ne_bytes());
283            }
284            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 {
285                // Total time (ns) event was enabled and running (currently same as TIME_ENABLED).
286                output.extend(total_time_running_including_curr.to_ne_bytes());
287            }
288            if (read_format & perf_event_read_format_PERF_FORMAT_ID as u64) != 0 {
289                // Adds a 64-bit unique value that corresponds to the event group.
290                output.extend(perf_event_file.rf_id.to_ne_bytes());
291            }
292
293            output
294        };
295
296        // The regular read() call allows the case where the bytes-we-want-to-read-in won't
297        // fit in the output buffer. However, for perf_event_open's read(), "If you attempt to read
298        // into a buffer that is not big enough to hold the data, the error ENOSPC results."
299        if data.available() < read_format_data.len() {
300            return error!(ENOSPC);
301        }
302        track_stub!(
303            TODO("https://fxbug.dev/402453955"),
304            "[perf_event_open] implement remaining error handling"
305        );
306
307        data.write(&read_format_data)
308    }
309
310    fn ioctl(
311        &self,
312        _locked: &mut Locked<Unlocked>,
313        _file: &FileObject,
314        current_task: &CurrentTask,
315        op: u32,
316        _arg: SyscallArg,
317    ) -> Result<SyscallResult, Errno> {
318        track_stub!(
319            TODO("https://fxbug.dev/405463320"),
320            "[perf_event_open] implement PERF_IOC_FLAG_GROUP"
321        );
322        security::check_perf_event_write_access(current_task, &self)?;
323        let mut perf_event_file = self.perf_event_file.write();
324        match op {
325            PERF_EVENT_IOC_ENABLE => {
326                if perf_event_file.disabled != 0 {
327                    perf_event_file.disabled = 0; // 0 = false.
328                    perf_event_file.most_recent_enabled_time =
329                        zx::MonotonicInstant::get().into_nanos() as u64;
330                }
331
332                // If we are sampling, invoke the profiler and collect a sample.
333                // Currently this is an example sample collection.
334                track_stub!(
335                    TODO("https://fxbug.dev/398914921"),
336                    "[perf_event_open] implement full sampling features"
337                );
338                if perf_event_file.attr.freq() == 0
339                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
340                // This is always sound regardless of the union's tag.
341                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
342                {
343                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Enable);
344                }
345                return Ok(SUCCESS);
346            }
347            PERF_EVENT_IOC_DISABLE => {
348                if perf_event_file.disabled == 0 {
349                    perf_event_file.disabled = 1; // 1 = true.
350
351                    // Update total_time_running now that the segment has ended.
352                    let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
353                    perf_event_file.total_time_running +=
354                        curr_time - perf_event_file.most_recent_enabled_time;
355                }
356                track_stub!(
357                    TODO("https://fxbug.dev/422502681"),
358                    "[perf_event_open] implement Disable to not hardcode profiling"
359                );
360                return Ok(SUCCESS);
361            }
362            PERF_EVENT_IOC_RESET => {
363                perf_event_file.rf_value = 0;
364                return Ok(SUCCESS);
365            }
366            PERF_EVENT_IOC_REFRESH
367            | PERF_EVENT_IOC_PERIOD
368            | PERF_EVENT_IOC_SET_OUTPUT
369            | PERF_EVENT_IOC_SET_FILTER
370            | PERF_EVENT_IOC_ID
371            | PERF_EVENT_IOC_SET_BPF
372            | PERF_EVENT_IOC_PAUSE_OUTPUT
373            | PERF_EVENT_IOC_MODIFY_ATTRIBUTES
374            | PERF_EVENT_IOC_QUERY_BPF => {
375                track_stub!(
376                    TODO("https://fxbug.dev/404941053"),
377                    "[perf_event_open] implement remaining ioctl() calls"
378                );
379                return error!(ENOSYS);
380            }
381            _ => error!(ENOTTY),
382        }
383    }
384
385    // TODO(https://fxbug.dev/460245383) match behavior when mmap() is called multiple times.
386    // Gets called when mmap() is called.
387    // Immediately before sampling, this should get called by the user (e.g. the test
388    // or Perfetto). We will then write the metadata to the VMO and return the pointer to it.
389    fn get_memory(
390        &self,
391        _locked: &mut Locked<FileOpsCore>,
392        _file: &FileObject,
393        current_task: &CurrentTask,
394        length: Option<usize>,
395        _prot: ProtectionFlags,
396    ) -> Result<Arc<MemoryObject>, Errno> {
397        let buffer_size: u64 = length.unwrap_or(0) as u64;
398        if buffer_size == 0 {
399            return error!(EINVAL);
400        }
401        let page_size = zx::system_get_page_size() as u64;
402
403        security::check_perf_event_read_access(current_task, &self)?;
404
405        // TODO(https://fxbug.dev/460246292) confirm when to create metadata.
406        // Create metadata structs. Currently we hardcode everything just to get
407        // something E2E working.
408        let metadata_header = PerfMetadataHeader { version: 1, compat_version: 2 };
409        let metadata_value = PerfMetadataValue {
410            index: 2,
411            offset: 19337,
412            time_enabled: 0,
413            time_running: 0,
414            __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1 { capabilities: 30 },
415            pmc_width: 0,
416            time_shift: 0,
417            time_mult: 0,
418            time_offset: 0,
419            time_zero: 0,
420            size: 0,
421            __reserved_1: 0,
422            time_cycles: 0,
423            time_mask: 0,
424            __reserved: [0; 928usize],
425            data_head: page_size,
426            // Start reading from 0; it is the user's responsibility to increment on their end.
427            data_tail: 0,
428            data_offset: page_size,
429            data_size: (buffer_size - page_size) as u64,
430            aux_head: 0,
431            aux_tail: 0,
432            aux_offset: 0,
433            aux_size: 0,
434        };
435
436        // Then, wrap metadata in a SeqLock so that user can be made aware of updates.
437        // SeqLock is formatted thusly:
438        //   header_struct : any size, values should not change
439        //   sequence_counter : u32
440        //   value_struct : any size, needs locking because each value can change
441        // We split our perf_event_mmap_page accordingly. The `version` and `compat_version`
442        // should not change while the params below the `lock` may change.
443        // Sequence counter for `lock` param gets inserted between these via
444        // the `SeqLock` implementation.
445        let perf_event_file = self.perf_event_file.read();
446        // VMO does not implement Copy trait. We duplicate the VMO handle
447        // so that we can pass it to the SeqLock and the MemoryObject.
448        let vmo_handle_copy = match perf_event_file
449            .perf_data_vmo
450            .as_handle_ref()
451            .duplicate(zx::Rights::SAME_RIGHTS)
452        {
453            Ok(h) => h,
454            Err(_) => return error!(EINVAL),
455        };
456
457        // SAFETY: This is ok right now because we are the only reference to this memory.
458        // Once there are multiple references we should update this comment to confirm that
459        // there are only atomic accesses to this memory (see seq_lock lib.rs for details).
460        let mut seq_lock = match unsafe {
461            SeqLock::new_from_vmo(metadata_header, metadata_value, vmo_handle_copy.into())
462        } {
463            Ok(s) => s,
464            Err(_) => return error!(EINVAL),
465        };
466
467        // Now, the perf_data_vmo contains the full metadata page enclosed in a SeqLock.
468        // Save data_head pointer so that we can write atomically to it after profiling.
469        let metadata_struct = seq_lock.get_map_address() as *mut PerfMetadataValue;
470        // SAFETY: This is ok as we previously set the exact format (PerfMetadataValue).
471        let data_head_pointer = unsafe { std::ptr::addr_of_mut!((*metadata_struct).data_head) };
472        self.data_head_pointer.store(data_head_pointer, Ordering::Release);
473
474        match perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS) {
475            Ok(vmo) => {
476                let memory = MemoryObject::Vmo(vmo.into());
477                return Ok(Arc::new(memory));
478            }
479            Err(_) => {
480                track_stub!(
481                    TODO("https://fxbug.dev/416323134"),
482                    "[perf_event_open] handle get_memory() errors"
483                );
484                return error!(EINVAL);
485            }
486        };
487    }
488
489    fn write(
490        &self,
491        _locked: &mut Locked<FileOpsCore>,
492        _file: &FileObject,
493        _current_task: &CurrentTask,
494        _offset: usize,
495        _data: &mut dyn InputBuffer,
496    ) -> Result<usize, Errno> {
497        track_stub!(
498            TODO("https://fxbug.dev/394960158"),
499            "[perf_event_open] implement perf event functions"
500        );
501        error!(ENOSYS)
502    }
503}
504
505// Given a PerfRecordSample struct, write it via the correct output format
506// (per https://man7.org/linux/man-pages/man2/perf_event_open.2.html) to the VMO.
507// We don't currently support all the sample_types listed in the docs.
508// Input:
509//    PerfRecordSample { pid: 5, tid: 10, nr: 3, ips[nr]: [111, 222, 333] }
510// Human-understandable output:
511//    9 1 40 111 5 10 3 111 222 333
512// Actual output (no spaces or \n in real output, just making it more readable):
513//    0x0000 0x0009                 <-- starts at `offset` bytes
514//    0x0001
515//    0x0040
516//    0x0000 0x0000 0x0000 0x006F   <-- starts at `offset` + 8 bytes
517//    0x0000 0x0000 0x0000 0x0005
518//    0x0000 0x0000 0x0000 0x0010
519//    0x0000 0x0000 0x0000 0x0003
520//    0x0000 0x0000 0x0000 0x006F
521//    0x0000 0x0000 0x0000 0x00DE
522//    0x0000 0x0000 0x0000 0x014D
523//
524//    Returns the length of bytes written. In above case, 8 + 28 = 36.
525//    This information is used to increment the global offset.
526fn write_record_to_vmo(
527    perf_record_sample: PerfRecordSample,
528    perf_data_vmo: &zx::Vmo,
529    _data_head_pointer: &AtomicPtr<u64>,
530    sample_type: u64,
531    sample_id: u64,
532    sample_period: u64,
533    offset: u64,
534) -> u64 {
535    // Write header.
536    track_stub!(
537        TODO("https://fxbug.dev/432501467"),
538        "[perf_event_open] determines whether the record is KERNEL or USER"
539    );
540    let perf_event_header = perf_event_header {
541        type_: perf_event_type_PERF_RECORD_SAMPLE,
542        misc: PERF_RECORD_MISC_KERNEL as u16,
543        size: PERF_EVENT_HEADER_SIZE,
544    };
545
546    match perf_data_vmo.write(&perf_event_header.as_bytes(), offset) {
547        Ok(_) => (),
548        Err(e) => log_warn!("Failed to write perf_event_header: {}", e),
549    }
550
551    // Write sample.
552    let mut sample = Vec::<u8>::new();
553    // sample_id
554    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IDENTIFIER as u64) != 0 {
555        sample.extend(sample_id.to_ne_bytes());
556    }
557    // ip
558    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IP as u64) != 0 {
559        sample.extend(perf_record_sample.ips[0].to_ne_bytes());
560    }
561
562    if (sample_type & perf_event_sample_format_PERF_SAMPLE_TID as u64) != 0 {
563        // pid
564        sample.extend(perf_record_sample.pid.expect("missing pid").to_ne_bytes());
565        // tid
566        sample.extend(perf_record_sample.tid.expect("missing tid").to_ne_bytes());
567    }
568
569    // id
570    if (sample_type & perf_event_sample_format_PERF_SAMPLE_ID as u64) != 0 {
571        sample.extend(sample_id.to_ne_bytes());
572    }
573
574    // sample period
575    if (sample_type & perf_event_sample_format_PERF_SAMPLE_PERIOD as u64) != 0 {
576        sample.extend(sample_period.to_ne_bytes());
577    }
578
579    if (sample_type & perf_event_sample_format_PERF_SAMPLE_CALLCHAIN as u64) != 0 {
580        // nr
581        sample.extend(perf_record_sample.ips.len().to_ne_bytes());
582
583        // ips[nr] - list of ips, u64 per ip.
584        for i in perf_record_sample.ips {
585            sample.extend(i.to_ne_bytes());
586        }
587    }
588    // The remaining data are not defined for now.
589
590    match perf_data_vmo.write(&sample, offset + (std::mem::size_of::<perf_event_header>() as u64)) {
591        Ok(_) => {
592            let bytes_written: u64 =
593                (std::mem::size_of::<perf_event_header>() + sample.len()) as u64;
594
595            // TODO(http://fuchsia.dev/460203776) implement this better before enabling
596            // any setting of data_head value.
597            // Update data_head because we have now written to the VMO.
598            // Ordering::Release pushes update that this (and, transitively, the sample
599            // too) has updated.
600            // data_head_pointer.fetch_add(bytes_written, Ordering::Release);
601
602            // Return the total size we wrote (header + sample) so that we can
603            // increment offset counter.
604            return bytes_written;
605        }
606        Err(e) => {
607            log_warn!("Failed to write PerfRecordSample to VMO due to: {}", e);
608            // Failed to write. Don't increment offset counter.
609            return 0;
610        }
611    }
612}
613
// One profiler sample, in the shape needed to emit a PERF_RECORD_SAMPLE record.
#[derive(Debug, Clone)]
struct PerfRecordSample {
    // Process and thread the sample was taken from. Written to the record as u32 values
    // when PERF_SAMPLE_TID is requested.
    pid: Option<u32>,
    tid: Option<u32>,
    // Instruction pointers (currently this is the address). First one is `ip` param.
    ips: Vec<u64>,
}
621
622// Parses a backtrace (bt) to obtain the params for a PerfRecordSample. Example:
623//
624// 1234                     pid
625// 5555                     tid
626// {{{bt:0:0x1111:pc}}}    {{{bt:frame_number:address:type}}}
627// {{{bt:1:0x2222:ra}}}
628// {{{bt:2:0x3333:ra}}}
629//
630// Results in:
631// PerfRecordSample { pid: 1234, tid: 5555, nr: 3, ips: [0x1111, 0x2222, 0x3333] }
632
633fn parse_perf_record_sample_format(backtrace: &str) -> Option<PerfRecordSample> {
634    let mut pid: Option<u32> = None;
635    let mut tid: Option<u32> = None;
636    let mut ips: Vec<u64> = Vec::new();
637    let mut numbers_found = 0;
638    track_stub!(TODO("https://fxbug.dev/437171287"), "[perf_event_open] handle regex nuances");
639    let backtrace_regex =
640        Regex::new(r"^\s*\{\{\{bt:\d+:((0x[0-9a-fA-F]+)):(?:pc|ra)\}\}\}\s*$").unwrap();
641
642    for line in backtrace.lines() {
643        let trimmed_line = line.trim();
644        // Try to parse as a raw number (for PID/TID).
645        if numbers_found < 2 {
646            if let Ok(num) = trimmed_line.parse::<u32>() {
647                if numbers_found == 0 {
648                    pid = Some(num);
649                } else {
650                    tid = Some(num);
651                }
652                numbers_found += 1;
653                continue;
654            }
655        }
656
657        // Try to parse as a backtrace line.
658        if let Some(parsed_bt) = backtrace_regex.captures(trimmed_line) {
659            let address_str = parsed_bt.get(1).unwrap().as_str();
660            if let Ok(ip_addr) = u64::from_str_radix(address_str.trim_start_matches("0x"), 16) {
661                ips.push(ip_addr);
662            }
663        }
664    }
665
666    if pid == None || tid == None || ips.is_empty() {
667        // This data chunk might've been an {{{mmap}}} chunk, and not a {{{bt}}}.
668        log_info!("No ips while getting PerfRecordSample");
669        None
670    } else {
671        Some(PerfRecordSample { pid: pid, tid: tid, ips: ips })
672    }
673}
674
675async fn set_up_profiler(
676    sample_period: zx::MonotonicDuration,
677) -> Result<(profiler::SessionProxy, fidl::AsyncSocket), Errno> {
678    // Configuration for how we want to sample.
679    let sample = profiler::Sample {
680        callgraph: Some(profiler::CallgraphConfig {
681            strategy: Some(profiler::CallgraphStrategy::FramePointer),
682            ..Default::default()
683        }),
684        ..Default::default()
685    };
686
687    let sampling_config = profiler::SamplingConfig {
688        period: Some(sample_period.into_nanos() as u64),
689        timebase: Some(profiler::Counter::PlatformIndependent(profiler::CounterId::Nanoseconds)),
690        sample: Some(sample),
691        ..Default::default()
692    };
693
694    let tasks = vec![
695        // Should return ~300 samples for 100 millis.
696        profiler::Task::SystemWide(profiler::SystemWide {}),
697    ];
698    let targets = profiler::TargetConfig::Tasks(tasks);
699    let config = profiler::Config {
700        configs: Some(vec![sampling_config]),
701        target: Some(targets),
702        ..Default::default()
703    };
704    let (client, server) = fidl::Socket::create_stream();
705    let configure = profiler::SessionConfigureRequest {
706        output: Some(server),
707        config: Some(config),
708        ..Default::default()
709    };
710
711    let proxy = connect_to_protocol::<profiler::SessionMarker>()
712        .context("Error connecting to Profiler protocol");
713    let session_proxy: profiler::SessionProxy = match proxy {
714        Ok(p) => p.clone(),
715        Err(e) => return error!(EINVAL, e),
716    };
717
718    // Must configure before sampling start().
719    let config_request = session_proxy.configure(configure).await;
720    match config_request {
721        Ok(_) => Ok((session_proxy, fidl::AsyncSocket::from_socket(client))),
722        Err(e) => return error!(EINVAL, e),
723    }
724}
725
726// Collects samples and puts backtrace in VMO.
727// - Starts and stops sampling for a duration.
728// - Reads in the buffer from the socket for that duration in chunks.
729// - Parses the buffer backtraces into PERF_RECORD_SAMPLE format.
730// - Writes the PERF_RECORD_SAMPLE into VMO.
731async fn collect_sample(
732    session_proxy: profiler::SessionProxy,
733    mut client: fidl::AsyncSocket,
734    duration: Duration,
735    perf_data_vmo: &zx::Vmo,
736    data_head_pointer: &AtomicPtr<u64>,
737    sample_type: u64,
738    sample_id: u64,
739    sample_period: u64,
740    vmo_write_offset: u64,
741) -> Result<(), Errno> {
742    let start_request = profiler::SessionStartRequest {
743        buffer_results: Some(true),
744        buffer_size_mb: Some(8 as u64),
745        ..Default::default()
746    };
747    let _ = session_proxy.start(&start_request).await.expect("Failed to start profiling");
748
749    // Hardcode a duration so that samples can be collected. This is currently solely used to
750    // demonstrate that an E2E implementation of sample collection works.
751    track_stub!(
752        TODO("https://fxbug.dev/428974888"),
753        "[perf_event_open] don't hardcode sleep; test/user should decide sample duration"
754    );
755    let _ = fuchsia_async::Timer::new(duration).await;
756
757    let stats = session_proxy.stop().await;
758    let samples_collected = match stats {
759        Ok(stats) => stats.samples_collected.unwrap(),
760        Err(e) => return error!(EINVAL, e),
761    };
762
763    track_stub!(
764        TODO("https://fxbug.dev/422502681"),
765        "[perf_event_open] symbolize sample output and delete the below log_info"
766    );
767    log_info!("profiler samples_collected: {:?}", samples_collected);
768
769    // Peek at the first 8 bytes to determine if it's FXT or text.
770    let mut header = [0; 8];
771    let mut bytes_read = 0;
772    while bytes_read < 8 {
773        match client.read(&mut header[bytes_read..]).await {
774            Ok(0) => {
775                // Peer closed the socket. This is the normal end of the stream.
776                log_info!("[perf_event_open] Finished reading fxt record from socket.");
777                break;
778            }
779            Ok(n) => bytes_read += n,
780            Err(e) => {
781                log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
782                break;
783            }
784        }
785    }
786
787    if bytes_read > 0 {
788        if bytes_read == 8 && header == FXT_MAGIC_BYTES {
789            // FXT format.
790            let header_cursor = Cursor::new(header);
791            let reader = header_cursor.chain(client);
792            let (mut stream, _task) = SessionParser::new_async(reader);
793            while let Some(record_result) = stream.next().await {
794                match record_result {
795                    Ok(TraceRecord::Profiler(ProfilerRecord::Backtrace(backtrace))) => {
796                        let ips: Vec<u64> = backtrace.data;
797                        let pid = Some(backtrace.process.0 as u32);
798                        let tid = Some(backtrace.thread.0 as u32);
799                        let perf_record_sample = PerfRecordSample { pid, tid, ips };
800                        write_record_to_vmo(
801                            perf_record_sample,
802                            perf_data_vmo,
803                            data_head_pointer,
804                            sample_type,
805                            sample_id,
806                            sample_period,
807                            vmo_write_offset,
808                        );
809                    }
810                    Ok(_) => {
811                        // Ignore other records.
812                    }
813                    Err(e) => {
814                        log_warn!("[perf_event_open] Error parsing FXT: {:?}", e);
815                        break;
816                    }
817                }
818            }
819        } else {
820            // Text format.
821            // Read chunks of sampling data from socket in this buffer temporarily. We will parse
822            // the data and write it into the output VMO (the one mmap points to).
823            let mut buffer = vec![0; DEFAULT_CHUNK_SIZE];
824
825            loop {
826                // Attempt to read data. This awaits until data is available, EOF, or error.
827                // Ignore the first 8 bytes as it's the {{{reset}}} marker.
828                let socket_data = client.read(&mut buffer).await;
829
830                match socket_data {
831                    Ok(0) => {
832                        // Peer closed the socket. This is the normal end of the stream.
833                        log_info!("[perf_event_open] Finished reading from socket.");
834                        break;
835                    }
836                    Ok(bytes_read) => {
837                        // Receive data in format {{{...}}}.
838                        let received_data = match std::str::from_utf8(&buffer[..bytes_read]) {
839                            Ok(data) => data,
840                            Err(e) => return error!(EINVAL, e),
841                        };
842                        // Parse data to PerfRecordSample struct.
843                        if let Some(perf_record_sample) =
844                            parse_perf_record_sample_format(received_data)
845                        {
846                            write_record_to_vmo(
847                                perf_record_sample,
848                                perf_data_vmo,
849                                data_head_pointer,
850                                sample_type,
851                                sample_id,
852                                sample_period,
853                                vmo_write_offset,
854                            );
855                        }
856                    }
857                    Err(e) => {
858                        log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
859                        break;
860                    }
861                }
862            }
863        }
864    }
865
866    let reset_status = session_proxy.reset().await;
867    return match reset_status {
868        Ok(_) => Ok(()),
869        Err(e) => error!(EINVAL, e),
870    };
871}
872
873// Notifies other thread that we should start/stop sampling.
874// Once sampling is complete, that profiler session is no longer needed.
875// At that point, send back notification so that this is no longer blocking
876// (e.g. so that other profiler sessions can start).
877fn ping_receiver(
878    mut ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
879    command: IoctlOp,
880) {
881    log_info!("[perf_event_open] Received sampling command: {:?}", command);
882    let (profiling_complete_sender, profiling_complete_receiver) = sync_mpsc::channel::<()>();
883    match ioctl_sender.try_send((command, profiling_complete_sender)) {
884        Ok(_) => (),
885        Err(e) => {
886            if e.is_full() {
887                log_warn!("[perf_event_open] Failed to send {:?}: Channel full", command);
888            } else if e.is_disconnected() {
889                log_warn!("[perf_event_open] Failed to send {:?}: Receiver disconnected", command);
890            } else {
891                log_warn!("[perf_event_open] Failed to send {:?} due to {:?}", command, e.source());
892            }
893        }
894    };
895    // Block on / wait until profiling is complete before returning.
896    // This notifies that the profiler is free to be used for another session.
897    let _ = profiling_complete_receiver.recv().unwrap();
898}
899
900pub fn sys_perf_event_open(
901    locked: &mut Locked<Unlocked>,
902    current_task: &CurrentTask,
903    attr: UserRef<perf_event_attr>,
904    // Note that this is pid in Linux docs.
905    tid: tid_t,
906    cpu: i32,
907    group_fd: FdNumber,
908    _flags: u64,
909) -> Result<SyscallResult, Errno> {
910    // So far, the implementation only sets the read_data_format according to the "Reading results"
911    // section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html for a single event.
912    // Other features will be added in the future (see below track_stubs).
913    let perf_event_attrs: perf_event_attr = current_task.read_object(attr)?;
914
915    if tid == -1 && cpu == -1 {
916        return error!(EINVAL);
917    }
918
919    let target_task_type = match tid {
920        -1 => TargetTaskType::AllTasks,
921        0 => TargetTaskType::CurrentTask,
922        _ => {
923            track_stub!(TODO("https://fxbug.dev/409621963"), "[perf_event_open] implement tid > 0");
924            return error!(ENOSYS);
925        }
926    };
927    security::check_perf_event_open_access(
928        current_task,
929        target_task_type,
930        &perf_event_attrs,
931        perf_event_attrs.type_.try_into()?,
932    )?;
933
934    // Channel used to send info between notifier and spawned task thread.
935    // We somewhat arbitrarily picked 8 for now in case we get a bunch of ioctls that are in
936    // quick succession (instead of something lower).
937    let (sender, mut receiver) = future_mpsc::channel(8);
938
939    let page_size = zx::system_get_page_size() as u64;
940    let mut perf_event_file = PerfEventFileState::new(
941        perf_event_attrs,
942        0,
943        perf_event_attrs.disabled(),
944        perf_event_attrs.sample_type,
945        zx::Vmo::create(ESTIMATED_MMAP_BUFFER_SIZE).unwrap(),
946        page_size, // Start with this amount of offset, we can increment as we write.
947        sender,
948    );
949
950    let read_format = perf_event_attrs.read_format;
951
952    if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0
953        || (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0
954    {
955        // Only keep track of most_recent_enabled_time if we are currently in ENABLED state,
956        // as otherwise this param shouldn't be used for calculating anything.
957        if perf_event_file.disabled == 0 {
958            perf_event_file.most_recent_enabled_time =
959                zx::MonotonicInstant::get().into_nanos() as u64;
960        }
961        // Initialize this to 0 as we will need to return a time duration later during read().
962        perf_event_file.total_time_running = 0;
963    }
964
965    let event_id = READ_FORMAT_ID_GENERATOR.fetch_add(1, Ordering::Relaxed);
966    perf_event_file.rf_id = event_id;
967
968    if group_fd.raw() == -1 {
969        perf_event_file.sample_id = event_id;
970    } else {
971        let group_file = current_task.files.get(group_fd)?;
972        let group_file_object_id = group_file.id;
973        let perf_state = get_perf_state(&current_task.kernel);
974        let events = perf_state.format_id_lookup_table.lock();
975        if let Some(rf_id) = events.get(&group_file_object_id) {
976            perf_event_file.sample_id = *rf_id;
977        } else {
978            return error!(EINVAL);
979        }
980    }
981
982    if (read_format & perf_event_read_format_PERF_FORMAT_GROUP as u64) != 0 {
983        track_stub!(
984            TODO("https://fxbug.dev/402238049"),
985            "[perf_event_open] implement read_format group"
986        );
987        return error!(ENOSYS);
988    }
989    if (read_format & perf_event_read_format_PERF_FORMAT_LOST as u64) != 0 {
990        track_stub!(
991            TODO("https://fxbug.dev/402260383"),
992            "[perf_event_open] implement read_format lost"
993        );
994    }
995
996    // Set up notifier for handling ioctl calls to enable/disable sampling.
997    let mut vmo_handle_copy =
998        perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS);
999
1000    // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
1001    // This is always sound regardless of the union's tag.
1002    let sample_period_in_ticks = unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period };
1003    // The sample period from the PERF_COUNT_SW_CPU_CLOCK is
1004    // 1 nanosecond per tick. Convert this duration into zx::duration.
1005    let zx_sample_period = zx::MonotonicDuration::from_nanos(sample_period_in_ticks as i64);
1006
1007    let data_head_pointer = Arc::new(AtomicPtr::new(std::ptr::null_mut::<u64>()));
1008    // Pass cloned into the thread.
1009    let cloned_data_head_pointer = Arc::clone(&data_head_pointer);
1010
1011    let closure = async move |_: LockedAndTask<'_>| {
1012        // This loop will wait for messages from the sender.
1013        while let Some((command, profiling_complete_receiver)) = receiver.next().await {
1014            match command {
1015                IoctlOp::Enable => {
1016                    match set_up_profiler(zx_sample_period).await {
1017                        Ok((session_proxy, client)) => {
1018                            track_stub!(
1019                                TODO("https://fxbug.dev/422502681"),
1020                                "[perf_event_open] don't hardcode profiling duration"
1021                            );
1022
1023                            let handle = vmo_handle_copy
1024                                .as_mut()
1025                                .expect("Failed to get VMO handle")
1026                                .as_handle_ref()
1027                                .duplicate(zx::Rights::SAME_RIGHTS)
1028                                .unwrap();
1029
1030                            let _ = collect_sample(
1031                                session_proxy,
1032                                client,
1033                                Duration::from_millis(100),
1034                                &zx::Vmo::from(handle),
1035                                &*cloned_data_head_pointer,
1036                                perf_event_file.sample_type,
1037                                perf_event_file.sample_id,
1038                                sample_period_in_ticks,
1039                                perf_event_file.vmo_write_offset,
1040                            )
1041                            .await;
1042                            // Send notification that profiler session is over.
1043                            let _ = profiling_complete_receiver.send(());
1044                        }
1045                        Err(e) => {
1046                            log_warn!("Failed to profile: {}", e);
1047                        }
1048                    };
1049                }
1050            }
1051        }
1052        ()
1053    };
1054    let req = SpawnRequestBuilder::new()
1055        .with_debug_name("perf-event-sampler")
1056        .with_async_closure(closure)
1057        .build();
1058    current_task.kernel().kthreads.spawner().spawn_from_request(req);
1059
1060    let file = Box::new(PerfEventFile {
1061        _tid: tid,
1062        _cpu: cpu,
1063        perf_event_file: RwLock::new(perf_event_file),
1064        security_state: security::perf_event_alloc(current_task),
1065        data_head_pointer: data_head_pointer,
1066    });
1067    // TODO: https://fxbug.dev/404739824 - Confirm whether to handle this as a "private" node.
1068    let file_handle =
1069        Anon::new_private_file(locked, current_task, file, OpenFlags::RDWR, "[perf_event]");
1070    let file_object_id = file_handle.id;
1071    let file_descriptor: Result<FdNumber, Errno> =
1072        current_task.add_file(locked, file_handle, FdFlags::empty());
1073
1074    match file_descriptor {
1075        Ok(fd) => {
1076            if group_fd.raw() == -1 {
1077                let perf_state = get_perf_state(&current_task.kernel);
1078                let mut events = perf_state.format_id_lookup_table.lock();
1079                events.insert(file_object_id, event_id);
1080            }
1081            Ok(fd.into())
1082        }
1083        Err(_) => {
1084            track_stub!(
1085                TODO("https://fxbug.dev/402453955"),
1086                "[perf_event_open] implement remaining error handling"
1087            );
1088            error!(EMFILE)
1089        }
1090    }
1091}
// Syscalls for arch32 usage.
// Re-exports `sys_perf_event_open` under the name `sys_arch32_perf_event_open`. The module is
// only compiled when the target architecture is aarch64.
#[cfg(target_arch = "aarch64")]
mod arch32 {
    pub use super::sys_perf_event_open as sys_arch32_perf_event_open;
}
1097
1098#[cfg(target_arch = "aarch64")]
1099pub use arch32::*;
1100
1101use crate::mm::memory::MemoryObject;
1102use crate::mm::{MemoryAccessorExt, ProtectionFlags};
1103use crate::task::CurrentTask;
1104use crate::vfs::{
1105    Anon, FdFlags, FdNumber, FileObject, FileObjectId, FileObjectState, FileOps, InputBuffer,
1106    OutputBuffer,
1107};
1108use crate::{fileops_impl_nonseekable, fileops_impl_noop_sync};