// starnix_core/perf/mod.rs
1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::task::dynamic_thread_spawner::SpawnRequestBuilder;
6use anyhow::Context;
7use fidl_fuchsia_cpu_profiler as profiler;
8use fuchsia_component::client::connect_to_protocol;
9use futures::StreamExt;
10use futures::channel::mpsc as future_mpsc;
11use regex_lite::Regex;
12use std::collections::HashMap;
13use std::error::Error;
14use std::sync::atomic::{AtomicPtr, AtomicU64, Ordering};
15use std::sync::{Arc, mpsc as sync_mpsc};
16use zerocopy::{Immutable, IntoBytes};
17
18use futures::io::{AsyncReadExt, Cursor};
19use fxt::TraceRecord;
20use fxt::profiler::ProfilerRecord;
21use fxt::session::SessionParser;
22use seq_lock::SeqLock;
23use starnix_logging::{log_info, log_warn, track_stub};
24use starnix_sync::{FileOpsCore, Locked, Mutex, RwLock, Unlocked};
25use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
26use starnix_uapi::arch32::{
27    PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE, PERF_EVENT_IOC_ID,
28    PERF_EVENT_IOC_MODIFY_ATTRIBUTES, PERF_EVENT_IOC_PAUSE_OUTPUT, PERF_EVENT_IOC_PERIOD,
29    PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_REFRESH, PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_SET_BPF,
30    PERF_EVENT_IOC_SET_FILTER, PERF_EVENT_IOC_SET_OUTPUT, PERF_RECORD_MISC_KERNEL,
31    perf_event_sample_format_PERF_SAMPLE_CALLCHAIN, perf_event_sample_format_PERF_SAMPLE_ID,
32    perf_event_sample_format_PERF_SAMPLE_IDENTIFIER, perf_event_sample_format_PERF_SAMPLE_IP,
33    perf_event_sample_format_PERF_SAMPLE_PERIOD, perf_event_sample_format_PERF_SAMPLE_TID,
34    perf_event_type_PERF_RECORD_SAMPLE,
35};
36use starnix_uapi::errors::Errno;
37use starnix_uapi::open_flags::OpenFlags;
38use starnix_uapi::user_address::UserRef;
39use starnix_uapi::{
40    error, perf_event_attr, perf_event_header, perf_event_mmap_page__bindgen_ty_1,
41    perf_event_read_format_PERF_FORMAT_GROUP, perf_event_read_format_PERF_FORMAT_ID,
42    perf_event_read_format_PERF_FORMAT_LOST, perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED,
43    perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING, tid_t, uapi,
44};
45
46use crate::security::{self, TargetTaskType};
47use crate::task::{Kernel, LockedAndTask};
48
// Source of unique IDs handed out for read_format (PERF_FORMAT_ID) values.
static READ_FORMAT_ID_GENERATOR: AtomicU64 = AtomicU64::new(0);
// Default buffer size to read from socket (for sampling data).
const DEFAULT_CHUNK_SIZE: usize = 4096;
// 4096 * 10, page size * 10.
// If tests flake due to running out of buffer space, or if the profiling duration is
// significantly increased, this buffer size may need further adjustment (expansion).
const ESTIMATED_MMAP_BUFFER_SIZE: u64 = 40960;
// perf_event_header struct size: 32 + 16 + 16 bits = 64 bits = 8 bytes.
const PERF_EVENT_HEADER_SIZE: u16 = 8;
// FXT magic bytes (little endian). Used to distinguish FXT-format profiler
// output from plain-text output when reading from the socket.
const FXT_MAGIC_BYTES: [u8; 8] = [0x10, 0x00, 0x04, 0x46, 0x78, 0x54, 0x16, 0x00];
60
61mod event;
62pub use event::{TraceEvent, TraceEventQueue};
63
// Immutable leading portion of the perf mmap metadata page. It is placed
// ahead of the SeqLock-protected PerfMetadataValue in the perf data VMO
// (see get_memory()); its fields are written once and never change.
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataHeader {
    version: u32,        // version of this metadata page layout
    compat_version: u32, // lowest version this layout is compatible with
}
70
// Mutable portion of the perf mmap metadata page, mirroring the fields of
// the UAPI `perf_event_mmap_page` that follow `version`/`compat_version`
// and the `lock` sequence counter. It is wrapped in a SeqLock in
// get_memory() so userspace can detect concurrent updates; the SeqLock
// inserts the sequence counter between PerfMetadataHeader and this struct.
#[repr(C, packed)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataValue {
    index: u32,
    offset: i64,
    time_enabled: u64,
    time_running: u64,
    __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1,
    pmc_width: u16,
    time_shift: u16,
    time_mult: u32,
    time_offset: u64,
    time_zero: u64,
    size: u32,
    __reserved_1: u32,
    time_cycles: u64,
    time_mask: u64,
    __reserved: [u8; 928usize],
    // Ring-buffer head/tail bookkeeping; data_head is updated through
    // PerfEventFile::data_head_pointer after profiling.
    data_head: u64,
    data_tail: u64,
    data_offset: u64,
    data_size: u64,
    aux_head: u64,
    aux_tail: u64,
    aux_offset: u64,
    aux_size: u64,
}
98
// Kernel-wide perf_event_open bookkeeping, stored in (and retrieved from)
// the kernel expando via get_perf_state().
struct PerfState {
    // This table maps a group leader's file object id to its unique u64 "format ID".
    //
    // When a sample is generated for any event in a group, we use this
    // "format ID" from the group leader as the value for *both* the
    // `PERF_SAMPLE_ID` and `PERF_SAMPLE_IDENTIFIER` fields.
    format_id_lookup_table: Mutex<HashMap<FileObjectId, u64>>,
}
107
108impl Default for PerfState {
109    fn default() -> Self {
110        Self { format_id_lookup_table: Mutex::new(HashMap::new()) }
111    }
112}
113
// Returns the kernel-wide PerfState, lazily creating it on first use via
// the kernel expando.
fn get_perf_state(kernel: &Arc<Kernel>) -> Arc<PerfState> {
    kernel.expando.get_or_init(PerfState::default)
}
117
// Build-time assertion that the listed perf_event_attr fields have the same
// layout across the supported architectures, so the struct can be copied
// to/from userspace without per-arch translation.
uapi::check_arch_independent_layout! {
    perf_event_attr {
        type_, // "type" is a reserved keyword so add a trailing underscore.
        size,
        config,
        __bindgen_anon_1,
        sample_type,
        read_format,
        _bitfield_1,
        __bindgen_anon_2,
        bp_type,
        __bindgen_anon_3,
        __bindgen_anon_4,
        branch_sample_type,
        sample_regs_user,
        sample_stack_user,
        clockid,
        sample_regs_intr,
        aux_watermark,
        sample_max_stack,
        __reserved_2,
        aux_sample_size,
        __reserved_3,
        sig_data,
        config3,
    }
}
145
// Start/stop commands forwarded over PerfEventFileState::ioctl_sender to
// the task that drives sampling, in response to the ENABLE/DISABLE ioctls.
#[derive(Clone, Copy, Debug, PartialEq)]
enum IoctlOp {
    Enable,
    Disable,
}
151
// Per-fd mutable state for one perf event, guarded by the RwLock in
// PerfEventFile.
struct PerfEventFileState {
    // The perf_event_attr this event was opened with.
    attr: perf_event_attr,
    rf_value: u64, // "count" for the config we passed in for the event.
    // The most recent timestamp (ns) where we changed into an enabled state
    // i.e. the most recent time we got an ENABLE ioctl().
    most_recent_enabled_time: u64,
    // Sum of all previous enablement segment durations (ns). If we are
    // currently in an enabled state, explicitly does NOT include the current
    // segment.
    total_time_running: u64,
    // Unique ID reported when PERF_FORMAT_ID is requested (see read()).
    rf_id: u64,
    // ID written into samples for PERF_SAMPLE_ID / PERF_SAMPLE_IDENTIFIER.
    sample_id: u64,
    // Placeholder for PERF_FORMAT_LOST; currently unused (leading underscore).
    _rf_lost: u64,
    // Nonzero while the event is disabled; toggled by the ENABLE/DISABLE
    // ioctls (0 = enabled, 1 = disabled).
    disabled: u64,
    // Bitmask of perf_event_sample_format flags requested for this event.
    sample_type: u64,
    // Handle to blob that stores all the perf data that a user may want.
    // At the moment it only stores some metadata and backtraces (bts).
    perf_data_vmo: zx::Vmo,
    // Remember to increment this offset as the number of pages increases.
    // Currently we just have a bound of 1 page_size of information.
    vmo_write_offset: u64,
    // Channel used to send IoctlOps to start/stop sampling.
    ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
}
176
177// Have an implementation for PerfEventFileState because VMO
178// doesn't have Default so we can't derive it.
179impl PerfEventFileState {
180    fn new(
181        attr: perf_event_attr,
182        rf_value: u64,
183        disabled: u64,
184        sample_type: u64,
185        perf_data_vmo: zx::Vmo,
186        vmo_write_offset: u64,
187        ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
188    ) -> PerfEventFileState {
189        PerfEventFileState {
190            attr,
191            rf_value,
192            most_recent_enabled_time: 0,
193            total_time_running: 0,
194            rf_id: 0,
195            sample_id: 0,
196            _rf_lost: 0,
197            disabled,
198            sample_type,
199            perf_data_vmo,
200            vmo_write_offset,
201            ioctl_sender,
202        }
203    }
204}
205
// File backing a perf event fd returned by perf_event_open().
pub struct PerfEventFile {
    // Target thread requested at open time; currently unused (underscore).
    _tid: tid_t,
    // Target CPU requested at open time; currently unused (underscore).
    _cpu: i32,
    // Per-fd mutable state.
    perf_event_file: RwLock<PerfEventFileState>,
    // The security state for this PerfEventFile.
    pub security_state: security::PerfEventState,
    // Pointer to the perf_event_mmap_page metadata's data_head.
    // TODO(https://fxbug.dev/460203776) Remove Arc after figuring out
    // "borrowed value does not live long enough" issue.
    data_head_pointer: Arc<AtomicPtr<u64>>,
}
217
// PerfEventFile object that implements FileOps.
// See https://man7.org/linux/man-pages/man2/perf_event_open.2.html for
// implementation details.
// This object can be saved as a FileDescriptor.
impl FileOps for PerfEventFile {
    // Don't need to implement seek or sync for PerfEventFile.
    fileops_impl_nonseekable!();
    fileops_impl_noop_sync!();

    // Drops this file's entry from the kernel-wide format-ID lookup table
    // when the fd is closed.
    fn close(
        self: Box<Self>,
        _locked: &mut Locked<FileOpsCore>,
        file: &FileObjectState,
        current_task: &CurrentTask,
    ) {
        let perf_state = get_perf_state(&current_task.kernel);
        let mut events = perf_state.format_id_lookup_table.lock();
        events.remove(&file.id);
    }

    // See "Reading results" section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html.
    //
    // Serializes the event count plus any fields requested via
    // `attr.read_format` (TOTAL_TIME_ENABLED, TOTAL_TIME_RUNNING, ID) into
    // `data`. Fails with ENOSPC if the caller's buffer cannot hold the
    // whole result.
    fn read(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        _offset: usize,
        data: &mut dyn OutputBuffer,
    ) -> Result<usize, Errno> {
        // Create/calculate and return the ReadFormatData object.
        // If we create it earlier we might want to change it and it's immutable once created.
        let read_format_data = {
            // Once we get the `value` or count from kernel, we can change this to a read()
            // call instead of write().
            // The write lock is held for the whole computation so the count
            // and time fields are read/updated consistently.
            let mut perf_event_file = self.perf_event_file.write();

            security::check_perf_event_read_access(current_task, &self)?;

            let mut total_time_running_including_curr = perf_event_file.total_time_running;

            // Only update values if enabled (either by perf_event_attr or ioctl ENABLE call).
            if perf_event_file.disabled == 0 {
                // Calculate the value or "count" of the config we're interested in.
                // This value should reflect the value we are counting (defined in the config).
                // E.g. for PERF_COUNT_SW_CPU_CLOCK it would return the value from the CPU clock.
                // For now we just return rf_value + 1.
                track_stub!(
                    TODO("https://fxbug.dev/402938671"),
                    "[perf_event_open] implement read_format value"
                );
                perf_event_file.rf_value += 1;

                // Update time duration: add the in-progress enablement
                // segment (from the most recent ENABLE until now) on top of
                // the completed segments.
                let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
                total_time_running_including_curr +=
                    curr_time - perf_event_file.most_recent_enabled_time;
            }

            let mut output = Vec::<u8>::new();
            let value = perf_event_file.rf_value.to_ne_bytes();
            output.extend(value);

            let read_format = perf_event_file.attr.read_format;

            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 {
                // Total time (ns) event was enabled and running (currently same as TIME_RUNNING).
                output.extend(total_time_running_including_curr.to_ne_bytes());
            }
            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 {
                // Total time (ns) event was enabled and running (currently same as TIME_ENABLED).
                output.extend(total_time_running_including_curr.to_ne_bytes());
            }
            if (read_format & perf_event_read_format_PERF_FORMAT_ID as u64) != 0 {
                // Adds a 64-bit unique value that corresponds to the event group.
                output.extend(perf_event_file.rf_id.to_ne_bytes());
            }

            output
        };

        // The regular read() call allows the case where the bytes-we-want-to-read-in won't
        // fit in the output buffer. However, for perf_event_open's read(), "If you attempt to read
        // into a buffer that is not big enough to hold the data, the error ENOSPC results."
        if data.available() < read_format_data.len() {
            return error!(ENOSPC);
        }
        track_stub!(
            TODO("https://fxbug.dev/402453955"),
            "[perf_event_open] implement remaining error handling"
        );

        data.write(&read_format_data)
    }

    // Handles the perf_event ioctls. ENABLE/DISABLE toggle the counting
    // state (and, for period-based sampling events, ping the sampling task
    // over the ioctl channel); RESET zeroes the count; the remaining ops
    // are stubbed and return ENOSYS.
    fn ioctl(
        &self,
        _locked: &mut Locked<Unlocked>,
        _file: &FileObject,
        current_task: &CurrentTask,
        op: u32,
        _arg: SyscallArg,
    ) -> Result<SyscallResult, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/405463320"),
            "[perf_event_open] implement PERF_IOC_FLAG_GROUP"
        );
        security::check_perf_event_write_access(current_task, &self)?;
        let mut perf_event_file = self.perf_event_file.write();
        match op {
            PERF_EVENT_IOC_ENABLE => {
                // Only start a new enablement segment if we were disabled;
                // a redundant ENABLE must not reset the segment start time.
                if perf_event_file.disabled != 0 {
                    perf_event_file.disabled = 0; // 0 = false.
                    perf_event_file.most_recent_enabled_time =
                        zx::MonotonicInstant::get().into_nanos() as u64;
                }

                // If we are sampling, invoke the profiler and collect a sample.
                // Currently this is an example sample collection.
                track_stub!(
                    TODO("https://fxbug.dev/398914921"),
                    "[perf_event_open] implement full sampling features"
                );
                // Period-based sampling only: freq() == 0 means the union
                // holds sample_period rather than sample_freq.
                if perf_event_file.attr.freq() == 0
                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
                // This is always sound regardless of the union's tag.
                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
                {
                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Enable);
                }
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_DISABLE => {
                if perf_event_file.disabled == 0 {
                    perf_event_file.disabled = 1; // 1 = true.

                    // Update total_time_running now that the segment has ended.
                    let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
                    perf_event_file.total_time_running +=
                        curr_time - perf_event_file.most_recent_enabled_time;
                }
                if perf_event_file.attr.freq() == 0
                // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
                // This is always sound regardless of the union's tag.
                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
                {
                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Disable);
                }
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_RESET => {
                perf_event_file.rf_value = 0;
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_REFRESH
            | PERF_EVENT_IOC_PERIOD
            | PERF_EVENT_IOC_SET_OUTPUT
            | PERF_EVENT_IOC_SET_FILTER
            | PERF_EVENT_IOC_ID
            | PERF_EVENT_IOC_SET_BPF
            | PERF_EVENT_IOC_PAUSE_OUTPUT
            | PERF_EVENT_IOC_MODIFY_ATTRIBUTES
            | PERF_EVENT_IOC_QUERY_BPF => {
                track_stub!(
                    TODO("https://fxbug.dev/404941053"),
                    "[perf_event_open] implement remaining ioctl() calls"
                );
                return error!(ENOSYS);
            }
            _ => error!(ENOTTY),
        }
    }

    // TODO(https://fxbug.dev/460245383) match behavior when mmap() is called multiple times.
    // Gets called when mmap() is called.
    // Immediately before sampling, this should get called by the user (e.g. the test
    // or Perfetto). We will then write the metadata to the VMO and return the pointer to it.
    //
    // Writes the metadata page (PerfMetadataHeader + SeqLock + PerfMetadataValue)
    // into the perf data VMO, records a pointer to its data_head field, and
    // returns the VMO wrapped as a MemoryObject for mapping.
    fn get_memory(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        length: Option<usize>,
        _prot: ProtectionFlags,
    ) -> Result<Arc<MemoryObject>, Errno> {
        let buffer_size: u64 = length.unwrap_or(0) as u64;
        if buffer_size == 0 {
            return error!(EINVAL);
        }
        let page_size = zx::system_get_page_size() as u64;

        security::check_perf_event_read_access(current_task, &self)?;

        // TODO(https://fxbug.dev/460246292) confirm when to create metadata.
        // Create metadata structs. Currently we hardcode everything just to get
        // something E2E working.
        let metadata_header = PerfMetadataHeader { version: 1, compat_version: 2 };
        let metadata_value = PerfMetadataValue {
            index: 2,
            offset: 19337,
            time_enabled: 0,
            time_running: 0,
            __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1 { capabilities: 30 },
            pmc_width: 0,
            time_shift: 0,
            time_mult: 0,
            time_offset: 0,
            time_zero: 0,
            size: 0,
            __reserved_1: 0,
            time_cycles: 0,
            time_mask: 0,
            __reserved: [0; 928usize],
            data_head: page_size,
            // Start reading from 0; it is the user's responsibility to increment on their end.
            data_tail: 0,
            data_offset: page_size,
            // NOTE(review): this subtraction wraps if the requested mapping
            // length is smaller than one page -- TODO confirm callers always
            // request at least page_size + data bytes, or reject early.
            data_size: (buffer_size - page_size) as u64,
            aux_head: 0,
            aux_tail: 0,
            aux_offset: 0,
            aux_size: 0,
        };

        // Then, wrap metadata in a SeqLock so that user can be made aware of updates.
        // SeqLock is formatted thusly:
        //   header_struct : any size, values should not change
        //   sequence_counter : u32
        //   value_struct : any size, needs locking because each value can change
        // We split our perf_event_mmap_page accordingly. The `version` and `compat_version`
        // should not change while the params below the `lock` may change.
        // Sequence counter for `lock` param gets inserted between these via
        // the `SeqLock` implementation.
        let perf_event_file = self.perf_event_file.read();
        // VMO does not implement Copy trait. We duplicate the VMO handle
        // so that we can pass it to the SeqLock and the MemoryObject.
        let vmo_handle_copy = match perf_event_file
            .perf_data_vmo
            .as_handle_ref()
            .duplicate(zx::Rights::SAME_RIGHTS)
        {
            Ok(h) => h,
            Err(_) => return error!(EINVAL),
        };

        // SAFETY: This is ok right now because we are the only reference to this memory.
        // Once there are multiple references we should update this comment to confirm that
        // there are only atomic accesses to this memory (see seq_lock lib.rs for details).
        let mut seq_lock = match unsafe {
            SeqLock::new_from_vmo(metadata_header, metadata_value, vmo_handle_copy.into())
        } {
            Ok(s) => s,
            Err(_) => return error!(EINVAL),
        };

        // Now, the perf_data_vmo contains the full metadata page enclosed in a SeqLock.
        // Save data_head pointer so that we can write atomically to it after profiling.
        let metadata_struct = seq_lock.get_map_address() as *mut PerfMetadataValue;
        // SAFETY: This is ok as we previously set the exact format (PerfMetadataValue).
        let data_head_pointer = unsafe { std::ptr::addr_of_mut!((*metadata_struct).data_head) };
        self.data_head_pointer.store(data_head_pointer, Ordering::Release);

        // Hand back a second duplicate of the VMO for the actual mapping.
        match perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS) {
            Ok(vmo) => {
                let memory = MemoryObject::Vmo(vmo.into());
                return Ok(Arc::new(memory));
            }
            Err(_) => {
                track_stub!(
                    TODO("https://fxbug.dev/416323134"),
                    "[perf_event_open] handle get_memory() errors"
                );
                return error!(EINVAL);
            }
        };
    }

    // Writing to a perf event fd is not implemented; always returns ENOSYS.
    fn write(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        _offset: usize,
        _data: &mut dyn InputBuffer,
    ) -> Result<usize, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/394960158"),
            "[perf_event_open] implement perf event functions"
        );
        error!(ENOSYS)
    }
}
509
510// Given a PerfRecordSample struct, write it via the correct output format
511// (per https://man7.org/linux/man-pages/man2/perf_event_open.2.html) to the VMO.
512// We don't currently support all the sample_types listed in the docs.
513// Input:
514//    PerfRecordSample { pid: 5, tid: 10, nr: 3, ips[nr]: [111, 222, 333] }
515// Human-understandable output:
516//    9 1 40 111 5 10 3 111 222 333
517// Actual output (no spaces or \n in real output, just making it more readable):
518//    0x0000 0x0009                 <-- starts at `offset` bytes
519//    0x0001
520//    0x0040
521//    0x0000 0x0000 0x0000 0x006F   <-- starts at `offset` + 8 bytes
522//    0x0000 0x0000 0x0000 0x0005
523//    0x0000 0x0000 0x0000 0x0010
524//    0x0000 0x0000 0x0000 0x0003
525//    0x0000 0x0000 0x0000 0x006F
526//    0x0000 0x0000 0x0000 0x00DE
527//    0x0000 0x0000 0x0000 0x014D
528//
529//    Returns the length of bytes written. In above case, 8 + 28 = 36.
530//    This information is used to increment the global offset.
531fn write_record_to_vmo(
532    perf_record_sample: PerfRecordSample,
533    perf_data_vmo: &zx::Vmo,
534    _data_head_pointer: &AtomicPtr<u64>,
535    sample_type: u64,
536    sample_id: u64,
537    sample_period: u64,
538    offset: u64,
539) -> u64 {
540    // Write header.
541    track_stub!(
542        TODO("https://fxbug.dev/432501467"),
543        "[perf_event_open] determines whether the record is KERNEL or USER"
544    );
545    let perf_event_header = perf_event_header {
546        type_: perf_event_type_PERF_RECORD_SAMPLE,
547        misc: PERF_RECORD_MISC_KERNEL as u16,
548        size: PERF_EVENT_HEADER_SIZE,
549    };
550
551    match perf_data_vmo.write(&perf_event_header.as_bytes(), offset) {
552        Ok(_) => (),
553        Err(e) => log_warn!("Failed to write perf_event_header: {}", e),
554    }
555
556    // Write sample.
557    let mut sample = Vec::<u8>::new();
558    // sample_id
559    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IDENTIFIER as u64) != 0 {
560        sample.extend(sample_id.to_ne_bytes());
561    }
562    // ip
563    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IP as u64) != 0 {
564        sample.extend(perf_record_sample.ips[0].to_ne_bytes());
565    }
566
567    if (sample_type & perf_event_sample_format_PERF_SAMPLE_TID as u64) != 0 {
568        // pid
569        sample.extend(perf_record_sample.pid.expect("missing pid").to_ne_bytes());
570        // tid
571        sample.extend(perf_record_sample.tid.expect("missing tid").to_ne_bytes());
572    }
573
574    // id
575    if (sample_type & perf_event_sample_format_PERF_SAMPLE_ID as u64) != 0 {
576        sample.extend(sample_id.to_ne_bytes());
577    }
578
579    // sample period
580    if (sample_type & perf_event_sample_format_PERF_SAMPLE_PERIOD as u64) != 0 {
581        sample.extend(sample_period.to_ne_bytes());
582    }
583
584    if (sample_type & perf_event_sample_format_PERF_SAMPLE_CALLCHAIN as u64) != 0 {
585        // nr
586        sample.extend(perf_record_sample.ips.len().to_ne_bytes());
587
588        // ips[nr] - list of ips, u64 per ip.
589        for i in perf_record_sample.ips {
590            sample.extend(i.to_ne_bytes());
591        }
592    }
593    // The remaining data are not defined for now.
594
595    match perf_data_vmo.write(&sample, offset + (std::mem::size_of::<perf_event_header>() as u64)) {
596        Ok(_) => {
597            let bytes_written: u64 =
598                (std::mem::size_of::<perf_event_header>() + sample.len()) as u64;
599
600            // TODO(http://fuchsia.dev/460203776) implement this better before enabling
601            // any setting of data_head value.
602            // Update data_head because we have now written to the VMO.
603            // Ordering::Release pushes update that this (and, transitively, the sample
604            // too) has updated.
605            // data_head_pointer.fetch_add(bytes_written, Ordering::Release);
606
607            // Return the total size we wrote (header + sample) so that we can
608            // increment offset counter.
609            return bytes_written;
610        }
611        Err(e) => {
612            log_warn!("Failed to write PerfRecordSample to VMO due to: {}", e);
613            // Failed to write. Don't increment offset counter.
614            return 0;
615        }
616    }
617}
618
// In-memory representation of one PERF_RECORD_SAMPLE before it is
// serialized by write_record_to_vmo().
#[derive(Debug, Clone)]
struct PerfRecordSample {
    pid: Option<u32>,
    tid: Option<u32>,
    // Instruction pointers (currently this is the address). First one is `ip` param.
    ips: Vec<u64>,
}
626
627// Parses a backtrace (bt) to obtain the params for a PerfRecordSample. Example:
628//
629// 1234                     pid
630// 5555                     tid
631// {{{bt:0:0x1111:pc}}}    {{{bt:frame_number:address:type}}}
632// {{{bt:1:0x2222:ra}}}
633// {{{bt:2:0x3333:ra}}}
634//
635// Results in:
636// PerfRecordSample { pid: 1234, tid: 5555, nr: 3, ips: [0x1111, 0x2222, 0x3333] }
637
638fn parse_perf_record_sample_format(backtrace: &str) -> Option<PerfRecordSample> {
639    let mut pid: Option<u32> = None;
640    let mut tid: Option<u32> = None;
641    let mut ips: Vec<u64> = Vec::new();
642    let mut numbers_found = 0;
643    track_stub!(TODO("https://fxbug.dev/437171287"), "[perf_event_open] handle regex nuances");
644    let backtrace_regex =
645        Regex::new(r"^\s*\{\{\{bt:\d+:((0x[0-9a-fA-F]+)):(?:pc|ra)\}\}\}\s*$").unwrap();
646
647    for line in backtrace.lines() {
648        let trimmed_line = line.trim();
649        // Try to parse as a raw number (for PID/TID).
650        if numbers_found < 2 {
651            if let Ok(num) = trimmed_line.parse::<u32>() {
652                if numbers_found == 0 {
653                    pid = Some(num);
654                } else {
655                    tid = Some(num);
656                }
657                numbers_found += 1;
658                continue;
659            }
660        }
661
662        // Try to parse as a backtrace line.
663        if let Some(parsed_bt) = backtrace_regex.captures(trimmed_line) {
664            let address_str = parsed_bt.get(1).unwrap().as_str();
665            if let Ok(ip_addr) = u64::from_str_radix(address_str.trim_start_matches("0x"), 16) {
666                ips.push(ip_addr);
667            }
668        }
669    }
670
671    if pid == None || tid == None || ips.is_empty() {
672        // This data chunk might've been an {{{mmap}}} chunk, and not a {{{bt}}}.
673        log_info!("No ips while getting PerfRecordSample");
674        None
675    } else {
676        Some(PerfRecordSample { pid: pid, tid: tid, ips: ips })
677    }
678}
679
680async fn set_up_profiler(
681    sample_period: zx::MonotonicDuration,
682) -> Result<(profiler::SessionProxy, fidl::AsyncSocket), Errno> {
683    // Configuration for how we want to sample.
684    let sample = profiler::Sample {
685        callgraph: Some(profiler::CallgraphConfig {
686            strategy: Some(profiler::CallgraphStrategy::FramePointer),
687            ..Default::default()
688        }),
689        ..Default::default()
690    };
691
692    let sampling_config = profiler::SamplingConfig {
693        period: Some(sample_period.into_nanos() as u64),
694        timebase: Some(profiler::Counter::PlatformIndependent(profiler::CounterId::Nanoseconds)),
695        sample: Some(sample),
696        ..Default::default()
697    };
698
699    let tasks = vec![
700        // Should return ~300 samples for 100 millis.
701        profiler::Task::SystemWide(profiler::SystemWide {}),
702    ];
703    let targets = profiler::TargetConfig::Tasks(tasks);
704    let config = profiler::Config {
705        configs: Some(vec![sampling_config]),
706        target: Some(targets),
707        ..Default::default()
708    };
709    let (client, server) = fidl::Socket::create_stream();
710    let configure = profiler::SessionConfigureRequest {
711        output: Some(server),
712        config: Some(config),
713        ..Default::default()
714    };
715
716    let proxy = connect_to_protocol::<profiler::SessionMarker>()
717        .context("Error connecting to Profiler protocol");
718    let session_proxy: profiler::SessionProxy = match proxy {
719        Ok(p) => p.clone(),
720        Err(e) => return error!(EINVAL, e),
721    };
722
723    // Must configure before sampling start().
724    let config_request = session_proxy.configure(configure).await;
725    match config_request {
726        Ok(_) => Ok((session_proxy, fidl::AsyncSocket::from_socket(client))),
727        Err(e) => return error!(EINVAL, e),
728    }
729}
730
731// Collects samples and puts backtrace in VMO.
732// - Reads in the buffer from the socket for that duration in chunks.
733// - Parses the buffer backtraces into PERF_RECORD_SAMPLE format.
734// - Writes the PERF_RECORD_SAMPLE into VMO.
735async fn stop_and_collect_samples(
736    session_proxy: profiler::SessionProxy,
737    mut client: fidl::AsyncSocket,
738    perf_data_vmo: &zx::Vmo,
739    data_head_pointer: &AtomicPtr<u64>,
740    sample_type: u64,
741    sample_id: u64,
742    sample_period: u64,
743    vmo_write_offset: u64,
744) -> Result<(), Errno> {
745    let stats = session_proxy.stop().await;
746    let samples_collected = match stats {
747        Ok(stats) => stats.samples_collected.unwrap(),
748        Err(e) => return error!(EINVAL, e),
749    };
750
751    track_stub!(
752        TODO("https://fxbug.dev/422502681"),
753        "[perf_event_open] symbolize sample output and delete the below log_info"
754    );
755    log_info!("profiler samples_collected: {:?}", samples_collected);
756
757    // Peek at the first 8 bytes to determine if it's FXT or text.
758    let mut header = [0; 8];
759    let mut bytes_read = 0;
760    while bytes_read < 8 {
761        match client.read(&mut header[bytes_read..]).await {
762            Ok(0) => {
763                // Peer closed the socket. This is the normal end of the stream.
764                log_info!("[perf_event_open] Finished reading fxt record from socket.");
765                break;
766            }
767            Ok(n) => bytes_read += n,
768            Err(e) => {
769                log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
770                break;
771            }
772        }
773    }
774
775    if bytes_read > 0 {
776        if bytes_read == 8 && header == FXT_MAGIC_BYTES {
777            // FXT format.
778            let header_cursor = Cursor::new(header);
779            let reader = header_cursor.chain(client);
780            let (mut stream, _task) = SessionParser::new_async(reader);
781            while let Some(record_result) = stream.next().await {
782                match record_result {
783                    Ok(TraceRecord::Profiler(ProfilerRecord::Backtrace(backtrace))) => {
784                        let ips: Vec<u64> = backtrace.data;
785                        let pid = Some(backtrace.process.0 as u32);
786                        let tid = Some(backtrace.thread.0 as u32);
787                        let perf_record_sample = PerfRecordSample { pid, tid, ips };
788                        write_record_to_vmo(
789                            perf_record_sample,
790                            perf_data_vmo,
791                            data_head_pointer,
792                            sample_type,
793                            sample_id,
794                            sample_period,
795                            vmo_write_offset,
796                        );
797                    }
798                    Ok(_) => {
799                        // Ignore other records.
800                    }
801                    Err(e) => {
802                        log_warn!("[perf_event_open] Error parsing FXT: {:?}", e);
803                        break;
804                    }
805                }
806            }
807        } else {
808            // Text format.
809            // Read chunks of sampling data from socket in this buffer temporarily. We will parse
810            // the data and write it into the output VMO (the one mmap points to).
811            let mut buffer = vec![0; DEFAULT_CHUNK_SIZE];
812
813            loop {
814                // Attempt to read data. This awaits until data is available, EOF, or error.
815                // Ignore the first 8 bytes as it's the {{{reset}}} marker.
816                let socket_data = client.read(&mut buffer).await;
817
818                match socket_data {
819                    Ok(0) => {
820                        // Peer closed the socket. This is the normal end of the stream.
821                        log_info!("[perf_event_open] Finished reading from socket.");
822                        break;
823                    }
824                    Ok(bytes_read) => {
825                        // Receive data in format {{{...}}}.
826                        let received_data = match std::str::from_utf8(&buffer[..bytes_read]) {
827                            Ok(data) => data,
828                            Err(e) => return error!(EINVAL, e),
829                        };
830                        // Parse data to PerfRecordSample struct.
831                        if let Some(perf_record_sample) =
832                            parse_perf_record_sample_format(received_data)
833                        {
834                            write_record_to_vmo(
835                                perf_record_sample,
836                                perf_data_vmo,
837                                data_head_pointer,
838                                sample_type,
839                                sample_id,
840                                sample_period,
841                                vmo_write_offset,
842                            );
843                        }
844                    }
845                    Err(e) => {
846                        log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
847                        break;
848                    }
849                }
850            }
851        }
852    }
853
854    let reset_status = session_proxy.reset().await;
855    return match reset_status {
856        Ok(_) => Ok(()),
857        Err(e) => error!(EINVAL, e),
858    };
859}
860
861// Notifies other thread that we should start/stop sampling.
862// Once sampling is complete, that profiler session is no longer needed.
863// At that point, send back notification so that this is no longer blocking
864// (e.g. so that other profiler sessions can start).
865fn ping_receiver(
866    mut ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
867    command: IoctlOp,
868) {
869    log_info!("[perf_event_open] Received sampling command: {:?}", command);
870    let (profiling_complete_sender, profiling_complete_receiver) = sync_mpsc::channel::<()>();
871    match ioctl_sender.try_send((command, profiling_complete_sender)) {
872        Ok(_) => (),
873        Err(e) => {
874            if e.is_full() {
875                log_warn!("[perf_event_open] Failed to send {:?}: Channel full", command);
876            } else if e.is_disconnected() {
877                log_warn!("[perf_event_open] Failed to send {:?}: Receiver disconnected", command);
878            } else {
879                log_warn!("[perf_event_open] Failed to send {:?} due to {:?}", command, e.source());
880            }
881        }
882    };
883    // Block on / wait until profiling is complete before returning.
884    // This notifies that the profiler is free to be used for another session.
885    let _ = profiling_complete_receiver.recv().unwrap();
886}
887
/// Partial implementation of the `perf_event_open(2)` syscall.
///
/// Reads the `perf_event_attr` struct from userspace, validates the
/// `tid`/`cpu` combination, performs a security access check, and spawns a
/// kernel worker thread that starts/stops a Fuchsia CPU profiler session in
/// response to enable/disable ioctl commands delivered over an mpsc channel.
/// Returns a new file descriptor wrapping a `PerfEventFile`.
///
/// Not yet supported (returns ENOSYS): `tid > 0` and `PERF_FORMAT_GROUP`.
/// Returns EINVAL for an invalid tid/cpu combination or an unknown
/// `group_fd`, and EMFILE if the fd cannot be added to the task.
pub fn sys_perf_event_open(
    locked: &mut Locked<Unlocked>,
    current_task: &CurrentTask,
    attr: UserRef<perf_event_attr>,
    // Note that this is pid in Linux docs.
    tid: tid_t,
    cpu: i32,
    group_fd: FdNumber,
    _flags: u64,
) -> Result<SyscallResult, Errno> {
    // So far, the implementation only sets the read_data_format according to the "Reading results"
    // section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html for a single event.
    // Other features will be added in the future (see below track_stubs).
    let perf_event_attrs: perf_event_attr = current_task.read_object(attr)?;

    // tid == -1 && cpu == -1 is invalid per the man page (no target at all).
    if tid == -1 && cpu == -1 {
        return error!(EINVAL);
    }

    let target_task_type = match tid {
        -1 => TargetTaskType::AllTasks,
        0 => TargetTaskType::CurrentTask,
        _ => {
            track_stub!(TODO("https://fxbug.dev/409621963"), "[perf_event_open] implement tid > 0");
            return error!(ENOSYS);
        }
    };
    security::check_perf_event_open_access(
        current_task,
        target_task_type,
        &perf_event_attrs,
        perf_event_attrs.type_.try_into()?,
    )?;

    // Channel used to send info between notifier and spawned task thread.
    // We somewhat arbitrarily picked 8 for now in case we get a bunch of ioctls that are in
    // quick succession (instead of something lower).
    let (sender, mut receiver) = future_mpsc::channel(8);

    let page_size = zx::system_get_page_size() as u64;
    let mut perf_event_file = PerfEventFileState::new(
        perf_event_attrs,
        0,
        perf_event_attrs.disabled(),
        perf_event_attrs.sample_type,
        zx::Vmo::create(ESTIMATED_MMAP_BUFFER_SIZE).unwrap(),
        page_size, // Start with this amount of offset, we can increment as we write.
        sender,
    );

    let read_format = perf_event_attrs.read_format;

    if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0
        || (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0
    {
        // Only keep track of most_recent_enabled_time if we are currently in ENABLED state,
        // as otherwise this param shouldn't be used for calculating anything.
        if perf_event_file.disabled == 0 {
            perf_event_file.most_recent_enabled_time =
                zx::MonotonicInstant::get().into_nanos() as u64;
        }
        // Initialize this to 0 as we will need to return a time duration later during read().
        perf_event_file.total_time_running = 0;
    }

    // Each event gets a unique id from a global atomic counter.
    let event_id = READ_FORMAT_ID_GENERATOR.fetch_add(1, Ordering::Relaxed);
    perf_event_file.rf_id = event_id;

    if group_fd.raw() == -1 {
        // No group leader: this event is its own sample id.
        perf_event_file.sample_id = event_id;
    } else {
        // Inherit the sample id from the group leader's file, looked up via
        // the kernel-wide format-id table; unknown group_fd is EINVAL.
        let group_file = current_task.get_file(group_fd)?;
        let group_file_object_id = group_file.id;
        let perf_state = get_perf_state(&current_task.kernel);
        let events = perf_state.format_id_lookup_table.lock();
        if let Some(rf_id) = events.get(&group_file_object_id) {
            perf_event_file.sample_id = *rf_id;
        } else {
            return error!(EINVAL);
        }
    }

    if (read_format & perf_event_read_format_PERF_FORMAT_GROUP as u64) != 0 {
        track_stub!(
            TODO("https://fxbug.dev/402238049"),
            "[perf_event_open] implement read_format group"
        );
        return error!(ENOSYS);
    }
    if (read_format & perf_event_read_format_PERF_FORMAT_LOST as u64) != 0 {
        track_stub!(
            TODO("https://fxbug.dev/402260383"),
            "[perf_event_open] implement read_format lost"
        );
    }

    // Set up notifier for handling ioctl calls to enable/disable sampling.
    let mut vmo_handle_copy =
        perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS);

    // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
    // This is always sound regardless of the union's tag.
    let sample_period_in_ticks = unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period };
    // The sample period from the PERF_COUNT_SW_CPU_CLOCK is
    // 1 nanosecond per tick. Convert this duration into zx::duration.
    let zx_sample_period = zx::MonotonicDuration::from_nanos(sample_period_in_ticks as i64);

    // Shared pointer to the mmap'd data_head word; the worker thread updates
    // it via the clone while the file keeps the original.
    let data_head_pointer = Arc::new(AtomicPtr::new(std::ptr::null_mut::<u64>()));
    // Pass cloned into the thread.
    let cloned_data_head_pointer = Arc::clone(&data_head_pointer);

    let closure = async move |_: LockedAndTask<'_>| {
        // Live profiler session (proxy + result socket), present only between
        // an Enable and the matching Disable.
        let mut profiler_state: Option<(profiler::SessionProxy, fidl::AsyncSocket)> = None;

        // This loop will wait for messages from the sender.
        while let Some((command, profiling_complete_receiver)) = receiver.next().await {
            match command {
                IoctlOp::Enable => {
                    match set_up_profiler(zx_sample_period).await {
                        Ok((session_proxy, client)) => {
                            let start_request = profiler::SessionStartRequest {
                                buffer_results: Some(true),
                                buffer_size_mb: Some(8 as u64),
                                ..Default::default()
                            };
                            if let Err(e) = session_proxy.start(&start_request).await {
                                log_warn!("Failed to start profiling: {}", e);
                            } else {
                                profiler_state = Some((session_proxy, client));
                            }
                        }
                        Err(e) => {
                            log_warn!("Failed to profile: {}", e);
                        }
                    };
                    // Send notification anyway to unblock the ioctl caller.
                    let _ = profiling_complete_receiver.send(());
                }
                IoctlOp::Disable => {
                    if let Some((session_proxy, client)) = profiler_state.take() {
                        // Duplicate the VMO handle so an owned zx::Vmo can be
                        // handed to stop_and_collect_samples below.
                        let handle = vmo_handle_copy
                            .as_mut()
                            .expect("Failed to get VMO handle")
                            .as_handle_ref()
                            .duplicate(zx::Rights::SAME_RIGHTS)
                            .unwrap();

                        if let Err(e) = stop_and_collect_samples(
                            session_proxy,
                            client,
                            &zx::Vmo::from(handle),
                            &*cloned_data_head_pointer,
                            perf_event_file.sample_type,
                            perf_event_file.sample_id,
                            sample_period_in_ticks,
                            perf_event_file.vmo_write_offset,
                        )
                        .await
                        {
                            log_warn!("Failed to collect sample: {:?}", e);
                        }
                    }
                    // Send notification anyway to unblock the ioctl caller.
                    let _ = profiling_complete_receiver.send(());
                }
            }
        }
        ()
    };
    let req = SpawnRequestBuilder::new()
        .with_debug_name("perf-event-sampler")
        .with_async_closure(closure)
        .build();
    current_task.kernel().kthreads.spawner().spawn_from_request(req);

    let file = Box::new(PerfEventFile {
        _tid: tid,
        _cpu: cpu,
        perf_event_file: RwLock::new(perf_event_file),
        security_state: security::perf_event_alloc(current_task),
        data_head_pointer: data_head_pointer,
    });
    // TODO: https://fxbug.dev/404739824 - Confirm whether to handle this as a "private" node.
    let file_handle =
        Anon::new_private_file(locked, current_task, file, OpenFlags::RDWR, "[perf_event]");
    let file_object_id = file_handle.id;
    let file_descriptor: Result<FdNumber, Errno> =
        current_task.add_file(locked, file_handle, FdFlags::empty());

    match file_descriptor {
        Ok(fd) => {
            // Register group leaders in the lookup table so later opens with
            // group_fd pointing at this fd can find its sample id.
            if group_fd.raw() == -1 {
                let perf_state = get_perf_state(&current_task.kernel);
                let mut events = perf_state.format_id_lookup_table.lock();
                events.insert(file_object_id, event_id);
            }
            Ok(fd.into())
        }
        Err(_) => {
            track_stub!(
                TODO("https://fxbug.dev/402453955"),
                "[perf_event_open] implement remaining error handling"
            );
            error!(EMFILE)
        }
    }
}
// Syscalls for arch32 usage.
// On arm64, the 32-bit (arch32) syscall table reuses the 64-bit
// implementation directly; this module just re-exports it under the
// arch32 name.
#[cfg(target_arch = "aarch64")]
mod arch32 {
    pub use super::sys_perf_event_open as sys_arch32_perf_event_open;
}
1100
1101#[cfg(target_arch = "aarch64")]
1102pub use arch32::*;
1103
1104use crate::mm::memory::MemoryObject;
1105use crate::mm::{MemoryAccessorExt, ProtectionFlags};
1106use crate::task::CurrentTask;
1107use crate::vfs::{
1108    Anon, FdFlags, FdNumber, FileObject, FileObjectId, FileObjectState, FileOps, InputBuffer,
1109    OutputBuffer,
1110};
1111use crate::{fileops_impl_nonseekable, fileops_impl_noop_sync};