Skip to main content

starnix_core/perf/
event.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::sync::Arc;
6use std::sync::atomic::{AtomicBool, Ordering};
7
8use crate::perf::lockless_ring_buffer::LocklessRingBuffer;
9use crate::task::Kernel;
10use crate::vfs::OutputBuffer;
11use starnix_logging::log_error;
12use starnix_uapi::errors::Errno;
13use zerocopy::native_endian::{I32, U16, U32, U64};
14use zerocopy::{Immutable, IntoBytes, Unaligned};
15use zx::BootTimeline;
16
17// The default ring buffer size (2MB).
18// TODO(https://fxbug.dev/357665908): This should be based on /sys/kernel/tracing/buffer_size_kb.
19const DEFAULT_RING_BUFFER_SIZE_BYTES: usize = 2097152;
20
21// The event id for atrace events.
22const FTRACE_PRINT_ID: U16 = U16::new(5);
23
24// Used for inspect tracking.
25const DROPPED_PAGES: &str = "dropped_pages";
26
27const MAX_TIME_DELTA_NANOS: u32 = (1 << 27) - 1;
28
29#[repr(C)]
30#[derive(Debug, Default, IntoBytes, Immutable, Unaligned)]
31struct PrintEventHeader {
32    common_type: U16,
33    common_flags: u8,
34    common_preempt_count: u8,
35    common_pid: I32,
36    ip: U64,
37}
38
39#[repr(C)]
40#[derive(Debug, IntoBytes, Immutable, Unaligned)]
41struct PrintEvent {
42    header: PrintEventHeader,
43}
44
45impl PrintEvent {
46    fn new(pid: i32) -> Self {
47        Self {
48            header: PrintEventHeader {
49                common_type: FTRACE_PRINT_ID,
50                common_pid: I32::new(pid),
51                // Perfetto doesn't care about any other field.
52                ..Default::default()
53            },
54        }
55    }
56
57    fn size(&self) -> usize {
58        std::mem::size_of::<PrintEventHeader>()
59    }
60}
61
62#[repr(C)]
63#[derive(Debug, Default, IntoBytes, PartialEq, Immutable, Unaligned)]
64struct TraceEventHeader {
65    // u32 where:
66    //   type_or_length: bottom 5 bits. If 0, `data` is read for length. Always set to 0 for now.
67    //   time_delta: top 27 bits
68    time_delta: U32,
69
70    // If type_or_length is 0, holds the length of the trace message.
71    // We always write length here for simplicity.
72    data: U32,
73}
74
75impl TraceEventHeader {
76    fn new(size: usize) -> Self {
77        // The size reported in the event's header includes the size of `size` (a u32) and the size
78        // of the event data.
79        let size = (std::mem::size_of::<u32>() + size) as u32;
80        Self { time_delta: U32::new(0), data: U32::new(size) }
81    }
82
83    fn set_time_delta(&mut self, nanos: u32) {
84        // The max delta is capped at MAX_TIME_DELTA_NANOS so it fits into 27 bits.
85        // another option is to just shift it over, but that could lead to other misleading
86        // deltas if the value is large.
87        let saturated_nanos = nanos.min(MAX_TIME_DELTA_NANOS);
88        // Move into the high 27 bits reserving the lower 5 bits for the type_or_length value.
89        self.time_delta = U32::new(saturated_nanos << 5);
90    }
91}
92
93#[repr(C)]
94#[derive(Debug, IntoBytes, Immutable, Unaligned)]
95pub struct TraceEvent {
96    /// Common metadata among all trace event types.
97    header: TraceEventHeader, // u64
98
99    /// The event data.
100    ///
101    /// Atrace events are reported as PrintFtraceEvents. When we support multiple types of events,
102    /// this can be updated to be more generic.
103    event: PrintEvent,
104}
105
106impl TraceEvent {
107    pub fn new(pid: i32, data_len: usize) -> Self {
108        let event = PrintEvent::new(pid);
109        // +1 because we append a trailing '\n' to the data when we serialize.
110        let header = TraceEventHeader::new(event.size() + data_len + 1);
111        Self { header, event }
112    }
113
114    fn size(&self) -> usize {
115        // The header's data size doesn't include the time_delta size.
116        std::mem::size_of::<u32>() + self.header.data.get() as usize
117    }
118}
119
120/// Stores all trace events.
121pub struct TraceEventQueue {
122    /// The trace events.
123    ring_buffer: Arc<LocklessRingBuffer>,
124
125    /// Async ID for read track grouping.
126    pub async_id_read: fuchsia_trace::Id,
127
128    /// Async ID for write track grouping.
129    pub async_id_write: fuchsia_trace::Id,
130
131    /// CPU ID for trace tracks.
132    pub cpu_id: u32,
133}
134
135const STATIC_READ_TRACK_NAMES: [&str; 8] = [
136    "Tracefs read 0",
137    "Tracefs read 1",
138    "Tracefs read 2",
139    "Tracefs read 3",
140    "Tracefs read 4",
141    "Tracefs read 5",
142    "Tracefs read 6",
143    "Tracefs read 7",
144];
145
146const STATIC_WRITE_TRACK_NAMES: [&str; 8] = [
147    "Tracefs write 0",
148    "Tracefs write 1",
149    "Tracefs write 2",
150    "Tracefs write 3",
151    "Tracefs write 4",
152    "Tracefs write 5",
153    "Tracefs write 6",
154    "Tracefs write 7",
155];
156
157impl<'a> TraceEventQueue {
158    pub(crate) fn new(cpu_id: u32) -> Result<Self, Errno> {
159        let async_id_read = fuchsia_trace::Id::new();
160        let async_id_write = fuchsia_trace::Id::new();
161        let ring_buffer = Arc::new(
162            LocklessRingBuffer::new(DEFAULT_RING_BUFFER_SIZE_BYTES, true, async_id_write)
163                .map_err(|_| starnix_uapi::errno!(ENOMEM))?,
164        );
165        ring_buffer.disable()?;
166
167        Ok(Self { ring_buffer, async_id_read, async_id_write, cpu_id })
168    }
169
170    pub fn read_track_name(&self) -> std::borrow::Cow<'static, str> {
171        if let Some(&name) = STATIC_READ_TRACK_NAMES.get(self.cpu_id as usize) {
172            std::borrow::Cow::Borrowed(name)
173        } else {
174            std::borrow::Cow::Owned(format!("Tracefs read {}", self.cpu_id))
175        }
176    }
177
178    pub fn write_track_name(&self) -> std::borrow::Cow<'static, str> {
179        if let Some(&name) = STATIC_WRITE_TRACK_NAMES.get(self.cpu_id as usize) {
180            std::borrow::Cow::Borrowed(name)
181        } else {
182            std::borrow::Cow::Owned(format!("Tracefs write {}", self.cpu_id))
183        }
184    }
185
186    fn enable(&self) -> Result<zx::BootInstant, Errno> {
187        self.ring_buffer.enable()
188    }
189
190    /// Disables the event queue and resets it to empty.
191    /// The number of dropped pages are recorded for reading via tracefs.
192    fn disable(&self) -> Result<u64, Errno> {
193        self.ring_buffer.disable()
194    }
195
196    /// Reads a page worth of events. Currently only reads pages that are full.
197    ///
198    /// From https://docs.kernel.org/trace/ring-buffer-design.html, when memory is mapped, a reader
199    /// page can be swapped with the header page to avoid copying memory.
200    pub fn read(&self, buf: &mut dyn OutputBuffer) -> Result<usize, Errno> {
201        self.ring_buffer.read(buf)
202    }
203
204    /// Write `event` into `ring_buffer`.
205    /// If `event` does not fit in the current page, move on to the next.
206    ///
207    /// Should eventually allow for a writer to preempt another writer.
208    /// See https://docs.kernel.org/trace/ring-buffer-design.html.
209    /// Returns the delta duration between this event and the previous event written.
210    pub fn push_event(
211        &self,
212        mut event: TraceEvent,
213        data: &[u8],
214    ) -> Result<zx::Duration<BootTimeline>, Errno> {
215        let size = event.size();
216
217        let (res, _timestamp, delta) = match self.ring_buffer.reserve(size) {
218            Ok(res) => res,
219            Err(e) if e == starnix_uapi::errno!(EINVAL) => {
220                log_error!("Invalid reservation size: {}", size);
221                return Err(starnix_uapi::errno!(EINVAL));
222            }
223            Err(e) if e == starnix_uapi::errno!(ENOSPC) => {
224                log_error!("Ring buffer full, dropping event of size: {}", size);
225                return Err(starnix_uapi::errno!(ENOSPC));
226            }
227            Err(e) => return Err(e),
228        };
229
230        let nanos = delta.into_nanos().try_into().unwrap_or(u32::MAX);
231        event.header.set_time_delta(nanos);
232
233        let bytes = event.as_bytes();
234        res.write_at(0, bytes);
235        res.write_at(bytes.len(), data);
236        res.write_at(bytes.len() + data.len(), b"\n");
237
238        self.ring_buffer.commit(res);
239
240        Ok(delta)
241    }
242}
243
244pub struct TraceEventQueueList {
245    pub queues: Vec<Arc<TraceEventQueue>>,
246    tracing_enabled: Arc<AtomicBool>,
247    tracefs_node: fuchsia_inspect::Node,
248}
249
250impl TraceEventQueueList {
251    pub fn from(kernel: &Kernel) -> Arc<Self> {
252        kernel.expando.get_or_init(|| {
253            let num_cpus = zx::system_get_num_cpus();
254            let tracing_enabled = Arc::new(AtomicBool::new(false));
255            let tracefs_node = kernel.inspect_node.create_child("tracefs");
256
257            let mut queues = vec![];
258            for cpu in 0..num_cpus {
259                let queue = Arc::new(TraceEventQueue::new(cpu as u32).expect("create queue"));
260                queues.push(queue);
261            }
262            Self { queues, tracing_enabled, tracefs_node }
263        })
264    }
265
266    pub fn is_enabled(&self) -> bool {
267        self.tracing_enabled.load(Ordering::Acquire)
268    }
269
270    pub fn enable(&self) -> Result<(), Errno> {
271        let mut first_error = None;
272        for queue in &self.queues {
273            if let Err(e) = queue.enable() {
274                if first_error.is_none() {
275                    first_error = Some(e);
276                }
277            }
278        }
279        if let Some(e) = first_error {
280            return Err(e);
281        }
282        self.tracing_enabled.store(true, Ordering::Release);
283        Ok(())
284    }
285
286    pub fn disable(&self) -> Result<(), Errno> {
287        // Set disabled to stop new access to the queue, then clean up each one.
288        self.tracing_enabled.store(false, Ordering::Release);
289        let mut first_error = None;
290        let mut total_dropped = 0;
291        for queue in &self.queues {
292            match queue.disable() {
293                Ok(dropped) => total_dropped += dropped,
294                Err(e) => {
295                    if first_error.is_none() {
296                        first_error = Some(e);
297                    }
298                }
299            }
300        }
301
302        self.tracefs_node.record_uint(DROPPED_PAGES, total_dropped);
303
304        if let Some(e) = first_error {
305            return Err(e);
306        }
307        Ok(())
308    }
309}
310
311#[cfg(test)]
312mod tests {
313    use super::{DEFAULT_RING_BUFFER_SIZE_BYTES, TraceEvent, TraceEventQueue};
314    use crate::vfs::OutputBuffer;
315    use crate::vfs::buffers::VecOutputBuffer;
316
317    use starnix_types::PAGE_SIZE;
318    use starnix_uapi::error;
319
320    #[fuchsia::test]
321    fn trace_event_queue_empty_errors() {
322        let queue = TraceEventQueue::new(0).unwrap();
323
324        let mut buffer = VecOutputBuffer::new(*PAGE_SIZE as usize);
325        assert_eq!(queue.read(&mut buffer), error!(EAGAIN));
326
327        let data = b"B|1234|slice_name";
328        let event = TraceEvent::new(1234, data.len());
329        assert_eq!(queue.push_event(event, data), error!(ENOMEM));
330    }
331
332    #[fuchsia::test]
333    fn read_empty_queue() {
334        let queue = TraceEventQueue::new(0).expect("create queue");
335        let mut buffer = VecOutputBuffer::new(*PAGE_SIZE as usize);
336        assert_eq!(queue.read(&mut buffer), error!(EAGAIN));
337    }
338
339    #[fuchsia::test]
340    fn enable_disable_queue() {
341        let queue = TraceEventQueue::new(0).expect("create queue");
342        assert!(!queue.ring_buffer.is_enabled());
343
344        // Enable tracing and check the queue's state.
345        assert!(queue.enable().is_ok());
346        assert_eq!(queue.ring_buffer.size_bytes(), DEFAULT_RING_BUFFER_SIZE_BYTES);
347
348        // Confirm we can push an event.
349        let data = b"B|1234|slice_name";
350        let event = TraceEvent::new(1234, data.len());
351        let result = queue.push_event(event, data);
352
353        assert!(result.is_ok());
354        assert_eq!(result.as_ref().unwrap().into_nanos(), 0);
355
356        // Disable tracing and check that the queue's state has been reset.
357        assert!(queue.disable().is_ok());
358        assert!(!queue.ring_buffer.is_enabled());
359    }
360
361    #[fuchsia::test]
362    fn create_trace_event() {
363        // Create an event.
364        let event = TraceEvent::new(1234, b"B|1234|slice_name".len());
365        let event_size = event.size();
366        assert_eq!(event_size, 42);
367    }
368
369    // This can be removed when we support reading incomplete pages.
370    #[fuchsia::test]
371    fn single_trace_event_fails_read() {
372        let queue = TraceEventQueue::new(0).expect("create queue");
373        queue.enable().expect("enable queue");
374        // Create an event.
375        let data = b"B|1234|slice_name";
376        let event = TraceEvent::new(1234, data.len());
377
378        // Push the event into the queue.
379        let result = queue.push_event(event, data);
380        assert!(result.is_ok());
381        assert_eq!(result.ok().expect("delta").into_nanos(), 0);
382
383        let mut buffer = VecOutputBuffer::new(*PAGE_SIZE as usize);
384        assert_eq!(queue.read(&mut buffer), error!(EAGAIN));
385    }
386
387    #[fuchsia::test]
388    fn page_overflow() {
389        let queue = TraceEventQueue::new(0).expect("create queue");
390        let queue_start_timestamp = queue.enable().expect("enable queue");
391
392        let pid = 1234;
393        let data = b"B|1234|loooooooooooooooooooooooooooooooooooooooooooooooooooooooooo\
394        ooooooooooooooooooooooooooooooooooooooooooooooooooooooooongevent";
395        let expected_event = TraceEvent::new(pid, data.len());
396        assert_eq!(expected_event.size(), 155);
397
398        // Push the event into the queue.
399        for i in 0..27 {
400            let event = TraceEvent::new(pid, data.len());
401            let result = queue.push_event(event, data);
402            assert!(result.is_ok());
403            let delta = result.ok().expect("delta").into_nanos();
404            // The first event on Page 0 (i == 0) and the first event on Page 1 (i == 26,
405            // due to overflow since a page holds exactly 26 events of size 155) must
406            // have a time delta of exactly 0 in their event headers as per the Ftrace format.
407            if i == 0 || i == 26 {
408                assert_eq!(delta, 0);
409            } else {
410                assert!(delta > 0);
411            }
412        }
413
414        // Read a page of data.
415        let mut buffer = VecOutputBuffer::new(*PAGE_SIZE as usize);
416        assert_eq!(queue.read(&mut buffer), Ok(*PAGE_SIZE as usize));
417        assert_eq!(buffer.bytes_written() as u64, *PAGE_SIZE);
418
419        // Verify timestamp is monotonic
420        let actual_ts_bytes = &buffer.data()[0..8];
421        let actual_ts = u64::from_le_bytes(actual_ts_bytes.try_into().unwrap());
422        assert!(actual_ts >= queue_start_timestamp.into_nanos() as u64);
423
424        // Verify size of events
425        let actual_size_bytes = &buffer.data()[8..16];
426        let expected_size_bytes = &(expected_event.size() * 26).to_le_bytes();
427        assert_eq!(actual_size_bytes, expected_size_bytes);
428
429        // Try reading another page.
430        let mut buffer = VecOutputBuffer::new(*PAGE_SIZE as usize);
431        assert_eq!(queue.read(&mut buffer), error!(EAGAIN));
432    }
433}