netstack3_tcp/
base.rs

1// Copyright 2022 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! The Transmission Control Protocol (TCP).
6
7use core::num::NonZeroU8;
8use core::time::Duration;
9
10use net_types::ip::{GenericOverIp, Ip, Mtu};
11use net_types::SpecifiedAddr;
12use netstack3_base::{
13    IcmpErrorCode, Icmpv4ErrorCode, Icmpv6ErrorCode, IpExt, Marks, UnscaledWindowSize,
14    WeakDeviceIdentifier, WindowSize,
15};
16use netstack3_ip::socket::{RouteResolutionOptions, SendOptions};
17use packet_formats::icmp::{
18    Icmpv4DestUnreachableCode, Icmpv4TimeExceededCode, Icmpv6DestUnreachableCode,
19};
20use packet_formats::ip::DscpAndEcn;
21use packet_formats::utils::NonZeroDuration;
22use rand::Rng;
23
24use crate::internal::buffer::BufferLimits;
25use crate::internal::counters::{TcpCountersWithSocket, TcpCountersWithoutSocket};
26use crate::internal::socket::isn::IsnGenerator;
27use crate::internal::socket::{DualStackIpExt, Sockets, TcpBindingsTypes};
28use crate::internal::state::DEFAULT_MAX_SYN_RETRIES;
29
/// Default lifetime for an orphaned connection in FIN_WAIT2.
///
/// [`SocketOptions::default`] uses this value for its `fin_wait2_timeout`.
pub const DEFAULT_FIN_WAIT2_TIMEOUT: Duration = Duration::from_secs(60);
32
/// Errors that a TCP connection reports to the user.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ConnectionError {
    /// A RST segment arrived while in SYN_SENT state: the connection was
    /// refused.
    ConnectionRefused,
    /// A RST segment reset the connection.
    ConnectionReset,
    /// The network is unreachable, so the connection was closed.
    NetworkUnreachable,
    /// The host is unreachable, so the connection was closed.
    HostUnreachable,
    /// The protocol is unreachable, so the connection was closed.
    ProtocolUnreachable,
    /// The port is unreachable, so the connection was closed.
    PortUnreachable,
    /// The host is down, so the connection was closed.
    DestinationHostDown,
    /// The source route failed, so the connection was closed.
    SourceRouteFailed,
    /// The source host is isolated, so the connection was closed.
    SourceHostIsolated,
    /// The connection timed out and was closed.
    TimedOut,
    /// Required permissions were missing, so the connection was closed.
    PermissionDenied,
    /// A protocol error occurred, so the connection was closed.
    ProtocolError,
}
61
62impl ConnectionError {
63    // Notes: the following mappings are guided by the packetimpact test here:
64    // https://cs.opensource.google/gvisor/gvisor/+/master:test/packetimpact/tests/tcp_network_unreachable_test.go;drc=611e6e1247a0691f5fd198f411c68b3bc79d90af
65    pub(crate) fn try_from_icmp_error(err: IcmpErrorCode) -> Option<Self> {
66        match err {
67            IcmpErrorCode::V4(Icmpv4ErrorCode::DestUnreachable(code)) => match code {
68                Icmpv4DestUnreachableCode::DestNetworkUnreachable => {
69                    Some(ConnectionError::NetworkUnreachable)
70                }
71                Icmpv4DestUnreachableCode::DestHostUnreachable => {
72                    Some(ConnectionError::HostUnreachable)
73                }
74                Icmpv4DestUnreachableCode::DestProtocolUnreachable => {
75                    Some(ConnectionError::ProtocolUnreachable)
76                }
77                Icmpv4DestUnreachableCode::DestPortUnreachable => {
78                    Some(ConnectionError::PortUnreachable)
79                }
80                // TODO(https://fxbug.dev/404628798): update PMTU/MSS.
81                Icmpv4DestUnreachableCode::FragmentationRequired => None,
82                Icmpv4DestUnreachableCode::SourceRouteFailed => {
83                    Some(ConnectionError::SourceRouteFailed)
84                }
85                Icmpv4DestUnreachableCode::DestNetworkUnknown => {
86                    Some(ConnectionError::NetworkUnreachable)
87                }
88                Icmpv4DestUnreachableCode::DestHostUnknown => {
89                    Some(ConnectionError::DestinationHostDown)
90                }
91                Icmpv4DestUnreachableCode::SourceHostIsolated => {
92                    Some(ConnectionError::SourceHostIsolated)
93                }
94                Icmpv4DestUnreachableCode::NetworkAdministrativelyProhibited => {
95                    Some(ConnectionError::NetworkUnreachable)
96                }
97                Icmpv4DestUnreachableCode::HostAdministrativelyProhibited => {
98                    Some(ConnectionError::HostUnreachable)
99                }
100                Icmpv4DestUnreachableCode::NetworkUnreachableForToS => {
101                    Some(ConnectionError::NetworkUnreachable)
102                }
103                Icmpv4DestUnreachableCode::HostUnreachableForToS => {
104                    Some(ConnectionError::HostUnreachable)
105                }
106                Icmpv4DestUnreachableCode::CommAdministrativelyProhibited => {
107                    Some(ConnectionError::HostUnreachable)
108                }
109                Icmpv4DestUnreachableCode::HostPrecedenceViolation => {
110                    Some(ConnectionError::HostUnreachable)
111                }
112                Icmpv4DestUnreachableCode::PrecedenceCutoffInEffect => {
113                    Some(ConnectionError::HostUnreachable)
114                }
115            },
116            IcmpErrorCode::V4(Icmpv4ErrorCode::ParameterProblem(_)) => {
117                Some(ConnectionError::ProtocolError)
118            }
119            IcmpErrorCode::V4(Icmpv4ErrorCode::TimeExceeded(
120                Icmpv4TimeExceededCode::TtlExpired,
121            )) => Some(ConnectionError::HostUnreachable),
122            IcmpErrorCode::V4(Icmpv4ErrorCode::TimeExceeded(
123                Icmpv4TimeExceededCode::FragmentReassemblyTimeExceeded,
124            )) => Some(ConnectionError::TimedOut),
125            IcmpErrorCode::V4(Icmpv4ErrorCode::Redirect(_)) => None,
126            IcmpErrorCode::V6(Icmpv6ErrorCode::DestUnreachable(code)) => match code {
127                Icmpv6DestUnreachableCode::NoRoute => Some(ConnectionError::NetworkUnreachable),
128                Icmpv6DestUnreachableCode::CommAdministrativelyProhibited => {
129                    Some(ConnectionError::PermissionDenied)
130                }
131                Icmpv6DestUnreachableCode::BeyondScope => Some(ConnectionError::HostUnreachable),
132                Icmpv6DestUnreachableCode::AddrUnreachable => {
133                    Some(ConnectionError::HostUnreachable)
134                }
135                Icmpv6DestUnreachableCode::PortUnreachable => {
136                    Some(ConnectionError::PortUnreachable)
137                }
138                Icmpv6DestUnreachableCode::SrcAddrFailedPolicy => {
139                    Some(ConnectionError::PermissionDenied)
140                }
141                Icmpv6DestUnreachableCode::RejectRoute => Some(ConnectionError::PermissionDenied),
142            },
143            IcmpErrorCode::V6(Icmpv6ErrorCode::ParameterProblem(_)) => {
144                Some(ConnectionError::ProtocolError)
145            }
146            IcmpErrorCode::V6(Icmpv6ErrorCode::TimeExceeded(_)) => {
147                Some(ConnectionError::HostUnreachable)
148            }
149            // TODO(https://fxbug.dev/404628798): update PMTU/MSS.
150            IcmpErrorCode::V6(Icmpv6ErrorCode::PacketTooBig) => None,
151        }
152    }
153}
154
155/// Stack wide state supporting TCP.
156#[derive(GenericOverIp)]
157#[generic_over_ip(I, Ip)]
158pub struct TcpState<I: DualStackIpExt, D: WeakDeviceIdentifier, BT: TcpBindingsTypes> {
159    /// The initial sequence number generator.
160    pub isn_generator: IsnGenerator<BT::Instant>,
161    /// TCP sockets state.
162    pub sockets: Sockets<I, D, BT>,
163    /// TCP counters that cannot be attributed to a specific socket.
164    pub counters_without_socket: TcpCountersWithoutSocket<I>,
165    /// TCP counters that can be attributed to a specific socket.
166    pub counters_with_socket: TcpCountersWithSocket<I>,
167}
168
169impl<I: DualStackIpExt, D: WeakDeviceIdentifier, BT: TcpBindingsTypes> TcpState<I, D, BT> {
170    /// Creates a new TCP stack state.
171    pub fn new(now: BT::Instant, rng: &mut impl Rng) -> Self {
172        Self {
173            isn_generator: IsnGenerator::new(now, rng),
174            sockets: Sockets::new(),
175            counters_without_socket: Default::default(),
176            counters_with_socket: Default::default(),
177        }
178    }
179}
180
/// The sizes of a socket's send and receive buffers, bundled under names.
#[derive(Clone, Copy, Debug)]
#[cfg_attr(test, derive(PartialEq, Eq))]
pub struct BufferSizes {
    /// Size of the send buffer.
    pub send: usize,
    /// Size of the receive buffer.
    pub receive: usize,
}
/// Defaults that are only sensible for testing: both buffers take the size
/// given by `WindowSize::DEFAULT`.
#[cfg(any(test, feature = "testutils"))]
impl Default for BufferSizes {
    fn default() -> Self {
        let default_size: usize = WindowSize::DEFAULT.into();
        Self { send: default_size, receive: default_size }
    }
}
197
198impl BufferSizes {
199    pub(crate) fn rcv_limits(&self) -> BufferLimits {
200        let Self { send: _, receive } = self;
201        BufferLimits { capacity: *receive, len: 0 }
202    }
203
204    pub(crate) fn rwnd(&self) -> WindowSize {
205        let Self { send: _, receive } = *self;
206        WindowSize::new(receive).unwrap_or(WindowSize::MAX)
207    }
208
209    pub(crate) fn rwnd_unscaled(&self) -> UnscaledWindowSize {
210        let Self { send: _, receive } = *self;
211        UnscaledWindowSize::from(u16::try_from(receive).unwrap_or(u16::MAX))
212    }
213}
214
215/// A mutable reference to buffer configuration.
216pub(crate) enum BuffersRefMut<'a, R, S> {
217    /// All buffers are dropped.
218    NoBuffers,
219    /// Buffer sizes are configured but not instantiated yet.
220    Sizes(&'a mut BufferSizes),
221    /// Buffers are instantiated and mutable references are provided.
222    Both { send: &'a mut S, recv: &'a mut R },
223    /// Only the send buffer is still instantiated, which happens in Closing
224    /// states.
225    SendOnly(&'a mut S),
226    /// Only the receive buffer is still instantiated, which happens in Finwait
227    /// states.
228    RecvOnly(&'a mut R),
229}
230
231impl<'a, R, S> BuffersRefMut<'a, R, S> {
232    pub(crate) fn into_send_buffer(self) -> Option<&'a mut S> {
233        match self {
234            Self::NoBuffers | Self::Sizes(_) | Self::RecvOnly(_) => None,
235            Self::Both { send, recv: _ } | Self::SendOnly(send) => Some(send),
236        }
237    }
238
239    pub(crate) fn into_receive_buffer(self) -> Option<&'a mut R> {
240        match self {
241            Self::NoBuffers | Self::Sizes(_) | Self::SendOnly(_) => None,
242            Self::Both { send: _, recv } | Self::RecvOnly(recv) => Some(recv),
243        }
244    }
245}
246
247/// The IP sock options used by TCP.
248#[derive(Clone, Copy, Default, Debug, PartialEq, Eq)]
249pub struct TcpIpSockOptions {
250    /// Socket marks used for routing.
251    pub marks: Marks,
252}
253
254impl<I: Ip> RouteResolutionOptions<I> for TcpIpSockOptions {
255    fn marks(&self) -> &Marks {
256        &self.marks
257    }
258
259    fn transparent(&self) -> bool {
260        false
261    }
262}
263
264impl<I: IpExt> SendOptions<I> for TcpIpSockOptions {
265    fn hop_limit(&self, _destination: &SpecifiedAddr<I::Addr>) -> Option<NonZeroU8> {
266        None
267    }
268
269    fn multicast_loop(&self) -> bool {
270        false
271    }
272
273    fn allow_broadcast(&self) -> Option<I::BroadcastMarker> {
274        None
275    }
276
277    fn dscp_and_ecn(&self) -> DscpAndEcn {
278        DscpAndEcn::default()
279    }
280
281    fn mtu(&self) -> Mtu {
282        Mtu::no_limit()
283    }
284}
285
286/// TCP socket options.
287///
288/// This only stores options that are trivial to get and set.
289#[derive(Clone, Copy, Debug, PartialEq, Eq)]
290pub struct SocketOptions {
291    /// Socket options that control TCP keep-alive mechanism, see [`KeepAlive`].
292    pub keep_alive: KeepAlive,
293    /// Switch to turn nagle algorithm on/off.
294    pub nagle_enabled: bool,
295    /// The period of time after which the connection should be aborted if no
296    /// ACK is received.
297    pub user_timeout: Option<NonZeroDuration>,
298    /// Switch to turn delayed ACK on/off.
299    pub delayed_ack: bool,
300    /// The period of time after with a dangling FIN_WAIT2 state should be
301    /// reclaimed.
302    pub fin_wait2_timeout: Option<Duration>,
303    /// The maximum SYN retransmissions before aborting a connection.
304    pub max_syn_retries: NonZeroU8,
305    /// Ip socket options.
306    pub ip_options: TcpIpSockOptions,
307}
308
309impl Default for SocketOptions {
310    fn default() -> Self {
311        Self {
312            keep_alive: KeepAlive::default(),
313            // RFC 9293 Section 3.7.4:
314            //   A TCP implementation SHOULD implement the Nagle algorithm to
315            //   coalesce short segments
316            nagle_enabled: true,
317            user_timeout: None,
318            delayed_ack: true,
319            fin_wait2_timeout: Some(DEFAULT_FIN_WAIT2_TIMEOUT),
320            max_syn_retries: DEFAULT_MAX_SYN_RETRIES,
321            ip_options: TcpIpSockOptions::default(),
322        }
323    }
324}
325
326/// Options that are related to TCP keep-alive.
327#[derive(Clone, Copy, Debug, PartialEq, Eq)]
328pub struct KeepAlive {
329    /// The amount of time for an idle connection to wait before sending out
330    /// probes.
331    pub idle: NonZeroDuration,
332    /// Interval between consecutive probes.
333    pub interval: NonZeroDuration,
334    /// Maximum number of probes we send before considering the connection dead.
335    ///
336    /// `u8` is enough because if a connection doesn't hear back from the peer
337    /// after 256 probes, then chances are that the connection is already dead.
338    pub count: NonZeroU8,
339    /// Only send probes if keep-alive is enabled.
340    pub enabled: bool,
341}
342
343impl Default for KeepAlive {
344    fn default() -> Self {
345        // Default values inspired by Linux's TCP implementation:
346        // https://github.com/torvalds/linux/blob/0326074ff4652329f2a1a9c8685104576bd8d131/include/net/tcp.h#L155-L157
347        const DEFAULT_IDLE_DURATION: NonZeroDuration =
348            NonZeroDuration::from_secs(2 * 60 * 60).unwrap();
349        const DEFAULT_INTERVAL: NonZeroDuration = NonZeroDuration::from_secs(75).unwrap();
350        const DEFAULT_COUNT: NonZeroU8 = NonZeroU8::new(9).unwrap();
351
352        Self {
353            idle: DEFAULT_IDLE_DURATION,
354            interval: DEFAULT_INTERVAL,
355            count: DEFAULT_COUNT,
356            // Per RFC 9293(https://datatracker.ietf.org/doc/html/rfc9293#section-3.8.4):
357            //   ... they MUST default to off.
358            enabled: false,
359        }
360    }
361}
362
#[cfg(test)]
pub(crate) mod testutil {
    use core::num::NonZeroU16;

    use netstack3_base::Mss;

    /// Single source for both constants below, kept as `u16` so the `usize`
    /// form is a lossless widening.
    const DEFAULT_IPV4_MSS_U16: u16 = 536;

    /// Per RFC 879 section 1 (https://tools.ietf.org/html/rfc879#section-1):
    ///
    /// THE TCP MAXIMUM SEGMENT SIZE IS THE IP MAXIMUM DATAGRAM SIZE MINUS
    /// FORTY.
    ///   The default IP Maximum Datagram Size is 576.
    ///   The default TCP Maximum Segment Size is 536.
    pub(crate) const DEFAULT_IPV4_MAXIMUM_SEGMENT_SIZE_USIZE: usize =
        DEFAULT_IPV4_MSS_U16 as usize;
    /// The same default segment size, wrapped as an [`Mss`].
    pub(crate) const DEFAULT_IPV4_MAXIMUM_SEGMENT_SIZE: Mss =
        Mss(NonZeroU16::new(DEFAULT_IPV4_MSS_U16).unwrap());
}