criterion/stats/univariate/outliers/
tukey.rs

1//! Tukey's method
2//!
3//! The original method uses two "fences" to classify the data. All the observations "inside" the
4//! fences are considered "normal", and the rest are considered outliers.
5//!
6//! The fences are computed from the quartiles of the sample, according to the following formula:
7//!
8//! ``` ignore
9//! // q1, q3 are the first and third quartiles
10//! let iqr = q3 - q1;  // The interquartile range
11//! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr);  // the "fences"
12//!
13//! let is_outlier = |x| if x > f1 && x < f2 { true } else { false };
14//! ```
15//!
16//! The classifier provided here adds two extra outer fences:
17//!
18//! ``` ignore
19//! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr);  // the outer "fences"
20//! ```
21//!
22//! The extra fences add a sense of "severity" to the classification. Data points outside of the
23//! outer fences are considered "severe" outliers, whereas points outside the inner fences are just
24//! "mild" outliers, and, as the original method, everything inside the inner fences is considered
25//! "normal" data.
26//!
27//! Some ASCII art for the visually oriented people:
28//!
29//! ``` ignore
30//!          LOW-ish                NORMAL-ish                 HIGH-ish
31//!         x   |       +    |  o o  o    o   o o  o  |        +   |   x
32//!             f3           f1                       f2           f4
33//!
34//! Legend:
35//! o: "normal" data (not an outlier)
36//! +: "mild" outlier
37//! x: "severe" outlier
38//! ```
39
40use std::iter::IntoIterator;
41use std::ops::{Deref, Index};
42use std::slice;
43
44use cast;
45use stats::float::Float;
46
47use stats::univariate::Sample;
48
49use self::Label::*;
50
51/// A classified/labeled sample.
52///
53/// The labeled data can be accessed using the indexing operator. The order of the data points is
54/// retained.
55///
56/// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the
57/// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)`
58/// pair.
59#[derive(Clone, Copy)]
60pub struct LabeledSample<'a, A>
61where
62    A: 'a + Float,
63{
64    fences: (A, A, A, A),
65    sample: &'a Sample<A>,
66}
67
68impl<'a, A> LabeledSample<'a, A>
69where
70    A: Float,
71{
72    /// Returns the number of data points per label
73    ///
74    /// - Time: `O(length)`
75    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
76    pub fn count(&self) -> (usize, usize, usize, usize, usize) {
77        let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0);
78
79        for (_, label) in self {
80            match label {
81                LowSevere => {
82                    los += 1;
83                }
84                LowMild => {
85                    lom += 1;
86                }
87                NotAnOutlier => {
88                    noa += 1;
89                }
90                HighMild => {
91                    him += 1;
92                }
93                HighSevere => {
94                    his += 1;
95                }
96            }
97        }
98
99        (los, lom, noa, him, his)
100    }
101
102    /// Returns the fences used to classify the outliers
103    pub fn fences(&self) -> (A, A, A, A) {
104        self.fences
105    }
106
107    /// Returns an iterator over the labeled data
108    pub fn iter(&self) -> Iter<'a, A> {
109        Iter {
110            fences: self.fences,
111            iter: self.sample.iter(),
112        }
113    }
114}
115
116impl<'a, A> Deref for LabeledSample<'a, A>
117where
118    A: Float,
119{
120    type Target = Sample<A>;
121
122    fn deref(&self) -> &Sample<A> {
123        self.sample
124    }
125}
126
127// FIXME Use the `IndexGet` trait
128impl<'a, A> Index<usize> for LabeledSample<'a, A>
129where
130    A: Float,
131{
132    type Output = Label;
133
134    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
135    fn index(&self, i: usize) -> &Label {
136        static LOW_SEVERE: Label = LowSevere;
137        static LOW_MILD: Label = LowMild;
138        static HIGH_MILD: Label = HighMild;
139        static HIGH_SEVERE: Label = HighSevere;
140        static NOT_AN_OUTLIER: Label = NotAnOutlier;
141
142        let x = self.sample[i];
143        let (lost, lomt, himt, hist) = self.fences;
144
145        if x < lost {
146            &LOW_SEVERE
147        } else if x > hist {
148            &HIGH_SEVERE
149        } else if x < lomt {
150            &LOW_MILD
151        } else if x > himt {
152            &HIGH_MILD
153        } else {
154            &NOT_AN_OUTLIER
155        }
156    }
157}
158
159impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A>
160where
161    A: Float,
162{
163    type Item = (A, Label);
164    type IntoIter = Iter<'a, A>;
165
166    fn into_iter(self) -> Iter<'a, A> {
167        self.iter()
168    }
169}
170
171/// Iterator over the labeled data
172pub struct Iter<'a, A>
173where
174    A: 'a + Float,
175{
176    fences: (A, A, A, A),
177    iter: slice::Iter<'a, A>,
178}
179
180impl<'a, A> Iterator for Iter<'a, A>
181where
182    A: Float,
183{
184    type Item = (A, Label);
185
186    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
187    fn next(&mut self) -> Option<(A, Label)> {
188        self.iter.next().map(|&x| {
189            let (lost, lomt, himt, hist) = self.fences;
190
191            let label = if x < lost {
192                LowSevere
193            } else if x > hist {
194                HighSevere
195            } else if x < lomt {
196                LowMild
197            } else if x > himt {
198                HighMild
199            } else {
200                NotAnOutlier
201            };
202
203            (x, label)
204        })
205    }
206
207    fn size_hint(&self) -> (usize, Option<usize>) {
208        self.iter.size_hint()
209    }
210}
211
212/// Labels used to classify outliers
213pub enum Label {
214    /// A "mild" outlier in the "high" spectrum
215    HighMild,
216    /// A "severe" outlier in the "high" spectrum
217    HighSevere,
218    /// A "mild" outlier in the "low" spectrum
219    LowMild,
220    /// A "severe" outlier in the "low" spectrum
221    LowSevere,
222    /// A normal data point
223    NotAnOutlier,
224}
225
226impl Label {
227    /// Checks if the data point has an "unusually" high value
228    pub fn is_high(&self) -> bool {
229        match *self {
230            HighMild | HighSevere => true,
231            _ => false,
232        }
233    }
234
235    /// Checks if the data point is labeled as a "mild" outlier
236    pub fn is_mild(&self) -> bool {
237        match *self {
238            HighMild | LowMild => true,
239            _ => false,
240        }
241    }
242
243    /// Checks if the data point has an "unusually" low value
244    pub fn is_low(&self) -> bool {
245        match *self {
246            LowMild | LowSevere => true,
247            _ => false,
248        }
249    }
250
251    /// Checks if the data point is labeled as an outlier
252    pub fn is_outlier(&self) -> bool {
253        match *self {
254            NotAnOutlier => false,
255            _ => true,
256        }
257    }
258
259    /// Checks if the data point is labeled as a "severe" outlier
260    pub fn is_severe(&self) -> bool {
261        match *self {
262            HighSevere | LowSevere => true,
263            _ => false,
264        }
265    }
266}
267
268/// Classifies the sample, and returns a labeled sample.
269///
270/// - Time: `O(N log N) where N = length`
271pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<A>
272where
273    A: Float,
274    usize: cast::From<A, Output = Result<usize, cast::Error>>,
275{
276    let (q1, _, q3) = sample.percentiles().quartiles();
277    let iqr = q3 - q1;
278
279    // Mild
280    let k_m = A::cast(1.5_f32);
281    // Severe
282    let k_s = A::cast(3);
283
284    LabeledSample {
285        fences: (
286            q1 - k_s * iqr,
287            q1 - k_m * iqr,
288            q3 + k_m * iqr,
289            q3 + k_s * iqr,
290        ),
291        sample,
292    }
293}