1//! Tukey's method
2//!
3//! The original method uses two "fences" to classify the data. All the observations "inside" the
4//! fences are considered "normal", and the rest are considered outliers.
5//!
6//! The fences are computed from the quartiles of the sample, according to the following formula:
7//!
8//! ``` ignore
9//! // q1, q3 are the first and third quartiles
10//! let iqr = q3 - q1; // The interquartile range
11//! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr); // the "fences"
12//!
13//! let is_outlier = |x| if x > f1 && x < f2 { true } else { false };
14//! ```
15//!
16//! The classifier provided here adds two extra outer fences:
17//!
18//! ``` ignore
19//! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr); // the outer "fences"
20//! ```
21//!
22//! The extra fences add a sense of "severity" to the classification. Data points outside of the
23//! outer fences are considered "severe" outliers, whereas points outside the inner fences are just
24//! "mild" outliers, and, as the original method, everything inside the inner fences is considered
25//! "normal" data.
26//!
27//! Some ASCII art for the visually oriented people:
28//!
29//! ``` ignore
30//! LOW-ish NORMAL-ish HIGH-ish
31//! x | + | o o o o o o o | + | x
32//! f3 f1 f2 f4
33//!
34//! Legend:
35//! o: "normal" data (not an outlier)
36//! +: "mild" outlier
37//! x: "severe" outlier
38//! ```
3940use std::iter::IntoIterator;
41use std::ops::{Deref, Index};
42use std::slice;
4344use cast;
45use stats::float::Float;
4647use stats::univariate::Sample;
4849use self::Label::*;
5051/// A classified/labeled sample.
52///
53/// The labeled data can be accessed using the indexing operator. The order of the data points is
54/// retained.
55///
56/// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the
57/// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)`
58/// pair.
59#[derive(Clone, Copy)]
60pub struct LabeledSample<'a, A>
61where
62A: 'a + Float,
63{
64 fences: (A, A, A, A),
65 sample: &'a Sample<A>,
66}
6768impl<'a, A> LabeledSample<'a, A>
69where
70A: Float,
71{
72/// Returns the number of data points per label
73 ///
74 /// - Time: `O(length)`
75#[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
76pub fn count(&self) -> (usize, usize, usize, usize, usize) {
77let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0);
7879for (_, label) in self {
80match label {
81 LowSevere => {
82 los += 1;
83 }
84 LowMild => {
85 lom += 1;
86 }
87 NotAnOutlier => {
88 noa += 1;
89 }
90 HighMild => {
91 him += 1;
92 }
93 HighSevere => {
94 his += 1;
95 }
96 }
97 }
9899 (los, lom, noa, him, his)
100 }
101102/// Returns the fences used to classify the outliers
103pub fn fences(&self) -> (A, A, A, A) {
104self.fences
105 }
106107/// Returns an iterator over the labeled data
108pub fn iter(&self) -> Iter<'a, A> {
109 Iter {
110 fences: self.fences,
111 iter: self.sample.iter(),
112 }
113 }
114}
115116impl<'a, A> Deref for LabeledSample<'a, A>
117where
118A: Float,
119{
120type Target = Sample<A>;
121122fn deref(&self) -> &Sample<A> {
123self.sample
124 }
125}
126127// FIXME Use the `IndexGet` trait
128impl<'a, A> Index<usize> for LabeledSample<'a, A>
129where
130A: Float,
131{
132type Output = Label;
133134#[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
135fn index(&self, i: usize) -> &Label {
136static LOW_SEVERE: Label = LowSevere;
137static LOW_MILD: Label = LowMild;
138static HIGH_MILD: Label = HighMild;
139static HIGH_SEVERE: Label = HighSevere;
140static NOT_AN_OUTLIER: Label = NotAnOutlier;
141142let x = self.sample[i];
143let (lost, lomt, himt, hist) = self.fences;
144145if x < lost {
146&LOW_SEVERE
147 } else if x > hist {
148&HIGH_SEVERE
149 } else if x < lomt {
150&LOW_MILD
151 } else if x > himt {
152&HIGH_MILD
153 } else {
154&NOT_AN_OUTLIER
155 }
156 }
157}
158159impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A>
160where
161A: Float,
162{
163type Item = (A, Label);
164type IntoIter = Iter<'a, A>;
165166fn into_iter(self) -> Iter<'a, A> {
167self.iter()
168 }
169}
170171/// Iterator over the labeled data
172pub struct Iter<'a, A>
173where
174A: 'a + Float,
175{
176 fences: (A, A, A, A),
177 iter: slice::Iter<'a, A>,
178}
179180impl<'a, A> Iterator for Iter<'a, A>
181where
182A: Float,
183{
184type Item = (A, Label);
185186#[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
187fn next(&mut self) -> Option<(A, Label)> {
188self.iter.next().map(|&x| {
189let (lost, lomt, himt, hist) = self.fences;
190191let label = if x < lost {
192 LowSevere
193 } else if x > hist {
194 HighSevere
195 } else if x < lomt {
196 LowMild
197 } else if x > himt {
198 HighMild
199 } else {
200 NotAnOutlier
201 };
202203 (x, label)
204 })
205 }
206207fn size_hint(&self) -> (usize, Option<usize>) {
208self.iter.size_hint()
209 }
210}
211212/// Labels used to classify outliers
213pub enum Label {
214/// A "mild" outlier in the "high" spectrum
215HighMild,
216/// A "severe" outlier in the "high" spectrum
217HighSevere,
218/// A "mild" outlier in the "low" spectrum
219LowMild,
220/// A "severe" outlier in the "low" spectrum
221LowSevere,
222/// A normal data point
223NotAnOutlier,
224}
225226impl Label {
227/// Checks if the data point has an "unusually" high value
228pub fn is_high(&self) -> bool {
229match *self {
230 HighMild | HighSevere => true,
231_ => false,
232 }
233 }
234235/// Checks if the data point is labeled as a "mild" outlier
236pub fn is_mild(&self) -> bool {
237match *self {
238 HighMild | LowMild => true,
239_ => false,
240 }
241 }
242243/// Checks if the data point has an "unusually" low value
244pub fn is_low(&self) -> bool {
245match *self {
246 LowMild | LowSevere => true,
247_ => false,
248 }
249 }
250251/// Checks if the data point is labeled as an outlier
252pub fn is_outlier(&self) -> bool {
253match *self {
254 NotAnOutlier => false,
255_ => true,
256 }
257 }
258259/// Checks if the data point is labeled as a "severe" outlier
260pub fn is_severe(&self) -> bool {
261match *self {
262 HighSevere | LowSevere => true,
263_ => false,
264 }
265 }
266}
267268/// Classifies the sample, and returns a labeled sample.
269///
270/// - Time: `O(N log N) where N = length`
271pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<A>
272where
273A: Float,
274 usize: cast::From<A, Output = Result<usize, cast::Error>>,
275{
276let (q1, _, q3) = sample.percentiles().quartiles();
277let iqr = q3 - q1;
278279// Mild
280let k_m = A::cast(1.5_f32);
281// Severe
282let k_s = A::cast(3);
283284 LabeledSample {
285 fences: (
286 q1 - k_s * iqr,
287 q1 - k_m * iqr,
288 q3 + k_m * iqr,
289 q3 + k_s * iqr,
290 ),
291 sample,
292 }
293}