char_set/
lib.rs

// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use char_collection::CharCollection;
use std::cmp::Ordering;

type BitmapElement = u64;
const BITMAP_ELEMENT_SIZE: usize = 64;
/// This value is optimal for memory use, based on some non-scientific experimentation with
/// real-world font files. (Though compared to `2048`, it does add a few nanoseconds to the average
/// `contains` call.)
const MAX_RANGE_GAP: u32 = 256;

/// Represents an ordered set of code points that begin at [CharSetRange.start]. The largest
/// allowed discontinuity between two consecutive code points in the set is [MAX_RANGE_GAP].
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
struct CharSetRange {
    start: u32,
    bitmap: Vec<BitmapElement>,
}

impl CharSetRange {
    fn new() -> CharSetRange {
        CharSetRange { start: 0, bitmap: vec![] }
    }

    fn start(&self) -> u32 {
        self.start
    }

    fn end(&self) -> u32 {
        self.start + (self.bitmap.len() * BITMAP_ELEMENT_SIZE) as u32
    }

    fn is_empty(&self) -> bool {
        self.bitmap.is_empty()
    }

    fn add(&mut self, val: u32) {
        assert!(val >= self.start);
        assert!(char::try_from(val).is_ok());

        if self.bitmap.is_empty() {
            self.start = val;
        }

        let pos = (val - self.start) as usize;
        let element_pos = pos / BITMAP_ELEMENT_SIZE;

        if element_pos >= self.bitmap.len() {
            self.bitmap.resize(element_pos + 1, 0);
        }

        self.bitmap[element_pos] |= 1 << (pos % BITMAP_ELEMENT_SIZE);
    }

    fn contains(&self, c: u32) -> bool {
        if c < self.start || c >= self.end() {
            false
        } else {
            let index = c as usize - self.start as usize;
            (self.bitmap[index / BITMAP_ELEMENT_SIZE] & (1 << (index % BITMAP_ELEMENT_SIZE))) > 0
        }
    }

    fn iter(&self) -> CharSetRangeIterator<'_> {
        CharSetRangeIterator { char_set_range: &self, position: self.start.clone() }
    }
}

struct CharSetRangeIterator<'a> {
    char_set_range: &'a CharSetRange,
    position: u32,
}

impl Iterator for CharSetRangeIterator<'_> {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        while self.position < self.char_set_range.end() {
            if self.char_set_range.contains(self.position) {
                self.position += 1;
                return Some(std::char::from_u32(self.position - 1).unwrap());
            }
            self.position += 1;
        }
        None
    }
}

/// Represents an ordered set of code points.
///
/// TODO(kpozin): Add factory method that takes lazy `Iterator<Item = char>` instead of `Vec`.
/// TODO(kpozin): Enforce `char` values.
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub struct CharSet {
    ranges: Vec<CharSetRange>,
    len: usize,
}

impl CharSet {
    pub fn new(mut code_points: Vec<u32>) -> CharSet {
        let len = code_points.len();
        code_points.sort_unstable();

        let mut ranges = vec![];
        let mut range = CharSetRange::new();
        for c in code_points {
            if c != 0 && !range.is_empty() && c >= range.end() + MAX_RANGE_GAP {
                ranges.push(range);
                range = CharSetRange::new();
            }
            range.add(c);
        }
        if !range.is_empty() {
            ranges.push(range)
        }
        CharSet { ranges, len }
    }

    pub fn contains(&self, c: u32) -> bool {
        match self.ranges.binary_search_by(|r| {
            if r.end() < c {
                Ordering::Less
            } else if r.start() > c {
                Ordering::Greater
            } else {
                Ordering::Equal
            }
        }) {
            Ok(r) => self.ranges[r].contains(c),
            Err(_) => false,
        }
    }

    pub fn is_empty(&self) -> bool {
        self.ranges.is_empty()
    }

    /// Returns the number of code points in the set.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Iterate over all the characters in the the `CharSet` in code point order.
    pub fn iter(&self) -> impl Iterator<Item = char> + '_ {
        self.ranges.iter().flat_map(CharSetRange::iter)
    }
}

impl Default for CharSet {
    fn default() -> Self {
        CharSet::new(vec![])
    }
}

impl Into<CharCollection> for &CharSet {
    fn into(self) -> CharCollection {
        // Unwrapping is safe because we know `CharSet` iterates in order.
        CharCollection::from_sorted_chars(self.iter()).unwrap()
    }
}

impl From<CharCollection> for CharSet {
    fn from(value: CharCollection) -> CharSet {
        CharSet::from(&value)
    }
}

impl From<&CharCollection> for CharSet {
    fn from(value: &CharCollection) -> CharSet {
        CharSet::new(value.iter().map(|ch| ch as u32).collect::<Vec<u32>>())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use char_collection::char_collect;

    #[test]
    fn test_charset() {
        let charset = CharSet::new(vec![1, 2, 3, 10, 500, 5000, 5001, 10000]);
        assert!(!charset.contains(0));
        assert!(charset.contains(1));
        assert!(charset.contains(2));
        assert!(charset.contains(3));
        assert!(!charset.contains(4));
        assert!(!charset.contains(9));
        assert!(charset.contains(10));
        assert!(charset.contains(500));
        assert!(!charset.contains(501));
        assert!(charset.contains(5000));
        assert!(charset.contains(5001));
        assert!(!charset.contains(5002));
        assert!(!charset.contains(9999));
        assert!(charset.contains(10000));
        assert!(!charset.contains(10001));

        assert_eq!(
            charset.iter().map(|ch| ch as u32).collect::<Vec<u32>>(),
            vec![1, 2, 3, 10, 500, 5000, 5001, 10000]
        );
    }

    #[test]
    fn test_charset_from_char_collection() {
        let collection = char_collect!(0..=0, 2..=2, 13..=13, 32..=126);
        let charset = CharSet::from(&collection);
        assert!([0, 2, 13, 32, 54, 126].iter().all(|c| charset.contains(*c)));
        assert!([1, 11, 19, 127, 10000].iter().all(|c| !charset.contains(*c)));
    }
}
char_set/lib.rs

char_set/
lib.rs