unicode_normalization/
lib.rs

1// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Unicode character composition and decomposition utilities
12//! as described in
13//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
14//!
15//! ```rust
16//! extern crate unicode_normalization;
17//!
18//! use unicode_normalization::char::compose;
19//! use unicode_normalization::UnicodeNormalization;
20//!
21//! fn main() {
22//!     assert_eq!(compose('A','\u{30a}'), Some('Å'));
23//!
24//!     let s = "ÅΩ";
25//!     let c = s.nfc().collect::<String>();
26//!     assert_eq!(c, "ÅΩ");
27//! }
28//! ```
29//!
30//! # crates.io
31//!
32//! You can use this package in your project by adding the following
33//! to your `Cargo.toml`:
34//!
35//! ```toml
36//! [dependencies]
37//! unicode-normalization = "0.1.19"
38//! ```
39
40#![deny(missing_docs, unsafe_code)]
41#![doc(
42    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
43    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
44)]
45#![cfg_attr(not(feature = "std"), no_std)]
46
47#[cfg(not(feature = "std"))]
48extern crate alloc;
49
50#[cfg(feature = "std")]
51extern crate core;
52
53extern crate tinyvec;
54
55pub use crate::decompose::Decompositions;
56pub use crate::quick_check::{
57    is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
58    is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
59    IsNormalized,
60};
61pub use crate::recompose::Recompositions;
62pub use crate::replace::Replacements;
63pub use crate::stream_safe::StreamSafe;
64pub use crate::tables::UNICODE_VERSION;
65use core::str::Chars;
66
67mod no_std_prelude;
68
69mod decompose;
70mod lookups;
71mod normalize;
72mod perfect_hash;
73mod quick_check;
74mod recompose;
75mod replace;
76mod stream_safe;
77
78#[rustfmt::skip]
79mod tables;
80
81#[doc(hidden)]
82pub mod __test_api;
83#[cfg(test)]
84mod test;
85
86/// Methods for composing and decomposing characters.
87pub mod char {
88    pub use crate::normalize::{
89        compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
90    };
91
92    pub use crate::lookups::{canonical_combining_class, is_combining_mark};
93
94    /// Return whether the given character is assigned (`General_Category` != `Unassigned`)
95    /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
96    /// of Unicode.
97    pub use crate::tables::is_public_assigned;
98}
99
100/// Methods for iterating over strings while applying Unicode normalizations
101/// as described in
102/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
103pub trait UnicodeNormalization<I: Iterator<Item = char>> {
104    /// Returns an iterator over the string in Unicode Normalization Form D
105    /// (canonical decomposition).
106    fn nfd(self) -> Decompositions<I>;
107
108    /// Returns an iterator over the string in Unicode Normalization Form KD
109    /// (compatibility decomposition).
110    fn nfkd(self) -> Decompositions<I>;
111
112    /// An Iterator over the string in Unicode Normalization Form C
113    /// (canonical decomposition followed by canonical composition).
114    fn nfc(self) -> Recompositions<I>;
115
116    /// An Iterator over the string in Unicode Normalization Form KC
117    /// (compatibility decomposition followed by canonical composition).
118    fn nfkc(self) -> Recompositions<I>;
119
120    /// A transformation which replaces CJK Compatibility Ideograph codepoints
121    /// with normal forms using Standardized Variation Sequences. This is not
122    /// part of the canonical or compatibility decomposition algorithms, but
123    /// performing it before those algorithms produces normalized output which
124    /// better preserves the intent of the original text.
125    ///
126    /// Note that many systems today ignore variation selectors, so these
127    /// may not immediately help text display as intended, but they at
128    /// least preserve the information in a standardized form, giving
129    /// implementations the option to recognize them.
130    fn cjk_compat_variants(self) -> Replacements<I>;
131
132    /// An Iterator over the string with Conjoining Grapheme Joiner characters
133    /// inserted according to the Stream-Safe Text Process (UAX15-D4)
134    fn stream_safe(self) -> StreamSafe<I>;
135}
136
137impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
138    #[inline]
139    fn nfd(self) -> Decompositions<Chars<'a>> {
140        decompose::new_canonical(self.chars())
141    }
142
143    #[inline]
144    fn nfkd(self) -> Decompositions<Chars<'a>> {
145        decompose::new_compatible(self.chars())
146    }
147
148    #[inline]
149    fn nfc(self) -> Recompositions<Chars<'a>> {
150        recompose::new_canonical(self.chars())
151    }
152
153    #[inline]
154    fn nfkc(self) -> Recompositions<Chars<'a>> {
155        recompose::new_compatible(self.chars())
156    }
157
158    #[inline]
159    fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
160        replace::new_cjk_compat_variants(self.chars())
161    }
162
163    #[inline]
164    fn stream_safe(self) -> StreamSafe<Chars<'a>> {
165        StreamSafe::new(self.chars())
166    }
167}
168
169impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
170    #[inline]
171    fn nfd(self) -> Decompositions<I> {
172        decompose::new_canonical(self)
173    }
174
175    #[inline]
176    fn nfkd(self) -> Decompositions<I> {
177        decompose::new_compatible(self)
178    }
179
180    #[inline]
181    fn nfc(self) -> Recompositions<I> {
182        recompose::new_canonical(self)
183    }
184
185    #[inline]
186    fn nfkc(self) -> Recompositions<I> {
187        recompose::new_compatible(self)
188    }
189
190    #[inline]
191    fn cjk_compat_variants(self) -> Replacements<I> {
192        replace::new_cjk_compat_variants(self)
193    }
194
195    #[inline]
196    fn stream_safe(self) -> StreamSafe<I> {
197        StreamSafe::new(self)
198    }
199}