pest/unicode/
mod.rs

//! Character inclusion in binary, General_Category value, or Script Unicode sets.
2//!
3//! We rely on dead code elimination to remove the tables that aren't needed.
4
5#![allow(bad_style)]
6#![allow(clippy::all)]
7
8use alloc::boxed::Box;
9
// Declares the table module `$table_mod` and, for every listed property, a public
// predicate function with the same name as the table it queries. Also emits a public
// static slice holding the stringified property names, in the order they were listed.
macro_rules! property_functions {
    ($table_mod:ident, $names_static:ident, [$($property:ident,)*]) => {
        #[allow(unused)]
        mod $table_mod;

        pub static $names_static: &[&str] = &[$(stringify!($property),)*];

        // Usage: unicode::ALPHABETIC('a')
        $(
            pub fn $property(c: char) -> bool {
                // The explicit `self::$table_mod::` path selects the table inside the
                // module rather than this function, which shares its name.
                self::$table_mod::$property.contains_char(c)
            }
        )*
    };
}
26
// Front end for `property_functions!`. Accepts any number of
// `mod <module>; static <NAMES> = [...];` groups and expands each group on its own.
macro_rules! char_property_functions {
    // Form 1: the list contains bare property identifiers (hand-written names).
    {
        $(
            mod $table_mod:ident;
            static $names_static:ident = [$($property:ident,)*];
        )*
    } => {
        $(property_functions!($table_mod, $names_static, [$($property,)*]);)*
    };
    // Form 2: the list contains `(label, IDENT)` pairs copied from the `BY_NAME` values
    // generated by `ucd-generate`. The label is matched but discarded; only the
    // identifier is forwarded.
    {
        $(
            mod $table_mod:ident;
            static $names_static:ident = [$(($_label:tt, $property:ident),)*];
        )*
    } => {
        $(property_functions!($table_mod, $names_static, [$($property,)*]);)*
    };
}
51
// Binary properties: declares `mod binary`, generates one `pub fn NAME(c: char) -> bool`
// per identifier listed below, and collects the identifiers (as strings, in this order)
// into the `BINARY_PROPERTY_NAMES` static.
char_property_functions! {
    mod binary;
    static BINARY_PROPERTY_NAMES = [
        // ASCII_HEX_DIGIT is deliberately left out so that it can be stripped:
        // the full trie is wasteful for an ASCII-only set.
        ALPHABETIC, BIDI_CONTROL, CASE_IGNORABLE, CASED, CHANGES_WHEN_CASEFOLDED,
        CHANGES_WHEN_CASEMAPPED, CHANGES_WHEN_LOWERCASED, CHANGES_WHEN_TITLECASED,
        CHANGES_WHEN_UPPERCASED, DASH, DEFAULT_IGNORABLE_CODE_POINT, DEPRECATED, DIACRITIC,
        EMOJI, EMOJI_COMPONENT, EMOJI_MODIFIER, EMOJI_MODIFIER_BASE, EMOJI_PRESENTATION, EXTENDED_PICTOGRAPHIC,
        EXTENDER, GRAPHEME_BASE, GRAPHEME_EXTEND, GRAPHEME_LINK, HEX_DIGIT, HYPHEN,
        IDS_BINARY_OPERATOR, IDS_TRINARY_OPERATOR, ID_CONTINUE, ID_START, IDEOGRAPHIC, JOIN_CONTROL,
        LOGICAL_ORDER_EXCEPTION, LOWERCASE, MATH, NONCHARACTER_CODE_POINT, OTHER_ALPHABETIC,
        OTHER_DEFAULT_IGNORABLE_CODE_POINT, OTHER_GRAPHEME_EXTEND, OTHER_ID_CONTINUE,
        OTHER_ID_START, OTHER_LOWERCASE, OTHER_MATH, OTHER_UPPERCASE, PATTERN_SYNTAX,
        PATTERN_WHITE_SPACE, PREPENDED_CONCATENATION_MARK, QUOTATION_MARK, RADICAL,
        REGIONAL_INDICATOR, SENTENCE_TERMINAL, SOFT_DOTTED, TERMINAL_PUNCTUATION, UNIFIED_IDEOGRAPH,
        UPPERCASE, VARIATION_SELECTOR, WHITE_SPACE, XID_CONTINUE, XID_START,
    ];
}
70
// General_Category values and scripts. Both lists use the `(label, IDENT)` form of
// `char_property_functions!`: the string label is ignored by the macro and only the
// identifier is used, both as the generated function name and (stringified) as the
// entry in the corresponding `*_PROPERTY_NAMES` static.
char_property_functions! {
    mod category;
    // Entries copied from `category::BY_NAME`.
    static CATEGORY_PROPERTY_NAMES = [
        ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION),
        ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL),
        ("Currency_Symbol", CURRENCY_SYMBOL),
        ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER),
        ("Enclosing_Mark", ENCLOSING_MARK),
        ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT),
        ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER),
        ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR),
        ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK),
        ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER),
        ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK),
        ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION),
        ("Other", OTHER), ("Other_Letter", OTHER_LETTER),
        ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION),
        ("Other_Symbol", OTHER_SYMBOL),
        ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE),
        ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR),
        ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK),
        ("Surrogate", SURROGATE), ("Symbol", SYMBOL),
        ("Titlecase_Letter", TITLECASE_LETTER), ("Unassigned", UNASSIGNED),
        ("Uppercase_Letter", UPPERCASE_LETTER),
    ];

    mod script;
    // Entries copied from `script::BY_NAME`.
    static SCRIPT_PROPERTY_NAMES = [
        ("Adlam", ADLAM),
        ("Ahom", AHOM),
        ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS),
        ("Arabic", ARABIC),
        ("Armenian", ARMENIAN),
        ("Avestan", AVESTAN),
        ("Balinese", BALINESE),
        ("Bamum", BAMUM),
        ("Bassa_Vah", BASSA_VAH),
        ("Batak", BATAK),
        ("Bengali", BENGALI),
        ("Bhaiksuki", BHAIKSUKI),
        ("Bopomofo", BOPOMOFO),
        ("Brahmi", BRAHMI),
        ("Braille", BRAILLE),
        ("Buginese", BUGINESE),
        ("Buhid", BUHID),
        ("Canadian_Aboriginal", CANADIAN_ABORIGINAL),
        ("Carian", CARIAN),
        ("Caucasian_Albanian", CAUCASIAN_ALBANIAN),
        ("Chakma", CHAKMA),
        ("Cham", CHAM),
        ("Cherokee", CHEROKEE),
        ("Chorasmian", CHORASMIAN),
        ("Common", COMMON),
        ("Coptic", COPTIC),
        ("Cuneiform", CUNEIFORM),
        ("Cypriot", CYPRIOT),
        ("Cypro_Minoan", CYPRO_MINOAN),
        ("Cyrillic", CYRILLIC),
        ("Deseret", DESERET),
        ("Devanagari", DEVANAGARI),
        ("Dives_Akuru", DIVES_AKURU),
        ("Dogra", DOGRA),
        ("Duployan", DUPLOYAN),
        ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
        ("Elbasan", ELBASAN),
        ("Elymaic", ELYMAIC),
        ("Ethiopic", ETHIOPIC),
        ("Georgian", GEORGIAN),
        ("Glagolitic", GLAGOLITIC),
        ("Gothic", GOTHIC),
        ("Grantha", GRANTHA),
        ("Greek", GREEK),
        ("Gujarati", GUJARATI),
        ("Gunjala_Gondi", GUNJALA_GONDI),
        ("Gurmukhi", GURMUKHI),
        ("Han", HAN),
        ("Hangul", HANGUL),
        ("Hanifi_Rohingya", HANIFI_ROHINGYA),
        ("Hanunoo", HANUNOO),
        ("Hatran", HATRAN),
        ("Hebrew", HEBREW),
        ("Hiragana", HIRAGANA),
        ("Imperial_Aramaic", IMPERIAL_ARAMAIC),
        ("Inherited", INHERITED),
        ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI),
        ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN),
        ("Javanese", JAVANESE),
        ("Kaithi", KAITHI),
        ("Kannada", KANNADA),
        ("Katakana", KATAKANA),
        ("Kawi", KAWI),
        ("Kayah_Li", KAYAH_LI),
        ("Kharoshthi", KHAROSHTHI),
        ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
        ("Khmer", KHMER),
        ("Khojki", KHOJKI),
        ("Khudawadi", KHUDAWADI),
        ("Lao", LAO),
        ("Latin", LATIN),
        ("Lepcha", LEPCHA),
        ("Limbu", LIMBU),
        ("Linear_A", LINEAR_A),
        ("Linear_B", LINEAR_B),
        ("Lisu", LISU),
        ("Lycian", LYCIAN),
        ("Lydian", LYDIAN),
        ("Mahajani", MAHAJANI),
        ("Makasar", MAKASAR),
        ("Malayalam", MALAYALAM),
        ("Mandaic", MANDAIC),
        ("Manichaean", MANICHAEAN),
        ("Marchen", MARCHEN),
        ("Masaram_Gondi", MASARAM_GONDI),
        ("Medefaidrin", MEDEFAIDRIN),
        ("Meetei_Mayek", MEETEI_MAYEK),
        ("Mende_Kikakui", MENDE_KIKAKUI),
        ("Meroitic_Cursive", MEROITIC_CURSIVE),
        ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS),
        ("Miao", MIAO),
        ("Modi", MODI),
        ("Mongolian", MONGOLIAN),
        ("Mro", MRO),
        ("Multani", MULTANI),
        ("Myanmar", MYANMAR),
        ("Nabataean", NABATAEAN),
        ("Nag_Mundari", NAG_MUNDARI),
        ("Nandinagari", NANDINAGARI),
        ("New_Tai_Lue", NEW_TAI_LUE),
        ("Newa", NEWA),
        ("Nko", NKO),
        ("Nushu", NUSHU),
        ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG),
        ("Ogham", OGHAM),
        ("Ol_Chiki", OL_CHIKI),
        ("Old_Hungarian", OLD_HUNGARIAN),
        ("Old_Italic", OLD_ITALIC),
        ("Old_North_Arabian", OLD_NORTH_ARABIAN),
        ("Old_Permic", OLD_PERMIC),
        ("Old_Persian", OLD_PERSIAN),
        ("Old_Sogdian", OLD_SOGDIAN),
        ("Old_South_Arabian", OLD_SOUTH_ARABIAN),
        ("Old_Turkic", OLD_TURKIC),
        ("Old_Uyghur", OLD_UYGHUR),
        ("Oriya", ORIYA),
        ("Osage", OSAGE),
        ("Osmanya", OSMANYA),
        ("Pahawh_Hmong", PAHAWH_HMONG),
        ("Palmyrene", PALMYRENE),
        ("Pau_Cin_Hau", PAU_CIN_HAU),
        ("Phags_Pa", PHAGS_PA),
        ("Phoenician", PHOENICIAN),
        ("Psalter_Pahlavi", PSALTER_PAHLAVI),
        ("Rejang", REJANG),
        ("Runic", RUNIC),
        ("Samaritan", SAMARITAN),
        ("Saurashtra", SAURASHTRA),
        ("Sharada", SHARADA),
        ("Shavian", SHAVIAN),
        ("Siddham", SIDDHAM),
        ("SignWriting", SIGNWRITING),
        ("Sinhala", SINHALA),
        ("Sogdian", SOGDIAN),
        ("Sora_Sompeng", SORA_SOMPENG),
        ("Soyombo", SOYOMBO),
        ("Sundanese", SUNDANESE),
        ("Syloti_Nagri", SYLOTI_NAGRI),
        ("Syriac", SYRIAC),
        ("Tagalog", TAGALOG),
        ("Tagbanwa", TAGBANWA),
        ("Tai_Le", TAI_LE),
        ("Tai_Tham", TAI_THAM),
        ("Tai_Viet", TAI_VIET),
        ("Takri", TAKRI),
        ("Tamil", TAMIL),
        ("Tangsa", TANGSA),
        ("Tangut", TANGUT),
        ("Telugu", TELUGU),
        ("Thaana", THAANA),
        ("Thai", THAI),
        ("Tibetan", TIBETAN),
        ("Tifinagh", TIFINAGH),
        ("Tirhuta", TIRHUTA),
        ("Toto", TOTO),
        ("Ugaritic", UGARITIC),
        ("Vai", VAI),
        ("Vithkuqi", VITHKUQI),
        ("Wancho", WANCHO),
        ("Warang_Citi", WARANG_CITI),
        ("Yezidi", YEZIDI),
        ("Yi", YI),
        ("Zanabazar_Square", ZANABAZAR_SQUARE),
    ];
}
266
267/// Return all available unicode property names
268pub fn unicode_property_names() -> Box<dyn Iterator<Item = &'static str>> {
269    Box::new(
270        BINARY_PROPERTY_NAMES
271            .iter()
272            .map(|name| *name)
273            .chain(CATEGORY_PROPERTY_NAMES.iter().map(|name| *name))
274            .chain(SCRIPT_PROPERTY_NAMES.iter().map(|name| *name)),
275    )
276}
277
278pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> {
279    for property in binary::BY_NAME {
280        if name == property.0.to_uppercase() {
281            return Some(Box::new(move |c| property.1.contains_char(c)));
282        }
283    }
284
285    for property in category::BY_NAME {
286        if name == property.0.to_uppercase() {
287            return Some(Box::new(move |c| property.1.contains_char(c)));
288        }
289    }
290
291    for property in script::BY_NAME {
292        if name == property.0.to_uppercase() {
293            return Some(Box::new(move |c| property.1.contains_char(c)));
294        }
295    }
296
297    None
298}