ttf_parser/tables/cmap/
format2.rs

1// This table has a pretty complex parsing algorithm.
2// A detailed explanation can be found here:
3// https://docs.microsoft.com/en-us/typography/opentype/spec/cmap#format-2-high-byte-mapping-through-table
4// https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6cmap.html
5// https://github.com/fonttools/fonttools/blob/a360252709a3d65f899915db0a5bd753007fdbb7/Lib/fontTools/ttLib/tables/_c_m_a_p.py#L360
6
7use core::convert::TryFrom;
8
9use crate::parser::{Stream, FromData};
10
/// One sub-header record of a format 2 `cmap` subtable.
///
/// Each record describes a contiguous range of low bytes and where
/// the glyph indices for that range are stored.
#[derive(Copy, Clone)]
struct SubHeaderRecord {
    /// First low byte covered by this sub-header.
    first_code: u16,
    /// Number of consecutive low bytes covered, starting at `first_code`.
    entry_count: u16,
    /// Signed delta added to a non-zero glyph index read from the glyph index array.
    id_delta: i16,
    /// Number of bytes from this field's own location to the glyph index of `first_code`.
    id_range_offset: u16,
}
18
19impl FromData for SubHeaderRecord {
20    const SIZE: usize = 8;
21
22    #[inline]
23    fn parse(data: &[u8]) -> Option<Self> {
24        let mut s = Stream::new(data);
25        Some(SubHeaderRecord {
26            first_code: s.read::<u16>()?,
27            entry_count: s.read::<u16>()?,
28            id_delta: s.read::<i16>()?,
29            id_range_offset: s.read::<u16>()?,
30        })
31    }
32}
33
34pub fn parse(data: &[u8], code_point: u32) -> Option<u16> {
35    // This subtable supports code points only in a u16 range.
36    let code_point = u16::try_from(code_point).ok()?;
37
38    let code_point = code_point;
39    let high_byte = code_point >> 8;
40    let low_byte = code_point & 0x00FF;
41
42    let mut s = Stream::new(data);
43    s.skip::<u16>(); // format
44    s.skip::<u16>(); // length
45    s.skip::<u16>(); // language
46    let sub_header_keys = s.read_array16::<u16>(256)?;
47    // The maximum index in a sub_header_keys is a sub_headers count.
48    let sub_headers_count = sub_header_keys.into_iter().map(|n| n / 8).max()? + 1;
49
50    // Remember sub_headers offset before reading. Will be used later.
51    let sub_headers_offset = s.offset();
52    let sub_headers = s.read_array16::<SubHeaderRecord>(sub_headers_count)?;
53
54    let i = if code_point < 0xff {
55        // 'SubHeader 0 is special: it is used for single-byte character codes.'
56        0
57    } else {
58        // 'Array that maps high bytes to subHeaders: value is subHeader index × 8.'
59        sub_header_keys.get(high_byte)? / 8
60    };
61
62    let sub_header = sub_headers.get(i)?;
63
64    let first_code = sub_header.first_code;
65    let range_end = first_code.checked_add(sub_header.entry_count)?;
66    if low_byte < first_code || low_byte >= range_end {
67        return None;
68    }
69
70    // SubHeaderRecord::id_range_offset points to SubHeaderRecord::first_code
71    // in the glyphIndexArray. So we have to advance to our code point.
72    let index_offset = usize::from(low_byte.checked_sub(first_code)?) * u16::SIZE;
73
74    // 'The value of the idRangeOffset is the number of bytes
75    // past the actual location of the idRangeOffset'.
76    let offset =
77        sub_headers_offset
78            // Advance to required subheader.
79            + SubHeaderRecord::SIZE * usize::from(i + 1)
80            // Move back to idRangeOffset start.
81            - u16::SIZE
82            // Use defined offset.
83            + usize::from(sub_header.id_range_offset)
84            // Advance to required index in the glyphIndexArray.
85            + index_offset;
86
87    let glyph: u16 = Stream::read_at(data, offset)?;
88    if glyph == 0 {
89        return None;
90    }
91
92    u16::try_from((i32::from(glyph) + i32::from(sub_header.id_delta)) % 65536).ok()
93}
94
95pub fn codepoints(data: &[u8], mut f: impl FnMut(u32)) -> Option<()> {
96    let mut s = Stream::new(data);
97    s.skip::<u16>(); // format
98    s.skip::<u16>(); // length
99    s.skip::<u16>(); // language
100    let sub_header_keys = s.read_array16::<u16>(256)?;
101
102    // The maximum index in a sub_header_keys is a sub_headers count.
103    let sub_headers_count = sub_header_keys.into_iter().map(|n| n / 8).max()? + 1;
104    let sub_headers = s.read_array16::<SubHeaderRecord>(sub_headers_count)?;
105
106    for first_byte in 0u16..256 {
107        let i = sub_header_keys.get(first_byte)? / 8;
108        let sub_header = sub_headers.get(i)?;
109        let first_code = sub_header.first_code;
110
111        if i == 0 {
112            // This is a single byte code.
113            let range_end = first_code.checked_add(sub_header.entry_count)?;
114            if first_byte >= first_code && first_byte < range_end {
115                f(u32::from(first_byte));
116            }
117        } else {
118            // This is a two byte code.
119            let base = first_code.checked_add(first_byte << 8)?;
120            for k in 0..sub_header.entry_count {
121                let code_point = base.checked_add(k)?;
122                f(u32::from(code_point));
123            }
124        }
125    }
126
127    Some(())
128}
129
#[cfg(test)]
mod tests {
    use crate::parser::FromData;
    use super::{parse, codepoints};

    // Checks that both single-byte and two-byte code points are enumerated.
    #[test]
    fn collect_codepoints() {
        let mut data = vec![
            0x00, 0x02, // format: 2
            0x02, 0x16, // subtable size: 534
            0x00, 0x00, // language ID: 0
        ];

        // Make only high byte 0x28 multi-byte.
        data.extend(std::iter::repeat(0x00).take(256 * u16::SIZE));
        data[6 + 0x28 * u16::SIZE + 1] = 0x08;

        data.extend(&[
            // First sub header (for single byte mapping)
            0x00, 0xFE, // first code: 254
            0x00, 0x02, // entry count: 2
            0x00, 0x00, // id delta: uninteresting
            0x00, 0x00, // id range offset: uninteresting
            // Second sub header (for high byte 0x28)
            0x00, 0x10, // first code: (0x28 << 8) + 0x10 = 10256,
            0x00, 0x03, // entry count: 3
            0x00, 0x00, // id delta: uninteresting
            0x00, 0x00, // id range offset: uninteresting
        ]);

        // Now only glyph ID's would follow. Not interesting for codepoints.

        let mut vec = vec![];
        // The enumeration itself must succeed, not just produce some output.
        let result = codepoints(&data, |c| vec.push(c));
        assert_eq!(result, Some(()));
        assert_eq!(vec, [10256, 10257, 10258, 254, 255]);
    }

    // Checks that lookups just outside of the sub-header range are rejected.
    #[test]
    fn codepoint_at_range_end() {
        let mut data = vec![
            0x00, 0x02, // format: 2
            0x02, 0x14, // subtable size: 532
            0x00, 0x00, // language ID: 0
        ];

        // Only single bytes.
        data.extend(std::iter::repeat(0x00).take(256 * u16::SIZE));
        data.extend(&[
            // First sub header (for single byte mapping)
            0x00, 0x28, // first code: 40
            0x00, 0x02, // entry count: 2
            0x00, 0x00, // id delta: 0
            0x00, 0x02, // id range offset: 2
            // Glyph index
            0x00, 0x64, // glyph ID [0]: 100
            0x03, 0xE8, // glyph ID [1]: 1000
            // Distinct from entry [1], so reading past the range cannot go unnoticed.
            0x27, 0x10, // glyph ID [2]: 10000 (unused)
        ]);

        assert_eq!(parse(&data, 39), None);
        assert_eq!(parse(&data, 40), Some(100));
        assert_eq!(parse(&data, 41), Some(1000));
        assert_eq!(parse(&data, 42), None);
    }
}