rust_icu_ustring/
lib.rs

1// Copyright 2019 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! # Implementation of the functions in the ICU4C `ustring.h` header.
16//!
17//! This is where the UTF-8 strings get converted back and forth to the UChar
18//! representation.
19//!
20
21use {
22    log::trace, rust_icu_common as common, rust_icu_sys as sys, rust_icu_sys::*,
23    std::convert::TryFrom, std::os::raw,
24};
25
26/// The implementation of the ICU `UChar*`.
27///
28/// While the original type is defined in `umachine.h`, most useful functions for manipulating
29/// `UChar*` are in fact here.
30///
31/// The first thing you probably want to do is to start from a UTF-8 rust string, produce a UChar.
32/// This is necessarily done with a conversion.  See the `TryFrom` implementations in this crate
33/// for that.
34///
35/// Implements `UChar*` from ICU.
36#[derive(Debug, Clone)]
37pub struct UChar {
38    rep: Vec<rust_icu_sys::UChar>,
39}
40
/// Same as `rust_icu_common::buffered_string_method_with_retry`, but for unicode strings.
///
/// Generates a function named `$method_name` which calls an ICU method that writes its result
/// into a caller-supplied `UChar` buffer.  If the first call reports that the buffer was too
/// small, the buffer is grown to the reported length and the method is called once more.
///
/// Macro arguments, in order:
///
/// - `$method_name`: name of the function to generate.
/// - `$buffer_capacity`: initial buffer size in `UChar` units; must be a `usize` expression.
///   It is evaluated exactly once per call of the generated function.
/// - first bracketed list: `name: type,` pairs for the arguments the ICU method takes *before*
///   the `(buffer, capacity)` pair.
/// - second bracketed list: `name: type,` pairs for the arguments the ICU method takes *after*
///   the `(buffer, capacity)` pair and before the trailing `*mut UErrorCode`.
///
/// The expansion refers to `sys`, `common` and `ustring` by exactly those names and calls
/// `try_into()`, so all of them, as well as `std::convert::TryInto` (part of the prelude from
/// edition 2021 on), must be in scope where the macro is invoked.
///
/// The generated function returns an error if the capacity or a reported length does not fit
/// the integer type it is converted to, or if ICU reports a failure.
///
/// Example use:
///
/// Declares an internal function `select_impl` with a templatized type signature, which is then
/// called in subsequent code.
///
/// ```rust ignore
/// pub fn select_ustring(&self, number: f64) -> Result<ustring::UChar, common::Error> {
///     const BUFFER_CAPACITY: usize = 20;
///     buffered_uchar_method_with_retry!(
///         select_impl,
///         BUFFER_CAPACITY,
///         [rep: *const sys::UPluralRules, number: f64,],
///         []
///     );
///
///     select_impl(
///         versioned_function!(uplrules_select),
///         self.rep.as_ptr(),
///         number,
///     )
/// }
/// ```
#[macro_export]
macro_rules! buffered_uchar_method_with_retry {
    ($method_name:ident, $buffer_capacity:expr,
     [$($before_arg:ident: $before_arg_type:ty,)*],
     [$($after_arg:ident: $after_arg_type:ty,)*]) => {
        fn $method_name(
            method_to_call: unsafe extern "C" fn(
                $($before_arg_type,)*
                *mut sys::UChar,
                i32,
                $($after_arg_type,)*
                *mut sys::UErrorCode,
            ) -> i32,
            $($before_arg: $before_arg_type,)*
            $($after_arg: $after_arg_type,)*
        ) -> Result<ustring::UChar, common::Error> {
            // Expand the capacity expression exactly once, so that an expression with side
            // effects is not evaluated repeatedly.
            let capacity: usize = $buffer_capacity;
            // ICU takes the capacity as `i32`; refuse a capacity that does not fit rather than
            // passing a silently truncated value across the FFI boundary.
            let capacity_i32: i32 = capacity
                .try_into()
                .map_err(|e| common::Error::wrapper(e))?;

            let mut status = common::Error::OK_CODE;
            let mut buf: Vec<sys::UChar> = vec![0; capacity];

            // Requires that any pointers that are passed in are valid.
            let full_len: i32 = unsafe {
                assert!(common::Error::is_ok(status), "status: {:?}", status);
                method_to_call(
                    $($before_arg,)*
                    buf.as_mut_ptr(),
                    capacity_i32,
                    $($after_arg,)*
                    &mut status,
                )
            };

            // ICU methods are inconsistent in whether they silently truncate the output or treat
            // the overflow as an error, so we need to check both cases.
            let overflowed = status == sys::UErrorCode::U_BUFFER_OVERFLOW_ERROR
                || (common::Error::is_ok(status) && full_len > capacity_i32);
            if overflowed {
                status = common::Error::OK_CODE;
                assert!(full_len > 0);
                let retry_len: usize = full_len
                    .try_into()
                    .map_err(|e| common::Error::wrapper(e))?;
                buf.resize(retry_len, 0);

                // Same unsafe requirements as above, plus full_len must be exactly the output
                // buffer size.  The length returned by this second call is not used: the
                // buffer has already been sized from the first call's result.
                unsafe {
                    assert!(common::Error::is_ok(status), "status: {:?}", status);
                    method_to_call(
                        $($before_arg,)*
                        buf.as_mut_ptr(),
                        full_len,
                        $($after_arg,)*
                        &mut status,
                    )
                };
            }

            common::Error::ok_or_warning(status)?;

            // Adjust the size of the buffer to the length that ICU reported.
            if full_len >= 0 {
                let final_len: usize = full_len
                    .try_into()
                    .map_err(|e| common::Error::wrapper(e))?;
                buf.resize(final_len, 0);
            }
            Ok(ustring::UChar::from(buf))
        }
    }
}
139
140impl TryFrom<&str> for crate::UChar {
141    type Error = common::Error;
142
143    /// Tries to produce a string from the UTF-8 encoded thing.
144    ///
145    /// This conversion ignores warnings (e.g. warnings about unterminated buffers), since for rust
146    /// they are not relevant.
147    ///
148    /// Implements `u_strFromUTF8`.
149    fn try_from(rust_string: &str) -> Result<Self, Self::Error> {
150        let mut status = common::Error::OK_CODE;
151        let mut dest_length: i32 = 0;
152        // Preflight to see how long the buffer should be. See second call below
153        // for safety notes.
154        //
155        // TODO(fmil): Consider having a try_from variant which allocates a buffer
156        // of sufficient size instead of running the algorithm twice.
157        trace!("utf8->UChar*: {}, {:?}", rust_string.len(), rust_string);
158        // Requires that rust_string be a valid C string.
159        unsafe {
160            assert!(common::Error::is_ok(status));
161            versioned_function!(u_strFromUTF8)(
162                std::ptr::null_mut(),
163                0,
164                &mut dest_length,
165                rust_string.as_ptr() as *const raw::c_char,
166                rust_string.len() as i32,
167                &mut status,
168            );
169        }
170        trace!("before error check");
171        // We expect buffer overflow error here.  The API is weird, but there you go.
172        common::Error::ok_preflight(status)?;
173        trace!("input  utf8->UChar*: {:?}", rust_string);
174        let mut rep: Vec<sys::UChar> = vec![0; dest_length as usize];
175        let mut status = common::Error::OK_CODE;
176        // Assumes that rust_string contains a valid rust string.  It is OK for the string to have
177        // embedded zero bytes.  Assumes that 'rep' is large enough to hold the entire result.
178        unsafe {
179            assert!(common::Error::is_ok(status));
180            versioned_function!(u_strFromUTF8)(
181                rep.as_mut_ptr(),
182                rep.len() as i32,
183                &mut dest_length,
184                rust_string.as_ptr() as *const raw::c_char,
185                rust_string.len() as i32,
186                &mut status,
187            );
188        }
189        common::Error::ok_or_warning(status)?;
190        trace!("result utf8->uchar*[{}]: {:?}", dest_length, rep);
191        Ok(crate::UChar { rep })
192    }
193}
194
195impl TryFrom<&UChar> for String {
196    type Error = common::Error;
197
198    /// Tries to produce a UTF-8 encoded rust string from a UChar.
199    ///
200    /// This conversion ignores warnings and only reports actual ICU errors when
201    /// they happen.
202    ///
203    /// Implements `u_strToUTF8`.
204    fn try_from(u: &UChar) -> Result<String, Self::Error> {
205        let mut status = common::Error::OK_CODE;
206        let mut dest_length: i32 = 0;
207        // First probe for required destination length.
208        unsafe {
209            assert!(common::Error::is_ok(status));
210            versioned_function!(u_strToUTF8)(
211                std::ptr::null_mut(),
212                0,
213                &mut dest_length,
214                u.rep.as_ptr(),
215                u.rep.len() as i32,
216                &mut status,
217            );
218        }
219        trace!("preflight UChar*->utf8 buf[{}]", dest_length);
220
221        // The API doesn't really document this well, but the preflight code will report buffer
222        // overflow error even when we are explicitly just trying to check for the size of the
223        // resulting buffer.
224        common::Error::ok_preflight(status)?;
225
226        // Buffer to store the converted string.
227        let mut buf: Vec<u8> = vec![0; dest_length as usize];
228        trace!("pre:  result UChar*->utf8 buf[{}]: {:?}", buf.len(), buf);
229        let mut status = common::Error::OK_CODE;
230
231        // Requires that buf is a buffer with enough capacity to store the
232        // resulting string.
233        unsafe {
234            assert!(common::Error::is_ok(status));
235            versioned_function!(u_strToUTF8)(
236                buf.as_mut_ptr() as *mut raw::c_char,
237                buf.len() as i32,
238                &mut dest_length,
239                u.rep.as_ptr(),
240                u.rep.len() as i32,
241                &mut status,
242            );
243        }
244        trace!("post: result UChar*->utf8 buf[{}]: {:?}", buf.len(), buf);
245        common::Error::ok_or_warning(status)?;
246        let s = String::from_utf8(buf);
247        match s {
248            Err(e) => Err(e.into()),
249            Ok(x) => {
250                trace!("result UChar*->utf8: {:?}", x);
251                Ok(x)
252            }
253        }
254    }
255}
256
257impl From<Vec<sys::UChar>> for crate::UChar {
258    /// Adopts a vector of [sys::UChar] into a string.
259    fn from(rep: Vec<sys::UChar>) -> crate::UChar {
260        crate::UChar { rep }
261    }
262}
263
264impl crate::UChar {
265    /// Allocates a new UChar with given capacity.
266    ///
267    /// Capacity and size must always be the same with `UChar` when used for interacting with
268    /// low-level code.
269    pub fn new_with_capacity(capacity: usize) -> crate::UChar {
270        let rep: Vec<sys::UChar> = vec![0; capacity];
271        crate::UChar::from(rep)
272    }
273
274    /// Creates a new [crate::UChar] from its low-level representation, a buffer
275    /// pointer and a buffer size.
276    ///
277    /// Does *not* take ownership of the buffer that was passed in.
278    ///
279    /// **DO NOT USE UNLESS YOU HAVE NO OTHER CHOICE.**
280    ///
281    /// # Safety
282    ///
283    /// `rep` must point to an initialized sequence of at least `len` `UChar`s.
284    pub unsafe fn clone_from_raw_parts(rep: *mut sys::UChar, len: i32) -> crate::UChar {
285        assert!(len >= 0);
286        // Always works for len: i32 >= 0.
287        let cap = len as usize;
288
289        // View the deconstructed buffer as a vector of UChars.  Then make a
290        // copy of it to return.  This is not efficient, but is always safe.
291        let original = Vec::from_raw_parts(rep, cap, cap);
292        let copy = original.clone();
293        // Don't free the buffer we don't own.
294        std::mem::forget(original);
295        crate::UChar::from(copy)
296    }
297
298    /// Converts into a zeroed-out string.
299    ///
300    /// This is a very weird ICU API thing, where there apparently exists a zero-terminated
301    /// `UChar*`.
302    pub fn make_z(&mut self) {
303        self.rep.push(0);
304    }
305
306    /// Returns the constant pointer to the underlying C representation.
307    /// Intended for use in low-level code.
308    pub fn as_c_ptr(&self) -> *const rust_icu_sys::UChar {
309        self.rep.as_ptr()
310    }
311
312    /// Returns the length of the string, in code points.
313    pub fn len(&self) -> usize {
314        self.rep.len()
315    }
316
317    /// Returns whether the string is empty.
318    pub fn is_empty(&self) -> bool {
319        self.rep.is_empty()
320    }
321
322    /// Returns the underlying representation as a mutable C representation.  Caller MUST ensure
323    /// that the representation won't be reallocated as result of adding anything to it, and that
324    /// it is correctly sized, or bad things will happen.
325    pub fn as_mut_c_ptr(&mut self) -> *mut sys::UChar {
326        self.rep.as_mut_ptr()
327    }
328
329    /// Resizes this string to match new_size.
330    ///
331    /// If the string is made longer, the new space is filled with zeroes.
332    pub fn resize(&mut self, new_size: usize) {
333        self.rep.resize(new_size, 0);
334    }
335
336    /// Returns the equivalent UTF-8 string, useful for debugging.
337    pub fn as_string_debug(&self) -> String {
338        String::try_from(self).unwrap()
339    }
340}
341
#[cfg(test)]
mod tests {
    use super::*;

    /// Converting UTF-8 to `UChar` and back must reproduce the input exactly.
    #[test]
    fn round_trip_conversion() {
        let samples = ["", "Hello world!", "❤  Hello world  ❤"];
        for &s in samples.iter() {
            // The failure messages are built lazily, so nothing is formatted on success.
            let uchar = crate::UChar::try_from(s)
                .unwrap_or_else(|e| panic!("forward conversion succeeds: {}: {:?}", s, e));
            let res = String::try_from(&uchar)
                .unwrap_or_else(|e| panic!("back conversion succeeds: {:?}: {:?}", uchar, e));
            assert_eq!(s, res);
        }
    }
}
357}