rust_icu_ustring/lib.rs
1// Copyright 2019 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! # Implementation of the functions in the ICU4C `ustring.h` header.
16//!
17//! This is where the UTF-8 strings get converted back and forth to the UChar
18//! representation.
19//!
20
21use {
22 log::trace, rust_icu_common as common, rust_icu_sys as sys, rust_icu_sys::*,
23 std::convert::TryFrom, std::os::raw,
24};
25
26/// The implementation of the ICU `UChar*`.
27///
28/// While the original type is defined in `umachine.h`, most useful functions for manipulating
29/// `UChar*` are in fact here.
30///
31/// The first thing you probably want to do is to start from a UTF-8 rust string, produce a UChar.
32/// This is necessarily done with a conversion. See the `TryFrom` implementations in this crate
33/// for that.
34///
35/// Implements `UChar*` from ICU.
36#[derive(Debug, Clone)]
37pub struct UChar {
38 rep: Vec<rust_icu_sys::UChar>,
39}
40
/// Same as `rust_icu_common::buffered_string_method_with_retry`, but for unicode strings.
///
/// The macro expands to a function named `$method_name`. That function takes the low-level
/// method to call, followed by the "before" and "after" arguments, and invokes the method with
/// an output buffer of `$buffer_capacity` elements spliced in between the two argument groups.
/// If the call reports a buffer overflow, or reports a length larger than the capacity, the
/// buffer is resized to the reported length and the call is made once more. The buffer is
/// finally resized to the reported length and returned as a `ustring::UChar`.
///
/// The expansion refers to `sys`, `common` and `ustring` by those names and uses
/// `.try_into()`, so all of them must be in scope wherever the macro is invoked.
///
/// Example use:
///
/// Declares an internal function `select_impl` with a templatized type signature, which is then
/// called in subsequent code.
///
/// ```rust ignore
/// pub fn select_ustring(&self, number: f64) -> Result<ustring::UChar, common::Error> {
///     const BUFFER_CAPACITY: usize = 20;
///     buffered_uchar_method_with_retry!(
///         select_impl,
///         BUFFER_CAPACITY,
///         [rep: *const sys::UPluralRules, number: f64,],
///         []
///     );
///
///     select_impl(
///         versioned_function!(uplrules_select),
///         self.rep.as_ptr(),
///         number,
///     )
/// }
/// ```
#[macro_export]
macro_rules! buffered_uchar_method_with_retry {
    ($method_name:ident, $buffer_capacity:expr,
     [$($before_arg:ident: $before_arg_type:ty,)*],
     [$($after_arg:ident: $after_arg_type:ty,)*]) => {
        fn $method_name(
            method_to_call: unsafe extern "C" fn(
                $($before_arg_type,)*
                *mut sys::UChar,
                i32,
                $($after_arg_type,)*
                *mut sys::UErrorCode,
            ) -> i32,
            $($before_arg: $before_arg_type,)*
            $($after_arg: $after_arg_type,)*
        ) -> Result<ustring::UChar, common::Error> {
            let mut icu_status = common::Error::OK_CODE;
            let mut out: Vec<sys::UChar> = vec![0; $buffer_capacity];

            assert!(common::Error::is_ok(icu_status), "status: {:?}", icu_status);
            // First attempt, using the caller-chosen capacity.
            // Requires that any pointers that are passed in are valid.
            let reported_len: i32 = unsafe {
                method_to_call(
                    $($before_arg,)*
                    out.as_mut_ptr() as *mut sys::UChar,
                    $buffer_capacity as i32,
                    $($after_arg,)*
                    &mut icu_status,
                )
            };

            // ICU methods are inconsistent in whether they silently truncate the output or treat
            // the overflow as an error, so both cases are checked. The capacity conversion is
            // only evaluated when the overflow status was not reported and the status is OK.
            let needs_retry = icu_status == sys::UErrorCode::U_BUFFER_OVERFLOW_ERROR
                || (common::Error::is_ok(icu_status)
                    && reported_len
                        > $buffer_capacity
                            .try_into()
                            .map_err(|e| common::Error::wrapper(e))?);

            if needs_retry {
                icu_status = common::Error::OK_CODE;
                assert!(reported_len > 0);
                let grown_len: usize = reported_len
                    .try_into()
                    .map_err(|e| common::Error::wrapper(e))?;
                out.resize(grown_len, 0);

                assert!(common::Error::is_ok(icu_status), "status: {:?}", icu_status);
                // Second attempt. Same unsafe requirements as above, plus the length passed in
                // must be exactly the output buffer size. The returned length is not used.
                unsafe {
                    method_to_call(
                        $($before_arg,)*
                        out.as_mut_ptr() as *mut sys::UChar,
                        grown_len as i32,
                        $($after_arg,)*
                        &mut icu_status,
                    )
                };
            }

            common::Error::ok_or_warning(icu_status)?;

            // Make the buffer length match the length reported by the first call.
            if reported_len >= 0 {
                let final_len: usize = reported_len
                    .try_into()
                    .map_err(|e| common::Error::wrapper(e))?;
                out.resize(final_len, 0);
            }
            Ok(ustring::UChar::from(out))
        }
    }
}
139
140impl TryFrom<&str> for crate::UChar {
141 type Error = common::Error;
142
143 /// Tries to produce a string from the UTF-8 encoded thing.
144 ///
145 /// This conversion ignores warnings (e.g. warnings about unterminated buffers), since for rust
146 /// they are not relevant.
147 ///
148 /// Implements `u_strFromUTF8`.
149 fn try_from(rust_string: &str) -> Result<Self, Self::Error> {
150 let mut status = common::Error::OK_CODE;
151 let mut dest_length: i32 = 0;
152 // Preflight to see how long the buffer should be. See second call below
153 // for safety notes.
154 //
155 // TODO(fmil): Consider having a try_from variant which allocates a buffer
156 // of sufficient size instead of running the algorithm twice.
157 trace!("utf8->UChar*: {}, {:?}", rust_string.len(), rust_string);
158 // Requires that rust_string be a valid C string.
159 unsafe {
160 assert!(common::Error::is_ok(status));
161 versioned_function!(u_strFromUTF8)(
162 std::ptr::null_mut(),
163 0,
164 &mut dest_length,
165 rust_string.as_ptr() as *const raw::c_char,
166 rust_string.len() as i32,
167 &mut status,
168 );
169 }
170 trace!("before error check");
171 // We expect buffer overflow error here. The API is weird, but there you go.
172 common::Error::ok_preflight(status)?;
173 trace!("input utf8->UChar*: {:?}", rust_string);
174 let mut rep: Vec<sys::UChar> = vec![0; dest_length as usize];
175 let mut status = common::Error::OK_CODE;
176 // Assumes that rust_string contains a valid rust string. It is OK for the string to have
177 // embedded zero bytes. Assumes that 'rep' is large enough to hold the entire result.
178 unsafe {
179 assert!(common::Error::is_ok(status));
180 versioned_function!(u_strFromUTF8)(
181 rep.as_mut_ptr(),
182 rep.len() as i32,
183 &mut dest_length,
184 rust_string.as_ptr() as *const raw::c_char,
185 rust_string.len() as i32,
186 &mut status,
187 );
188 }
189 common::Error::ok_or_warning(status)?;
190 trace!("result utf8->uchar*[{}]: {:?}", dest_length, rep);
191 Ok(crate::UChar { rep })
192 }
193}
194
195impl TryFrom<&UChar> for String {
196 type Error = common::Error;
197
198 /// Tries to produce a UTF-8 encoded rust string from a UChar.
199 ///
200 /// This conversion ignores warnings and only reports actual ICU errors when
201 /// they happen.
202 ///
203 /// Implements `u_strToUTF8`.
204 fn try_from(u: &UChar) -> Result<String, Self::Error> {
205 let mut status = common::Error::OK_CODE;
206 let mut dest_length: i32 = 0;
207 // First probe for required destination length.
208 unsafe {
209 assert!(common::Error::is_ok(status));
210 versioned_function!(u_strToUTF8)(
211 std::ptr::null_mut(),
212 0,
213 &mut dest_length,
214 u.rep.as_ptr(),
215 u.rep.len() as i32,
216 &mut status,
217 );
218 }
219 trace!("preflight UChar*->utf8 buf[{}]", dest_length);
220
221 // The API doesn't really document this well, but the preflight code will report buffer
222 // overflow error even when we are explicitly just trying to check for the size of the
223 // resulting buffer.
224 common::Error::ok_preflight(status)?;
225
226 // Buffer to store the converted string.
227 let mut buf: Vec<u8> = vec![0; dest_length as usize];
228 trace!("pre: result UChar*->utf8 buf[{}]: {:?}", buf.len(), buf);
229 let mut status = common::Error::OK_CODE;
230
231 // Requires that buf is a buffer with enough capacity to store the
232 // resulting string.
233 unsafe {
234 assert!(common::Error::is_ok(status));
235 versioned_function!(u_strToUTF8)(
236 buf.as_mut_ptr() as *mut raw::c_char,
237 buf.len() as i32,
238 &mut dest_length,
239 u.rep.as_ptr(),
240 u.rep.len() as i32,
241 &mut status,
242 );
243 }
244 trace!("post: result UChar*->utf8 buf[{}]: {:?}", buf.len(), buf);
245 common::Error::ok_or_warning(status)?;
246 let s = String::from_utf8(buf);
247 match s {
248 Err(e) => Err(e.into()),
249 Ok(x) => {
250 trace!("result UChar*->utf8: {:?}", x);
251 Ok(x)
252 }
253 }
254 }
255}
256
257impl From<Vec<sys::UChar>> for crate::UChar {
258 /// Adopts a vector of [sys::UChar] into a string.
259 fn from(rep: Vec<sys::UChar>) -> crate::UChar {
260 crate::UChar { rep }
261 }
262}
263
264impl crate::UChar {
265 /// Allocates a new UChar with given capacity.
266 ///
267 /// Capacity and size must always be the same with `UChar` when used for interacting with
268 /// low-level code.
269 pub fn new_with_capacity(capacity: usize) -> crate::UChar {
270 let rep: Vec<sys::UChar> = vec![0; capacity];
271 crate::UChar::from(rep)
272 }
273
274 /// Creates a new [crate::UChar] from its low-level representation, a buffer
275 /// pointer and a buffer size.
276 ///
277 /// Does *not* take ownership of the buffer that was passed in.
278 ///
279 /// **DO NOT USE UNLESS YOU HAVE NO OTHER CHOICE.**
280 ///
281 /// # Safety
282 ///
283 /// `rep` must point to an initialized sequence of at least `len` `UChar`s.
284 pub unsafe fn clone_from_raw_parts(rep: *mut sys::UChar, len: i32) -> crate::UChar {
285 assert!(len >= 0);
286 // Always works for len: i32 >= 0.
287 let cap = len as usize;
288
289 // View the deconstructed buffer as a vector of UChars. Then make a
290 // copy of it to return. This is not efficient, but is always safe.
291 let original = Vec::from_raw_parts(rep, cap, cap);
292 let copy = original.clone();
293 // Don't free the buffer we don't own.
294 std::mem::forget(original);
295 crate::UChar::from(copy)
296 }
297
298 /// Converts into a zeroed-out string.
299 ///
300 /// This is a very weird ICU API thing, where there apparently exists a zero-terminated
301 /// `UChar*`.
302 pub fn make_z(&mut self) {
303 self.rep.push(0);
304 }
305
306 /// Returns the constant pointer to the underlying C representation.
307 /// Intended for use in low-level code.
308 pub fn as_c_ptr(&self) -> *const rust_icu_sys::UChar {
309 self.rep.as_ptr()
310 }
311
312 /// Returns the length of the string, in code points.
313 pub fn len(&self) -> usize {
314 self.rep.len()
315 }
316
317 /// Returns whether the string is empty.
318 pub fn is_empty(&self) -> bool {
319 self.rep.is_empty()
320 }
321
322 /// Returns the underlying representation as a mutable C representation. Caller MUST ensure
323 /// that the representation won't be reallocated as result of adding anything to it, and that
324 /// it is correctly sized, or bad things will happen.
325 pub fn as_mut_c_ptr(&mut self) -> *mut sys::UChar {
326 self.rep.as_mut_ptr()
327 }
328
329 /// Resizes this string to match new_size.
330 ///
331 /// If the string is made longer, the new space is filled with zeroes.
332 pub fn resize(&mut self, new_size: usize) {
333 self.rep.resize(new_size, 0);
334 }
335
336 /// Returns the equivalent UTF-8 string, useful for debugging.
337 pub fn as_string_debug(&self) -> String {
338 String::try_from(self).unwrap()
339 }
340}
341
#[cfg(test)]
mod tests {
    use super::*;

    /// Converts each sample from UTF-8 to `UChar` and back, and checks that the result is
    /// identical to the input. The samples cover the empty string, plain ASCII, a symbol that
    /// needs several UTF-8 bytes, and a character outside the Basic Multilingual Plane.
    #[test]
    fn round_trip_conversion() {
        let samples = [
            "",
            "Hello world!",
            "❤ Hello world ❤",
            "𝄞 Hello world 𝄞",
        ];
        for s in samples.iter() {
            // The failure messages are only formatted when a conversion actually fails.
            let uchar = crate::UChar::try_from(*s)
                .unwrap_or_else(|e| panic!("forward conversion succeeds: {}: {:?}", s, e));
            let res = String::try_from(&uchar)
                .unwrap_or_else(|e| panic!("back conversion succeeds: {:?}: {:?}", uchar, e));
            assert_eq!(*s, res);
        }
    }
}
357}