//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
//!
//! This library uses Rust’s type system to maintain
//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
//! like the `String` and `&str` types do for UTF-8.
//!
//! [WTF-8 must not be used
//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
//! so safe code in this module cannot decode WTF-8 from arbitrary bytes:
//! WTF-8 strings are obtained from UTF-8, UTF-16, or code points.
//!
//! Note that this copy of the module does expose the underlying bytes
//! read-only (`Wtf8::as_bytes`, `Wtf8Buf::into_bytes`) and offers `unsafe`
//! constructors from raw bytes (`from_bytes_unchecked`); callers of the
//! latter are responsible for well-formedness.

// This module is imported from @SimonSapin's repo and has tons of dead code on
// unix (it's mostly used on windows), so the `dead_code` lint is silenced for
// the whole module instead of item by item.
#![allow(dead_code)]

// Unit tests live in a sibling `tests` module and are only built for tests.
#[cfg(test)]
mod tests;

use core::char::{encode_utf16_raw, encode_utf8_raw};
use core::str::next_code_point;

use crate::std::borrow::Cow;
use crate::std::collections::TryReserveError;
use crate::std::fmt;
use crate::std::hash::{Hash, Hasher};
use crate::std::iter::FusedIterator;
use crate::std::mem;
use crate::std::ops;
use crate::std::rc::Rc;
use crate::std::slice;
use crate::std::str;
use crate::std::sync::Arc;
use crate::std::sys_common::AsInner;

/// U+FFFD REPLACEMENT CHARACTER as a UTF-8 string.
///
/// It is three bytes long in UTF-8, the same length as a WTF-8 encoded
/// surrogate, which is what allows the lossy conversions in this module to
/// overwrite surrogates in place.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";

/// A Unicode code point: from U+0000 to U+10FFFF.
///
/// Compares with the `char` type,
/// which represents a Unicode scalar value:
/// a code point that is not a surrogate (U+D800 to U+DFFF).
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
pub struct CodePoint {
    // Numeric value of the code point. The checked constructor `from_u32`
    // only admits values up to 0x10FFFF; `from_u32_unchecked` leaves that
    // obligation to the caller.
    value: u32,
}

/// Format the code point as `U+` followed by four to six hexadecimal digits.
50 /// Example: `U+1F4A9` 51 impl fmt::Debug for CodePoint { 52 #[inline] 53 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { 54 write!(formatter, "U+{:04X}", self.value) 55 } 56 } 57 58 impl CodePoint { 59 /// Unsafely creates a new `CodePoint` without checking the value. 60 /// 61 /// Only use when `value` is known to be less than or equal to 0x10FFFF. 62 #[inline] 63 pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { 64 CodePoint { value } 65 } 66 67 /// Creates a new `CodePoint` if the value is a valid code point. 68 /// 69 /// Returns `None` if `value` is above 0x10FFFF. 70 #[inline] 71 pub fn from_u32(value: u32) -> Option<CodePoint> { 72 match value { 73 0..=0x10FFFF => Some(CodePoint { value }), 74 _ => None, 75 } 76 } 77 78 /// Creates a new `CodePoint` from a `char`. 79 /// 80 /// Since all Unicode scalar values are code points, this always succeeds. 81 #[inline] 82 pub fn from_char(value: char) -> CodePoint { 83 CodePoint { 84 value: value as u32, 85 } 86 } 87 88 /// Returns the numeric value of the code point. 89 #[inline] 90 pub fn to_u32(&self) -> u32 { 91 self.value 92 } 93 94 /// Returns the numeric value of the code point if it is a leading surrogate. 95 #[inline] 96 pub fn to_lead_surrogate(&self) -> Option<u16> { 97 match self.value { 98 lead @ 0xD800..=0xDBFF => Some(lead as u16), 99 _ => None, 100 } 101 } 102 103 /// Returns the numeric value of the code point if it is a trailing surrogate. 104 #[inline] 105 pub fn to_trail_surrogate(&self) -> Option<u16> { 106 match self.value { 107 trail @ 0xDC00..=0xDFFF => Some(trail as u16), 108 _ => None, 109 } 110 } 111 112 /// Optionally returns a Unicode scalar value for the code point. 113 /// 114 /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). 
115 #[inline] 116 pub fn to_char(&self) -> Option<char> { 117 match self.value { 118 0xD800..=0xDFFF => None, 119 _ => Some(unsafe { char::from_u32_unchecked(self.value) }), 120 } 121 } 122 123 /// Returns a Unicode scalar value for the code point. 124 /// 125 /// Returns `'\u{FFFD}'` (the replacement character “�”) 126 /// if the code point is a surrogate (from U+D800 to U+DFFF). 127 #[inline] 128 pub fn to_char_lossy(&self) -> char { 129 self.to_char().unwrap_or('\u{FFFD}') 130 } 131 } 132 133 /// An owned, growable string of well-formed WTF-8 data. 134 /// 135 /// Similar to `String`, but can additionally contain surrogate code points 136 /// if they’re not in a surrogate pair. 137 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] 138 pub struct Wtf8Buf { 139 bytes: Vec<u8>, 140 141 /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily 142 /// know this if we're constructed from a `String` or `&str`. 143 /// 144 /// It is possible for `bytes` to have valid UTF-8 without this being 145 /// set, such as when we're concatenating `&Wtf8`'s and surrogates become 146 /// paired, as we don't bother to rescan the entire string. 147 is_known_utf8: bool, 148 } 149 150 impl ops::Deref for Wtf8Buf { 151 type Target = Wtf8; 152 153 fn deref(&self) -> &Wtf8 { 154 self.as_slice() 155 } 156 } 157 158 impl ops::DerefMut for Wtf8Buf { 159 fn deref_mut(&mut self) -> &mut Wtf8 { 160 self.as_mut_slice() 161 } 162 } 163 164 /// Format the string with double quotes, 165 /// and surrogates as `\u` followed by four hexadecimal digits. 166 /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] 167 impl fmt::Debug for Wtf8Buf { 168 #[inline] 169 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { 170 fmt::Debug::fmt(&**self, formatter) 171 } 172 } 173 174 impl Wtf8Buf { 175 /// Creates a new, empty WTF-8 string. 
176 #[inline] 177 pub fn new() -> Wtf8Buf { 178 Wtf8Buf { 179 bytes: Vec::new(), 180 is_known_utf8: true, 181 } 182 } 183 184 /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes. 185 #[inline] 186 pub fn with_capacity(capacity: usize) -> Wtf8Buf { 187 Wtf8Buf { 188 bytes: Vec::with_capacity(capacity), 189 is_known_utf8: true, 190 } 191 } 192 193 /// Creates a WTF-8 string from a WTF-8 byte vec. 194 /// 195 /// Since the byte vec is not checked for valid WTF-8, this functions is 196 /// marked unsafe. 197 #[inline] 198 pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf { 199 Wtf8Buf { 200 bytes: value, 201 is_known_utf8: false, 202 } 203 } 204 205 /// Creates a WTF-8 string from a UTF-8 `String`. 206 /// 207 /// This takes ownership of the `String` and does not copy. 208 /// 209 /// Since WTF-8 is a superset of UTF-8, this always succeeds. 210 #[inline] 211 pub fn from_string(string: String) -> Wtf8Buf { 212 Wtf8Buf { 213 bytes: string.into_bytes(), 214 is_known_utf8: true, 215 } 216 } 217 218 /// Creates a WTF-8 string from a UTF-8 `&str` slice. 219 /// 220 /// This copies the content of the slice. 221 /// 222 /// Since WTF-8 is a superset of UTF-8, this always succeeds. 223 #[inline] 224 pub fn from_str(str: &str) -> Wtf8Buf { 225 Wtf8Buf { 226 bytes: <[_]>::to_vec(str.as_bytes()), 227 is_known_utf8: true, 228 } 229 } 230 231 pub fn clear(&mut self) { 232 self.bytes.clear(); 233 self.is_known_utf8 = true; 234 } 235 236 /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units. 237 /// 238 /// This is lossless: calling `.encode_wide()` on the resulting string 239 /// will always return the original code units. 
240 pub fn from_wide(v: &[u16]) -> Wtf8Buf { 241 let mut string = Wtf8Buf::with_capacity(v.len()); 242 for item in char::decode_utf16(v.iter().cloned()) { 243 match item { 244 Ok(ch) => string.push_char(ch), 245 Err(surrogate) => { 246 let surrogate = surrogate.unpaired_surrogate(); 247 // Surrogates are known to be in the code point range. 248 let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) }; 249 // The string will now contain an unpaired surrogate. 250 string.is_known_utf8 = false; 251 // Skip the WTF-8 concatenation check, 252 // surrogate pairs are already decoded by decode_utf16 253 string.push_code_point_unchecked(code_point); 254 } 255 } 256 } 257 string 258 } 259 260 /// Copied from String::push 261 /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check. 262 fn push_code_point_unchecked(&mut self, code_point: CodePoint) { 263 let mut bytes = [0; 4]; 264 let bytes = encode_utf8_raw(code_point.value, &mut bytes); 265 self.bytes.extend_from_slice(bytes) 266 } 267 268 #[inline] 269 pub fn as_slice(&self) -> &Wtf8 { 270 unsafe { Wtf8::from_bytes_unchecked(&self.bytes) } 271 } 272 273 #[inline] 274 pub fn as_mut_slice(&mut self) -> &mut Wtf8 { 275 // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would 276 // cause them to change from well-formed UTF-8 to ill-formed UTF-8, 277 // which would break the assumptions of the `is_known_utf8` field. 278 unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) } 279 } 280 281 /// Reserves capacity for at least `additional` more bytes to be inserted 282 /// in the given `Wtf8Buf`. 283 /// The collection may reserve more space to avoid frequent reallocations. 284 /// 285 /// # Panics 286 /// 287 /// Panics if the new capacity overflows `usize`. 
288 #[inline] 289 pub fn reserve(&mut self, additional: usize) { 290 self.bytes.reserve(additional) 291 } 292 293 /// Tries to reserve capacity for at least `additional` more length units 294 /// in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to avoid 295 /// frequent reallocations. After calling `try_reserve`, capacity will be 296 /// greater than or equal to `self.len() + additional`. Does nothing if 297 /// capacity is already sufficient. This method preserves the contents even 298 /// if an error occurs. 299 /// 300 /// # Errors 301 /// 302 /// If the capacity overflows, or the allocator reports a failure, then an error 303 /// is returned. 304 #[inline] 305 pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { 306 self.bytes.try_reserve(additional) 307 } 308 309 #[inline] 310 pub fn reserve_exact(&mut self, additional: usize) { 311 self.bytes.reserve_exact(additional) 312 } 313 314 /// Tries to reserve the minimum capacity for exactly `additional` 315 /// length units in the given `Wtf8Buf`. After calling 316 /// `try_reserve_exact`, capacity will be greater than or equal to 317 /// `self.len() + additional` if it returns `Ok(())`. 318 /// Does nothing if the capacity is already sufficient. 319 /// 320 /// Note that the allocator may give the `Wtf8Buf` more space than it 321 /// requests. Therefore, capacity can not be relied upon to be precisely 322 /// minimal. Prefer [`try_reserve`] if future insertions are expected. 323 /// 324 /// [`try_reserve`]: Wtf8Buf::try_reserve 325 /// 326 /// # Errors 327 /// 328 /// If the capacity overflows, or the allocator reports a failure, then an error 329 /// is returned. 
330 #[inline] 331 pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { 332 self.bytes.try_reserve_exact(additional) 333 } 334 335 #[inline] 336 pub fn shrink_to_fit(&mut self) { 337 self.bytes.shrink_to_fit() 338 } 339 340 #[inline] 341 pub fn shrink_to(&mut self, min_capacity: usize) { 342 self.bytes.shrink_to(min_capacity) 343 } 344 345 /// Returns the number of bytes that this string buffer can hold without reallocating. 346 #[inline] 347 pub fn capacity(&self) -> usize { 348 self.bytes.capacity() 349 } 350 351 /// Append a UTF-8 slice at the end of the string. 352 #[inline] 353 pub fn push_str(&mut self, other: &str) { 354 self.bytes.extend_from_slice(other.as_bytes()) 355 } 356 357 /// Append a WTF-8 slice at the end of the string. 358 /// 359 /// This replaces newly paired surrogates at the boundary 360 /// with a supplementary code point, 361 /// like concatenating ill-formed UTF-16 strings effectively would. 362 #[inline] 363 pub fn push_wtf8(&mut self, other: &Wtf8) { 364 match ( 365 (&*self).final_lead_surrogate(), 366 other.initial_trail_surrogate(), 367 ) { 368 // Replace newly paired surrogates by a supplementary code point. 369 (Some(lead), Some(trail)) => { 370 let len_without_lead_surrogate = self.len() - 3; 371 self.bytes.truncate(len_without_lead_surrogate); 372 let other_without_trail_surrogate = &other.bytes[3..]; 373 // 4 bytes for the supplementary code point 374 self.bytes.reserve(4 + other_without_trail_surrogate.len()); 375 self.push_char(decode_surrogate_pair(lead, trail)); 376 self.bytes.extend_from_slice(other_without_trail_surrogate); 377 } 378 _ => { 379 // If we'll be pushing a string containing a surrogate, we may 380 // no longer have UTF-8. 381 if other.next_surrogate(0).is_some() { 382 self.is_known_utf8 = false; 383 } 384 385 self.bytes.extend_from_slice(&other.bytes); 386 } 387 } 388 } 389 390 /// Append a Unicode scalar value at the end of the string. 
391 #[inline] 392 pub fn push_char(&mut self, c: char) { 393 self.push_code_point_unchecked(CodePoint::from_char(c)) 394 } 395 396 /// Append a code point at the end of the string. 397 /// 398 /// This replaces newly paired surrogates at the boundary 399 /// with a supplementary code point, 400 /// like concatenating ill-formed UTF-16 strings effectively would. 401 #[inline] 402 pub fn push(&mut self, code_point: CodePoint) { 403 if let Some(trail) = code_point.to_trail_surrogate() { 404 if let Some(lead) = (&*self).final_lead_surrogate() { 405 let len_without_lead_surrogate = self.len() - 3; 406 self.bytes.truncate(len_without_lead_surrogate); 407 self.push_char(decode_surrogate_pair(lead, trail)); 408 return; 409 } 410 411 // We're pushing a trailing surrogate. 412 self.is_known_utf8 = false; 413 } else if code_point.to_lead_surrogate().is_some() { 414 // We're pushing a leading surrogate. 415 self.is_known_utf8 = false; 416 } 417 418 // No newly paired surrogates at the boundary. 419 self.push_code_point_unchecked(code_point) 420 } 421 422 /// Shortens a string to the specified length. 423 /// 424 /// # Panics 425 /// 426 /// Panics if `new_len` > current length, 427 /// or if `new_len` is not a code point boundary. 428 #[inline] 429 pub fn truncate(&mut self, new_len: usize) { 430 assert!(is_code_point_boundary(self, new_len)); 431 self.bytes.truncate(new_len) 432 } 433 434 /// Consumes the WTF-8 string and tries to convert it to a vec of bytes. 435 #[inline] 436 pub fn into_bytes(self) -> Vec<u8> { 437 self.bytes 438 } 439 440 /// Consumes the WTF-8 string and tries to convert it to UTF-8. 441 /// 442 /// This does not copy the data. 443 /// 444 /// If the contents are not well-formed UTF-8 445 /// (that is, if the string contains surrogates), 446 /// the original WTF-8 string is returned instead. 
447 pub fn into_string(self) -> Result<String, Wtf8Buf> { 448 if self.is_known_utf8 || self.next_surrogate(0).is_none() { 449 Ok(unsafe { String::from_utf8_unchecked(self.bytes) }) 450 } else { 451 Err(self) 452 } 453 } 454 455 /// Consumes the WTF-8 string and converts it lossily to UTF-8. 456 /// 457 /// This does not copy the data (but may overwrite parts of it in place). 458 /// 459 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”) 460 pub fn into_string_lossy(mut self) -> String { 461 // Fast path: If we already have UTF-8, we can return it immediately. 462 if self.is_known_utf8 { 463 return unsafe { String::from_utf8_unchecked(self.bytes) }; 464 } 465 466 let mut pos = 0; 467 loop { 468 match self.next_surrogate(pos) { 469 Some((surrogate_pos, _)) => { 470 pos = surrogate_pos + 3; 471 self.bytes[surrogate_pos..pos] 472 .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); 473 } 474 None => return unsafe { String::from_utf8_unchecked(self.bytes) }, 475 } 476 } 477 } 478 479 /// Converts this `Wtf8Buf` into a boxed `Wtf8`. 480 #[inline] 481 pub fn into_box(self) -> Box<Wtf8> { 482 // SAFETY: relies on `Wtf8` being `repr(transparent)`. 483 unsafe { mem::transmute(self.bytes.into_boxed_slice()) } 484 } 485 486 /// Converts a `Box<Wtf8>` into a `Wtf8Buf`. 487 pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf { 488 let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) }; 489 Wtf8Buf { 490 bytes: bytes.into_vec(), 491 is_known_utf8: false, 492 } 493 } 494 } 495 496 /// Creates a new WTF-8 string from an iterator of code points. 497 /// 498 /// This replaces surrogate code point pairs with supplementary code points, 499 /// like concatenating ill-formed UTF-16 strings effectively would. 
500 impl FromIterator<CodePoint> for Wtf8Buf { 501 fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf { 502 let mut string = Wtf8Buf::new(); 503 string.extend(iter); 504 string 505 } 506 } 507 508 /// Append code points from an iterator to the string. 509 /// 510 /// This replaces surrogate code point pairs with supplementary code points, 511 /// like concatenating ill-formed UTF-16 strings effectively would. 512 impl Extend<CodePoint> for Wtf8Buf { 513 fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) { 514 let iterator = iter.into_iter(); 515 let (low, _high) = iterator.size_hint(); 516 // Lower bound of one byte per code point (ASCII only) 517 self.bytes.reserve(low); 518 iterator.for_each(move |code_point| self.push(code_point)); 519 } 520 521 #[inline] 522 fn extend_one(&mut self, code_point: CodePoint) { 523 self.push(code_point); 524 } 525 526 #[inline] 527 fn extend_reserve(&mut self, additional: usize) { 528 // Lower bound of one byte per code point (ASCII only) 529 self.bytes.reserve(additional); 530 } 531 } 532 533 /// A borrowed slice of well-formed WTF-8 data. 534 /// 535 /// Similar to `&str`, but can additionally contain surrogate code points 536 /// if they’re not in a surrogate pair. 537 #[derive(Eq, Ord, PartialEq, PartialOrd)] 538 #[repr(transparent)] 539 pub struct Wtf8 { 540 bytes: [u8], 541 } 542 543 impl AsInner<[u8]> for Wtf8 { 544 #[inline] 545 fn as_inner(&self) -> &[u8] { 546 &self.bytes 547 } 548 } 549 550 /// Format the slice with double quotes, 551 /// and surrogates as `\u` followed by four hexadecimal digits. 552 /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800] 553 impl fmt::Debug for Wtf8 { 554 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { 555 fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result { 556 use crate::std::fmt::Write; 557 for c in s.chars().flat_map(|c| c.escape_debug()) { 558 f.write_char(c)? 
559 } 560 Ok(()) 561 } 562 563 formatter.write_str("\"")?; 564 let mut pos = 0; 565 while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) { 566 write_str_escaped(formatter, unsafe { 567 str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos]) 568 })?; 569 write!(formatter, "\\u{{{:x}}}", surrogate)?; 570 pos = surrogate_pos + 3; 571 } 572 write_str_escaped(formatter, unsafe { 573 str::from_utf8_unchecked(&self.bytes[pos..]) 574 })?; 575 formatter.write_str("\"") 576 } 577 } 578 579 impl fmt::Display for Wtf8 { 580 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { 581 let wtf8_bytes = &self.bytes; 582 let mut pos = 0; 583 loop { 584 match self.next_surrogate(pos) { 585 Some((surrogate_pos, _)) => { 586 formatter.write_str(unsafe { 587 str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos]) 588 })?; 589 formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?; 590 pos = surrogate_pos + 3; 591 } 592 None => { 593 let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) }; 594 if pos == 0 { 595 return s.fmt(formatter); 596 } else { 597 return formatter.write_str(s); 598 } 599 } 600 } 601 } 602 } 603 } 604 605 impl Wtf8 { 606 /// Creates a WTF-8 slice from a UTF-8 `&str` slice. 607 /// 608 /// Since WTF-8 is a superset of UTF-8, this always succeeds. 609 #[inline] 610 pub fn from_str(value: &str) -> &Wtf8 { 611 unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) } 612 } 613 614 /// Creates a WTF-8 slice from a WTF-8 byte slice. 615 /// 616 /// Since the byte slice is not checked for valid WTF-8, this functions is 617 /// marked unsafe. 618 #[inline] 619 pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { 620 mem::transmute(value) 621 } 622 623 /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice. 624 /// 625 /// Since the byte slice is not checked for valid WTF-8, this functions is 626 /// marked unsafe. 
627 #[inline] 628 unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 { 629 mem::transmute(value) 630 } 631 632 /// Returns the length, in WTF-8 bytes. 633 #[inline] 634 pub fn len(&self) -> usize { 635 self.bytes.len() 636 } 637 638 #[inline] 639 pub fn is_empty(&self) -> bool { 640 self.bytes.is_empty() 641 } 642 643 /// Returns the code point at `position` if it is in the ASCII range, 644 /// or `b'\xFF'` otherwise. 645 /// 646 /// # Panics 647 /// 648 /// Panics if `position` is beyond the end of the string. 649 #[inline] 650 pub fn ascii_byte_at(&self, position: usize) -> u8 { 651 match self.bytes[position] { 652 ascii_byte @ 0x00..=0x7F => ascii_byte, 653 _ => 0xFF, 654 } 655 } 656 657 /// Returns an iterator for the string’s code points. 658 #[inline] 659 pub fn code_points(&self) -> Wtf8CodePoints<'_> { 660 Wtf8CodePoints { 661 bytes: self.bytes.iter(), 662 } 663 } 664 665 /// Access raw bytes of WTF-8 data 666 #[inline] 667 pub fn as_bytes(&self) -> &[u8] { 668 &self.bytes 669 } 670 671 /// Tries to convert the string to UTF-8 and return a `&str` slice. 672 /// 673 /// Returns `None` if the string contains surrogates. 674 /// 675 /// This does not copy the data. 676 #[inline] 677 pub fn as_str(&self) -> Result<&str, str::Utf8Error> { 678 str::from_utf8(&self.bytes) 679 } 680 681 /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`. 682 pub fn to_owned(&self) -> Wtf8Buf { 683 Wtf8Buf { 684 bytes: self.bytes.to_vec(), 685 is_known_utf8: false, 686 } 687 } 688 689 /// Lossily converts the string to UTF-8. 690 /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. 691 /// 692 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). 693 /// 694 /// This only copies the data if necessary (if it contains any surrogate). 
695 pub fn to_string_lossy(&self) -> Cow<'_, str> { 696 let surrogate_pos = match self.next_surrogate(0) { 697 None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }), 698 Some((pos, _)) => pos, 699 }; 700 let wtf8_bytes = &self.bytes; 701 let mut utf8_bytes = Vec::with_capacity(self.len()); 702 utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); 703 utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); 704 let mut pos = surrogate_pos + 3; 705 loop { 706 match self.next_surrogate(pos) { 707 Some((surrogate_pos, _)) => { 708 utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); 709 utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); 710 pos = surrogate_pos + 3; 711 } 712 None => { 713 utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); 714 return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }); 715 } 716 } 717 } 718 } 719 720 /// Converts the WTF-8 string to potentially ill-formed UTF-16 721 /// and return an iterator of 16-bit code units. 722 /// 723 /// This is lossless: 724 /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units 725 /// would always return the original WTF-8 string. 
726 #[inline] 727 pub fn encode_wide(&self) -> EncodeWide<'_> { 728 EncodeWide { 729 code_points: self.code_points(), 730 extra: 0, 731 } 732 } 733 734 #[inline] 735 fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { 736 let mut iter = self.bytes[pos..].iter(); 737 loop { 738 let b = *iter.next()?; 739 if b < 0x80 { 740 pos += 1; 741 } else if b < 0xE0 { 742 iter.next(); 743 pos += 2; 744 } else if b == 0xED { 745 match (iter.next(), iter.next()) { 746 (Some(&b2), Some(&b3)) if b2 >= 0xA0 => { 747 return Some((pos, decode_surrogate(b2, b3))); 748 } 749 _ => pos += 3, 750 } 751 } else if b < 0xF0 { 752 iter.next(); 753 iter.next(); 754 pos += 3; 755 } else { 756 iter.next(); 757 iter.next(); 758 iter.next(); 759 pos += 4; 760 } 761 } 762 } 763 764 #[inline] 765 fn final_lead_surrogate(&self) -> Option<u16> { 766 match self.bytes { 767 [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)), 768 _ => None, 769 } 770 } 771 772 #[inline] 773 fn initial_trail_surrogate(&self) -> Option<u16> { 774 match self.bytes { 775 [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)), 776 _ => None, 777 } 778 } 779 780 pub fn clone_into(&self, buf: &mut Wtf8Buf) { 781 buf.is_known_utf8 = false; 782 self.bytes.clone_into(&mut buf.bytes); 783 } 784 785 /// Boxes this `Wtf8`. 786 #[inline] 787 pub fn into_box(&self) -> Box<Wtf8> { 788 let boxed: Box<[u8]> = self.bytes.into(); 789 unsafe { mem::transmute(boxed) } 790 } 791 792 /// Creates a boxed, empty `Wtf8`. 
793 pub fn empty_box() -> Box<Wtf8> { 794 let boxed: Box<[u8]> = Default::default(); 795 unsafe { mem::transmute(boxed) } 796 } 797 798 #[inline] 799 pub fn into_arc(&self) -> Arc<Wtf8> { 800 let arc: Arc<[u8]> = Arc::from(&self.bytes); 801 unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) } 802 } 803 804 #[inline] 805 pub fn into_rc(&self) -> Rc<Wtf8> { 806 let rc: Rc<[u8]> = Rc::from(&self.bytes); 807 unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) } 808 } 809 810 #[inline] 811 pub fn make_ascii_lowercase(&mut self) { 812 self.bytes.make_ascii_lowercase() 813 } 814 815 #[inline] 816 pub fn make_ascii_uppercase(&mut self) { 817 self.bytes.make_ascii_uppercase() 818 } 819 820 #[inline] 821 pub fn to_ascii_lowercase(&self) -> Wtf8Buf { 822 Wtf8Buf { 823 bytes: self.bytes.to_ascii_lowercase(), 824 is_known_utf8: false, 825 } 826 } 827 828 #[inline] 829 pub fn to_ascii_uppercase(&self) -> Wtf8Buf { 830 Wtf8Buf { 831 bytes: self.bytes.to_ascii_uppercase(), 832 is_known_utf8: false, 833 } 834 } 835 836 #[inline] 837 pub fn is_ascii(&self) -> bool { 838 self.bytes.is_ascii() 839 } 840 841 #[inline] 842 pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { 843 self.bytes.eq_ignore_ascii_case(&other.bytes) 844 } 845 } 846 847 /// Returns a slice of the given string for the byte range \[`begin`..`end`). 848 /// 849 /// # Panics 850 /// 851 /// Panics when `begin` and `end` do not point to code point boundaries, 852 /// or point beyond the end of the string. 
853 impl ops::Index<ops::Range<usize>> for Wtf8 { 854 type Output = Wtf8; 855 856 #[inline] 857 fn index(&self, range: ops::Range<usize>) -> &Wtf8 { 858 // is_code_point_boundary checks that the index is in [0, .len()] 859 if range.start <= range.end 860 && is_code_point_boundary(self, range.start) 861 && is_code_point_boundary(self, range.end) 862 { 863 unsafe { slice_unchecked(self, range.start, range.end) } 864 } else { 865 slice_error_fail(self, range.start, range.end) 866 } 867 } 868 } 869 870 /// Returns a slice of the given string from byte `begin` to its end. 871 /// 872 /// # Panics 873 /// 874 /// Panics when `begin` is not at a code point boundary, 875 /// or is beyond the end of the string. 876 impl ops::Index<ops::RangeFrom<usize>> for Wtf8 { 877 type Output = Wtf8; 878 879 #[inline] 880 fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 { 881 // is_code_point_boundary checks that the index is in [0, .len()] 882 if is_code_point_boundary(self, range.start) { 883 unsafe { slice_unchecked(self, range.start, self.len()) } 884 } else { 885 slice_error_fail(self, range.start, self.len()) 886 } 887 } 888 } 889 890 /// Returns a slice of the given string from its beginning to byte `end`. 891 /// 892 /// # Panics 893 /// 894 /// Panics when `end` is not at a code point boundary, 895 /// or is beyond the end of the string. 
896 impl ops::Index<ops::RangeTo<usize>> for Wtf8 { 897 type Output = Wtf8; 898 899 #[inline] 900 fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 { 901 // is_code_point_boundary checks that the index is in [0, .len()] 902 if is_code_point_boundary(self, range.end) { 903 unsafe { slice_unchecked(self, 0, range.end) } 904 } else { 905 slice_error_fail(self, 0, range.end) 906 } 907 } 908 } 909 910 impl ops::Index<ops::RangeFull> for Wtf8 { 911 type Output = Wtf8; 912 913 #[inline] 914 fn index(&self, _range: ops::RangeFull) -> &Wtf8 { 915 self 916 } 917 } 918 919 #[inline] 920 fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { 921 // The first byte is assumed to be 0xED 922 0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F 923 } 924 925 #[inline] 926 fn decode_surrogate_pair(lead: u16, trail: u16) -> char { 927 let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32); 928 unsafe { char::from_u32_unchecked(code_point) } 929 } 930 931 /// Copied from core::str::StrPrelude::is_char_boundary 932 #[inline] 933 pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { 934 if index == slice.len() { 935 return true; 936 } 937 match slice.bytes.get(index) { 938 None => false, 939 Some(&b) => b < 128 || b >= 192, 940 } 941 } 942 943 /// Copied from core::str::raw::slice_unchecked 944 #[inline] 945 pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { 946 // memory layout of a &[u8] and &Wtf8 are the same 947 Wtf8::from_bytes_unchecked(slice::from_raw_parts( 948 s.bytes.as_ptr().add(begin), 949 end - begin, 950 )) 951 } 952 953 /// Copied from core::str::raw::slice_error_fail 954 #[inline(never)] 955 pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! 
{ 956 assert!(begin <= end); 957 panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary"); 958 #[allow(unreachable_code)] 959 loop {} 960 } 961 962 /// Iterator for the code points of a WTF-8 string. 963 /// 964 /// Created with the method `.code_points()`. 965 #[derive(Clone)] 966 pub struct Wtf8CodePoints<'a> { 967 bytes: slice::Iter<'a, u8>, 968 } 969 970 impl<'a> Iterator for Wtf8CodePoints<'a> { 971 type Item = CodePoint; 972 973 #[inline] 974 fn next(&mut self) -> Option<CodePoint> { 975 // SAFETY: `self.bytes` has been created from a WTF-8 string 976 unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) } 977 } 978 979 #[inline] 980 fn size_hint(&self) -> (usize, Option<usize>) { 981 let len = self.bytes.len(); 982 (len.saturating_add(3) / 4, Some(len)) 983 } 984 } 985 986 /// Generates a wide character sequence for potentially ill-formed UTF-16. 987 #[derive(Clone)] 988 pub struct EncodeWide<'a> { 989 code_points: Wtf8CodePoints<'a>, 990 extra: u16, 991 } 992 993 // Copied from libunicode/u_str.rs 994 impl<'a> Iterator for EncodeWide<'a> { 995 type Item = u16; 996 997 #[inline] 998 fn next(&mut self) -> Option<u16> { 999 if self.extra != 0 { 1000 let tmp = self.extra; 1001 self.extra = 0; 1002 return Some(tmp); 1003 } 1004 1005 let mut buf = [0; 2]; 1006 self.code_points.next().map(|code_point| { 1007 let n = encode_utf16_raw(code_point.value, &mut buf).len(); 1008 if n == 2 { 1009 self.extra = buf[1]; 1010 } 1011 buf[0] 1012 }) 1013 } 1014 1015 #[inline] 1016 fn size_hint(&self) -> (usize, Option<usize>) { 1017 let (low, high) = self.code_points.size_hint(); 1018 let ext = (self.extra != 0) as usize; 1019 // every code point gets either one u16 or two u16, 1020 // so this iterator is between 1 or 2 times as 1021 // long as the underlying iterator. 
1022 ( 1023 low + ext, 1024 high.and_then(|n| n.checked_mul(2)) 1025 .and_then(|n| n.checked_add(ext)), 1026 ) 1027 } 1028 } 1029 1030 impl FusedIterator for EncodeWide<'_> {} 1031 1032 impl Hash for CodePoint { 1033 #[inline] 1034 fn hash<H: Hasher>(&self, state: &mut H) { 1035 self.value.hash(state) 1036 } 1037 } 1038 1039 impl Hash for Wtf8Buf { 1040 #[inline] 1041 fn hash<H: Hasher>(&self, state: &mut H) { 1042 state.write(&self.bytes); 1043 0xfeu8.hash(state) 1044 } 1045 } 1046 1047 impl Hash for Wtf8 { 1048 #[inline] 1049 fn hash<H: Hasher>(&self, state: &mut H) { 1050 state.write(&self.bytes); 1051 0xfeu8.hash(state) 1052 } 1053 } 1054